From 10634530f7ba947d8eab52a580e0840778d4ef75 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 16 Jan 2024 09:59:39 +1100 Subject: xfs: convert kmem_zalloc() to kzalloc() There's no reason to keep the kmem_zalloc() around anymore, it's just a thin wrapper around kmalloc(), so lets get rid of it. Signed-off-by: Dave Chinner Reviewed-by: "Darrick J. Wong" Signed-off-by: Chandan Babu R --- fs/xfs/kmem.h | 7 ------- fs/xfs/libxfs/xfs_ag.c | 2 +- fs/xfs/libxfs/xfs_attr_leaf.c | 3 ++- fs/xfs/libxfs/xfs_btree_staging.c | 2 +- fs/xfs/libxfs/xfs_da_btree.c | 5 +++-- fs/xfs/libxfs/xfs_defer.c | 2 +- fs/xfs/libxfs/xfs_dir2.c | 18 +++++++++--------- fs/xfs/libxfs/xfs_iext_tree.c | 12 ++++++++---- fs/xfs/xfs_attr_item.c | 4 ++-- fs/xfs/xfs_buf.c | 6 +++--- fs/xfs/xfs_buf_item.c | 4 ++-- fs/xfs/xfs_error.c | 4 ++-- fs/xfs/xfs_extent_busy.c | 3 ++- fs/xfs/xfs_itable.c | 8 ++++---- fs/xfs/xfs_iwalk.c | 3 ++- fs/xfs/xfs_log.c | 5 +++-- fs/xfs/xfs_log_cil.c | 4 ++-- fs/xfs/xfs_log_recover.c | 10 +++++----- fs/xfs/xfs_mru_cache.c | 7 ++++--- fs/xfs/xfs_qm.c | 3 ++- fs/xfs/xfs_refcount_item.c | 4 ++-- fs/xfs/xfs_rmap_item.c | 3 ++- fs/xfs/xfs_trans_ail.c | 3 ++- 23 files changed, 64 insertions(+), 58 deletions(-) diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h index b987dc2c6851..bce31182c9e8 100644 --- a/fs/xfs/kmem.h +++ b/fs/xfs/kmem.h @@ -62,13 +62,6 @@ static inline void kmem_free(const void *ptr) kvfree(ptr); } - -static inline void * -kmem_zalloc(size_t size, xfs_km_flags_t flags) -{ - return kmem_alloc(size, flags | KM_ZERO); -} - /* * Zone interfaces */ diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index 39d9525270b7..96a6bfd58931 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -381,7 +381,7 @@ xfs_initialize_perag( continue; } - pag = kmem_zalloc(sizeof(*pag), KM_MAYFAIL); + pag = kzalloc(sizeof(*pag), GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!pag) { error = -ENOMEM; goto out_unwind_new_pags; diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 6374bf107242..ab4223bf51ee 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -2250,7 +2250,8 @@ xfs_attr3_leaf_unbalance( struct xfs_attr_leafblock *tmp_leaf; struct xfs_attr3_icleaf_hdr tmphdr; - tmp_leaf = kmem_zalloc(state->args->geo->blksize, 0); + tmp_leaf = kzalloc(state->args->geo->blksize, + GFP_KERNEL | __GFP_NOFAIL); /* * Copy the header into the temp leaf so that all the stuff diff --git a/fs/xfs/libxfs/xfs_btree_staging.c b/fs/xfs/libxfs/xfs_btree_staging.c index e276eba87cb1..eff29425fd76 100644 --- a/fs/xfs/libxfs/xfs_btree_staging.c +++ b/fs/xfs/libxfs/xfs_btree_staging.c @@ -406,7 +406,7 @@ xfs_btree_bload_prep_block( /* Allocate a new incore btree root block. */ new_size = bbl->iroot_size(cur, level, nr_this_block, priv); - ifp->if_broot = kmem_zalloc(new_size, 0); + ifp->if_broot = kzalloc(new_size, GFP_KERNEL); ifp->if_broot_bytes = (int)new_size; /* Initialize it and send it out. */ diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index 5457188bb4de..73aae6543906 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -2518,7 +2518,7 @@ xfs_dabuf_map( int error = 0, nirecs, i; if (nfsb > 1) - irecs = kmem_zalloc(sizeof(irec) * nfsb, KM_NOFS); + irecs = kzalloc(sizeof(irec) * nfsb, GFP_NOFS | __GFP_NOFAIL); nirecs = nfsb; error = xfs_bmapi_read(dp, bno, nfsb, irecs, &nirecs, @@ -2531,7 +2531,8 @@ xfs_dabuf_map( * larger one that needs to be free by the caller. 
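The flag translation used throughout this patch is mechanical. A minimal reference sketch of the mapping (the helper name km_to_gfp() is hypothetical and for illustration only; the patch open-codes the resulting mask at each call site):

static inline gfp_t
km_to_gfp(xfs_km_flags_t flags)
{
	/* kmem_zalloc(size, 0) retried internally until it succeeded */
	gfp_t	gfp = GFP_KERNEL | __GFP_NOFAIL;

	if (flags & KM_NOFS)		/* no reclaim recursion into the fs */
		gfp = GFP_NOFS | __GFP_NOFAIL;
	if (flags & KM_MAYFAIL) {	/* caller handles a NULL return */
		gfp &= ~__GFP_NOFAIL;
		gfp |= __GFP_RETRY_MAYFAIL;
	}
	return gfp;
}

KM_ZERO needs no gfp bit here because zeroing is expressed by choosing kzalloc() over kmalloc().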
*/ if (nirecs > 1) { - map = kmem_zalloc(nirecs * sizeof(struct xfs_buf_map), KM_NOFS); + map = kzalloc(nirecs * sizeof(struct xfs_buf_map), + GFP_NOFS | __GFP_NOFAIL); if (!map) { error = -ENOMEM; goto out_free_irecs; diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index 66a17910d021..07d318b1f807 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -979,7 +979,7 @@ xfs_defer_ops_capture( return ERR_PTR(error); /* Create an object to capture the defer ops. */ - dfc = kmem_zalloc(sizeof(*dfc), KM_NOFS); + dfc = kzalloc(sizeof(*dfc), GFP_NOFS | __GFP_NOFAIL); INIT_LIST_HEAD(&dfc->dfc_list); INIT_LIST_HEAD(&dfc->dfc_dfops); diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index a76673281514..54915a302e96 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -104,10 +104,10 @@ xfs_da_mount( ASSERT(mp->m_sb.sb_versionnum & XFS_SB_VERSION_DIRV2BIT); ASSERT(xfs_dir2_dirblock_bytes(&mp->m_sb) <= XFS_MAX_BLOCKSIZE); - mp->m_dir_geo = kmem_zalloc(sizeof(struct xfs_da_geometry), - KM_MAYFAIL); - mp->m_attr_geo = kmem_zalloc(sizeof(struct xfs_da_geometry), - KM_MAYFAIL); + mp->m_dir_geo = kzalloc(sizeof(struct xfs_da_geometry), + GFP_KERNEL | __GFP_RETRY_MAYFAIL); + mp->m_attr_geo = kzalloc(sizeof(struct xfs_da_geometry), + GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!mp->m_dir_geo || !mp->m_attr_geo) { kmem_free(mp->m_dir_geo); kmem_free(mp->m_attr_geo); @@ -236,7 +236,7 @@ xfs_dir_init( if (error) return error; - args = kmem_zalloc(sizeof(*args), KM_NOFS); + args = kzalloc(sizeof(*args), GFP_NOFS | __GFP_NOFAIL); if (!args) return -ENOMEM; @@ -273,7 +273,7 @@ xfs_dir_createname( XFS_STATS_INC(dp->i_mount, xs_dir_create); } - args = kmem_zalloc(sizeof(*args), KM_NOFS); + args = kzalloc(sizeof(*args), GFP_NOFS | __GFP_NOFAIL); if (!args) return -ENOMEM; @@ -372,7 +372,7 @@ xfs_dir_lookup( * lockdep Doing this avoids having to add a bunch of lockdep class * annotations into the reclaim path for the ilock. 
*/ - args = kmem_zalloc(sizeof(*args), KM_NOFS); + args = kzalloc(sizeof(*args), GFP_NOFS | __GFP_NOFAIL); args->geo = dp->i_mount->m_dir_geo; args->name = name->name; args->namelen = name->len; @@ -441,7 +441,7 @@ xfs_dir_removename( ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); XFS_STATS_INC(dp->i_mount, xs_dir_remove); - args = kmem_zalloc(sizeof(*args), KM_NOFS); + args = kzalloc(sizeof(*args), GFP_NOFS | __GFP_NOFAIL); if (!args) return -ENOMEM; @@ -502,7 +502,7 @@ xfs_dir_replace( if (rval) return rval; - args = kmem_zalloc(sizeof(*args), KM_NOFS); + args = kzalloc(sizeof(*args), GFP_NOFS | __GFP_NOFAIL); if (!args) return -ENOMEM; diff --git a/fs/xfs/libxfs/xfs_iext_tree.c b/fs/xfs/libxfs/xfs_iext_tree.c index f4e6b200cdf8..4522f3c7a23f 100644 --- a/fs/xfs/libxfs/xfs_iext_tree.c +++ b/fs/xfs/libxfs/xfs_iext_tree.c @@ -398,7 +398,8 @@ static void xfs_iext_grow( struct xfs_ifork *ifp) { - struct xfs_iext_node *node = kmem_zalloc(NODE_SIZE, KM_NOFS); + struct xfs_iext_node *node = kzalloc(NODE_SIZE, + GFP_NOFS | __GFP_NOFAIL); int i; if (ifp->if_height == 1) { @@ -454,7 +455,8 @@ xfs_iext_split_node( int *nr_entries) { struct xfs_iext_node *node = *nodep; - struct xfs_iext_node *new = kmem_zalloc(NODE_SIZE, KM_NOFS); + struct xfs_iext_node *new = kzalloc(NODE_SIZE, + GFP_NOFS | __GFP_NOFAIL); const int nr_move = KEYS_PER_NODE / 2; int nr_keep = nr_move + (KEYS_PER_NODE & 1); int i = 0; @@ -542,7 +544,8 @@ xfs_iext_split_leaf( int *nr_entries) { struct xfs_iext_leaf *leaf = cur->leaf; - struct xfs_iext_leaf *new = kmem_zalloc(NODE_SIZE, KM_NOFS); + struct xfs_iext_leaf *new = kzalloc(NODE_SIZE, + GFP_NOFS | __GFP_NOFAIL); const int nr_move = RECS_PER_LEAF / 2; int nr_keep = nr_move + (RECS_PER_LEAF & 1); int i; @@ -583,7 +586,8 @@ xfs_iext_alloc_root( { ASSERT(ifp->if_bytes == 0); - ifp->if_data = kmem_zalloc(sizeof(struct xfs_iext_rec), KM_NOFS); + ifp->if_data = kzalloc(sizeof(struct xfs_iext_rec), + GFP_NOFS | __GFP_NOFAIL); ifp->if_height = 1; /* now that we have a node step into it */ diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index 9e02111bd890..2e454a0d6f19 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -512,8 +512,8 @@ xfs_attri_recover_work( if (error) return ERR_PTR(error); - attr = kmem_zalloc(sizeof(struct xfs_attr_intent) + - sizeof(struct xfs_da_args), KM_NOFS); + attr = kzalloc(sizeof(struct xfs_attr_intent) + + sizeof(struct xfs_da_args), GFP_NOFS | __GFP_NOFAIL); args = (struct xfs_da_args *)(attr + 1); attr->xattri_da_args = args; diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 8e5bd50d29fe..5672665150ea 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -189,8 +189,8 @@ xfs_buf_get_maps( return 0; } - bp->b_maps = kmem_zalloc(map_count * sizeof(struct xfs_buf_map), - KM_NOFS); + bp->b_maps = kzalloc(map_count * sizeof(struct xfs_buf_map), + GFP_NOFS | __GFP_NOFAIL); if (!bp->b_maps) return -ENOMEM; return 0; @@ -2002,7 +2002,7 @@ xfs_alloc_buftarg( #if defined(CONFIG_FS_DAX) && defined(CONFIG_MEMORY_FAILURE) ops = &xfs_dax_holder_operations; #endif - btp = kmem_zalloc(sizeof(*btp), KM_NOFS); + btp = kzalloc(sizeof(*btp), GFP_NOFS | __GFP_NOFAIL); btp->bt_mount = mp; btp->bt_bdev_handle = bdev_handle; diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 023d4e0385dd..ec93d34188c8 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -805,8 +805,8 @@ xfs_buf_item_get_format( return; } - bip->bli_formats = kmem_zalloc(count * sizeof(struct xfs_buf_log_format), - 0); + bip->bli_formats = kzalloc(count * 
sizeof(struct xfs_buf_log_format), + GFP_KERNEL | __GFP_NOFAIL); } STATIC void diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index b2cbbba3e15a..456520d60cd0 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -240,8 +240,8 @@ xfs_errortag_init( { int ret; - mp->m_errortag = kmem_zalloc(sizeof(unsigned int) * XFS_ERRTAG_MAX, - KM_MAYFAIL); + mp->m_errortag = kzalloc(sizeof(unsigned int) * XFS_ERRTAG_MAX, + GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!mp->m_errortag) return -ENOMEM; diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c index 2ccde32c9a9e..b90c3dd43e03 100644 --- a/fs/xfs/xfs_extent_busy.c +++ b/fs/xfs/xfs_extent_busy.c @@ -32,7 +32,8 @@ xfs_extent_busy_insert_list( struct rb_node **rbp; struct rb_node *parent = NULL; - new = kmem_zalloc(sizeof(struct xfs_extent_busy), 0); + new = kzalloc(sizeof(struct xfs_extent_busy), + GFP_KERNEL | __GFP_NOFAIL); new->agno = pag->pag_agno; new->bno = bno; new->length = len; diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 14462614fcc8..14211174267a 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -197,8 +197,8 @@ xfs_bulkstat_one( ASSERT(breq->icount == 1); - bc.buf = kmem_zalloc(sizeof(struct xfs_bulkstat), - KM_MAYFAIL); + bc.buf = kzalloc(sizeof(struct xfs_bulkstat), + GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!bc.buf) return -ENOMEM; @@ -289,8 +289,8 @@ xfs_bulkstat( if (xfs_bulkstat_already_done(breq->mp, breq->startino)) return 0; - bc.buf = kmem_zalloc(sizeof(struct xfs_bulkstat), - KM_MAYFAIL); + bc.buf = kzalloc(sizeof(struct xfs_bulkstat), + GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!bc.buf) return -ENOMEM; diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c index b3275e8d47b6..8dbb7c054b28 100644 --- a/fs/xfs/xfs_iwalk.c +++ b/fs/xfs/xfs_iwalk.c @@ -663,7 +663,8 @@ xfs_iwalk_threaded( if (xfs_pwork_ctl_want_abort(&pctl)) break; - iwag = kmem_zalloc(sizeof(struct xfs_iwalk_ag), 0); + iwag = kzalloc(sizeof(struct xfs_iwalk_ag), + GFP_KERNEL | __GFP_NOFAIL); iwag->mp = mp; /* diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index a1650fc81382..d38cfaadc726 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1528,7 +1528,7 @@ xlog_alloc_log( int error = -ENOMEM; uint log2_size = 0; - log = kmem_zalloc(sizeof(struct xlog), KM_MAYFAIL); + log = kzalloc(sizeof(struct xlog), GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!log) { xfs_warn(mp, "Log allocation failed: No memory!"); goto out; @@ -1605,7 +1605,8 @@ xlog_alloc_log( size_t bvec_size = howmany(log->l_iclog_size, PAGE_SIZE) * sizeof(struct bio_vec); - iclog = kmem_zalloc(sizeof(*iclog) + bvec_size, KM_MAYFAIL); + iclog = kzalloc(sizeof(*iclog) + bvec_size, + GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!iclog) goto out_free_iclog; diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 67a99d94701e..3c705f22b0ab 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -100,7 +100,7 @@ xlog_cil_ctx_alloc(void) { struct xfs_cil_ctx *ctx; - ctx = kmem_zalloc(sizeof(*ctx), KM_NOFS); + ctx = kzalloc(sizeof(*ctx), GFP_NOFS | __GFP_NOFAIL); INIT_LIST_HEAD(&ctx->committing); INIT_LIST_HEAD(&ctx->busy_extents.extent_list); INIT_LIST_HEAD(&ctx->log_items); @@ -1747,7 +1747,7 @@ xlog_cil_init( struct xlog_cil_pcp *cilpcp; int cpu; - cil = kmem_zalloc(sizeof(*cil), KM_MAYFAIL); + cil = kzalloc(sizeof(*cil), GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!cil) return -ENOMEM; /* diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 1251c81e55f9..4a27ecdbb546 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2057,7 
+2057,8 @@ xlog_recover_add_item( { struct xlog_recover_item *item; - item = kmem_zalloc(sizeof(struct xlog_recover_item), 0); + item = kzalloc(sizeof(struct xlog_recover_item), + GFP_KERNEL | __GFP_NOFAIL); INIT_LIST_HEAD(&item->ri_list); list_add_tail(&item->ri_list, head); } @@ -2187,9 +2188,8 @@ xlog_recover_add_to_trans( } item->ri_total = in_f->ilf_size; - item->ri_buf = - kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t), - 0); + item->ri_buf = kzalloc(item->ri_total * sizeof(xfs_log_iovec_t), + GFP_KERNEL | __GFP_NOFAIL); } if (item->ri_total <= item->ri_cnt) { @@ -2332,7 +2332,7 @@ xlog_recover_ophdr_to_trans( * This is a new transaction so allocate a new recovery container to * hold the recovery ops that will follow. */ - trans = kmem_zalloc(sizeof(struct xlog_recover), 0); + trans = kzalloc(sizeof(struct xlog_recover), GFP_KERNEL | __GFP_NOFAIL); trans->r_log_tid = tid; trans->r_lsn = be64_to_cpu(rhead->h_lsn); INIT_LIST_HEAD(&trans->r_itemq); diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c index f85e3b07ab44..feae3115617b 100644 --- a/fs/xfs/xfs_mru_cache.c +++ b/fs/xfs/xfs_mru_cache.c @@ -333,13 +333,14 @@ xfs_mru_cache_create( if (!(grp_time = msecs_to_jiffies(lifetime_ms) / grp_count)) return -EINVAL; - if (!(mru = kmem_zalloc(sizeof(*mru), 0))) + mru = kzalloc(sizeof(*mru), GFP_KERNEL | __GFP_NOFAIL); + if (!mru) return -ENOMEM; /* An extra list is needed to avoid reaping up to a grp_time early. */ mru->grp_count = grp_count + 1; - mru->lists = kmem_zalloc(mru->grp_count * sizeof(*mru->lists), 0); - + mru->lists = kzalloc(mru->grp_count * sizeof(*mru->lists), + GFP_KERNEL | __GFP_NOFAIL); if (!mru->lists) { err = -ENOMEM; goto exit; diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 67d0a8564ff3..9c2528418b18 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -628,7 +628,8 @@ xfs_qm_init_quotainfo( ASSERT(XFS_IS_QUOTA_ON(mp)); - qinf = mp->m_quotainfo = kmem_zalloc(sizeof(struct xfs_quotainfo), 0); + qinf = mp->m_quotainfo = kzalloc(sizeof(struct xfs_quotainfo), + GFP_KERNEL | __GFP_NOFAIL); error = list_lru_init(&qinf->qi_lru); if (error) diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index 20ad8086da60..78d0cda60abf 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -143,8 +143,8 @@ xfs_cui_init( ASSERT(nextents > 0); if (nextents > XFS_CUI_MAX_FAST_EXTENTS) - cuip = kmem_zalloc(xfs_cui_log_item_sizeof(nextents), - 0); + cuip = kzalloc(xfs_cui_log_item_sizeof(nextents), + GFP_KERNEL | __GFP_NOFAIL); else cuip = kmem_cache_zalloc(xfs_cui_cache, GFP_KERNEL | __GFP_NOFAIL); diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index 79ad0087aeca..31a921fc34b2 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -142,7 +142,8 @@ xfs_rui_init( ASSERT(nextents > 0); if (nextents > XFS_RUI_MAX_FAST_EXTENTS) - ruip = kmem_zalloc(xfs_rui_log_item_sizeof(nextents), 0); + ruip = kzalloc(xfs_rui_log_item_sizeof(nextents), + GFP_KERNEL | __GFP_NOFAIL); else ruip = kmem_cache_zalloc(xfs_rui_cache, GFP_KERNEL | __GFP_NOFAIL); diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 1098452e7f95..5f206cdb40ff 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -901,7 +901,8 @@ xfs_trans_ail_init( { struct xfs_ail *ailp; - ailp = kmem_zalloc(sizeof(struct xfs_ail), KM_MAYFAIL); + ailp = kzalloc(sizeof(struct xfs_ail), + GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!ailp) return -ENOMEM; -- cgit v1.2.3 From f078d4ea827607867d42fb3b2ef907caf86ce49d Mon Sep 17 00:00:00 2001 From: 
Dave Chinner Date: Tue, 16 Jan 2024 09:59:40 +1100 Subject: xfs: convert kmem_alloc() to kmalloc() kmem_alloc() is just a thin wrapper around kmalloc() these days. Convert everything to use kmalloc() so we can get rid of the wrapper. Note: the transaction region allocation in xlog_add_to_transaction() can be a high order allocation. Converting it to use kmalloc(__GFP_NOFAIL) results in warnings in the page allocation code being triggered because the mm subsystem does not want us to use __GFP_NOFAIL with high order allocations like we've been doing with the kmem_alloc() wrapper for a couple of decades. Hence this specific case gets converted to xlog_kvmalloc() rather than kmalloc() to avoid this issue. Signed-off-by: Dave Chinner Reviewed-by: "Darrick J. Wong" Signed-off-by: Chandan Babu R --- fs/xfs/Makefile | 3 +-- fs/xfs/kmem.c | 30 ---------------------------- fs/xfs/kmem.h | 42 --------------------------------------- fs/xfs/libxfs/xfs_attr_leaf.c | 7 +++---- fs/xfs/libxfs/xfs_btree_staging.c | 4 ++-- fs/xfs/libxfs/xfs_da_btree.c | 3 ++- fs/xfs/libxfs/xfs_dir2.c | 2 +- fs/xfs/libxfs/xfs_dir2_block.c | 2 +- fs/xfs/libxfs/xfs_dir2_sf.c | 8 ++++---- fs/xfs/libxfs/xfs_inode_fork.c | 15 +++++++------- fs/xfs/xfs_attr_list.c | 2 +- fs/xfs/xfs_buf.c | 6 +++--- fs/xfs/xfs_buf_item_recover.c | 2 +- fs/xfs/xfs_filestream.c | 2 +- fs/xfs/xfs_inode_item_recover.c | 3 ++- fs/xfs/xfs_iwalk.c | 2 +- fs/xfs/xfs_log_recover.c | 2 +- fs/xfs/xfs_qm.c | 3 ++- fs/xfs/xfs_rtalloc.c | 2 +- fs/xfs/xfs_super.c | 2 +- fs/xfs/xfs_trace.h | 25 ----------------------- 21 files changed, 36 insertions(+), 131 deletions(-) delete mode 100644 fs/xfs/kmem.c diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index fbe3cdc79036..35a23427055b 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -92,8 +92,7 @@ xfs-y += xfs_aops.o \ xfs_symlink.o \ xfs_sysfs.o \ xfs_trans.o \ - xfs_xattr.o \ - kmem.o + xfs_xattr.o # low-level transaction/log code xfs-y += xfs_log.o \ diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c deleted file mode 100644 index c557a030acfe..000000000000 --- a/fs/xfs/kmem.c +++ /dev/null @@ -1,30 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (c) 2000-2005 Silicon Graphics, Inc. - * All Rights Reserved. - */ -#include "xfs.h" -#include "xfs_message.h" -#include "xfs_trace.h" - -void * -kmem_alloc(size_t size, xfs_km_flags_t flags) -{ - int retries = 0; - gfp_t lflags = kmem_flags_convert(flags); - void *ptr; - - trace_kmem_alloc(size, flags, _RET_IP_); - - do { - ptr = kmalloc(size, lflags); - if (ptr || (flags & KM_MAYFAIL)) - return ptr; - if (!(++retries % 100)) - xfs_err(NULL, - "%s(%u) possible memory allocation deadlock size %u in %s (mode:0x%x)", - current->comm, current->pid, - (unsigned int)size, __func__, lflags); - memalloc_retry_wait(lflags); - } while (1); -} diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h index bce31182c9e8..1343f1a6f99b 100644 --- a/fs/xfs/kmem.h +++ b/fs/xfs/kmem.h @@ -15,48 +15,6 @@ * General memory allocation interfaces */ -typedef unsigned __bitwise xfs_km_flags_t; -#define KM_NOFS ((__force xfs_km_flags_t)0x0004u) -#define KM_MAYFAIL ((__force xfs_km_flags_t)0x0008u) -#define KM_ZERO ((__force xfs_km_flags_t)0x0010u) -#define KM_NOLOCKDEP ((__force xfs_km_flags_t)0x0020u) - -/* - * We use a special process flag to avoid recursive callbacks into - * the filesystem during transactions. We will also issue our own - * warnings, so we explicitly skip any generic ones (silly of us). 
- */ -static inline gfp_t -kmem_flags_convert(xfs_km_flags_t flags) -{ - gfp_t lflags; - - BUG_ON(flags & ~(KM_NOFS | KM_MAYFAIL | KM_ZERO | KM_NOLOCKDEP)); - - lflags = GFP_KERNEL | __GFP_NOWARN; - if (flags & KM_NOFS) - lflags &= ~__GFP_FS; - - /* - * Default page/slab allocator behavior is to retry for ever - * for small allocations. We can override this behavior by using - * __GFP_RETRY_MAYFAIL which will tell the allocator to retry as long - * as it is feasible but rather fail than retry forever for all - * request sizes. - */ - if (flags & KM_MAYFAIL) - lflags |= __GFP_RETRY_MAYFAIL; - - if (flags & KM_ZERO) - lflags |= __GFP_ZERO; - - if (flags & KM_NOLOCKDEP) - lflags |= __GFP_NOLOCKDEP; - - return lflags; -} - -extern void *kmem_alloc(size_t, xfs_km_flags_t); static inline void kmem_free(const void *ptr) { kvfree(ptr); diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index ab4223bf51ee..033382cf514d 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -879,8 +879,7 @@ xfs_attr_shortform_to_leaf( trace_xfs_attr_sf_to_leaf(args); - tmpbuffer = kmem_alloc(size, 0); - ASSERT(tmpbuffer != NULL); + tmpbuffer = kmalloc(size, GFP_KERNEL | __GFP_NOFAIL); memcpy(tmpbuffer, ifp->if_data, size); sf = (struct xfs_attr_sf_hdr *)tmpbuffer; @@ -1059,7 +1058,7 @@ xfs_attr3_leaf_to_shortform( trace_xfs_attr_leaf_to_sf(args); - tmpbuffer = kmem_alloc(args->geo->blksize, 0); + tmpbuffer = kmalloc(args->geo->blksize, GFP_KERNEL | __GFP_NOFAIL); if (!tmpbuffer) return -ENOMEM; @@ -1533,7 +1532,7 @@ xfs_attr3_leaf_compact( trace_xfs_attr_leaf_compact(args); - tmpbuffer = kmem_alloc(args->geo->blksize, 0); + tmpbuffer = kmalloc(args->geo->blksize, GFP_KERNEL | __GFP_NOFAIL); memcpy(tmpbuffer, bp->b_addr, args->geo->blksize); memset(bp->b_addr, 0, args->geo->blksize); leaf_src = (xfs_attr_leafblock_t *)tmpbuffer; diff --git a/fs/xfs/libxfs/xfs_btree_staging.c b/fs/xfs/libxfs/xfs_btree_staging.c index eff29425fd76..065e4a00a2f4 100644 --- a/fs/xfs/libxfs/xfs_btree_staging.c +++ b/fs/xfs/libxfs/xfs_btree_staging.c @@ -139,7 +139,7 @@ xfs_btree_stage_afakeroot( ASSERT(!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)); ASSERT(cur->bc_tp == NULL); - nops = kmem_alloc(sizeof(struct xfs_btree_ops), KM_NOFS); + nops = kmalloc(sizeof(struct xfs_btree_ops), GFP_NOFS | __GFP_NOFAIL); memcpy(nops, cur->bc_ops, sizeof(struct xfs_btree_ops)); nops->alloc_block = xfs_btree_fakeroot_alloc_block; nops->free_block = xfs_btree_fakeroot_free_block; @@ -220,7 +220,7 @@ xfs_btree_stage_ifakeroot( ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE); ASSERT(cur->bc_tp == NULL); - nops = kmem_alloc(sizeof(struct xfs_btree_ops), KM_NOFS); + nops = kmalloc(sizeof(struct xfs_btree_ops), GFP_NOFS | __GFP_NOFAIL); memcpy(nops, cur->bc_ops, sizeof(struct xfs_btree_ops)); nops->alloc_block = xfs_btree_fakeroot_alloc_block; nops->free_block = xfs_btree_fakeroot_free_block; diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index 73aae6543906..331b9251b185 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -2182,7 +2182,8 @@ xfs_da_grow_inode_int( * If we didn't get it and the block might work if fragmented, * try without the CONTIG flag. Loop until we get it all. 
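Dropping the wrapper's retry loop (removed from kmem.c above) works because __GFP_NOFAIL pushes the retrying into the page allocator itself, so converted call sites are still guaranteed a non-NULL return. The shape of the conversion, using the xfs_da_grow_inode_int() hunk that follows as the pattern:

	/* before: the wrapper loops internally; flags 0 means "never fail" */
	mapp = kmem_alloc(sizeof(*mapp) * count, 0);

	/* after: the allocator provides the same guarantee directly */
	mapp = kmalloc(sizeof(*mapp) * count, GFP_KERNEL | __GFP_NOFAIL);
	/* no NULL check needed: __GFP_NOFAIL cannot return NULL */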
*/ - mapp = kmem_alloc(sizeof(*mapp) * count, 0); + mapp = kmalloc(sizeof(*mapp) * count, + GFP_KERNEL | __GFP_NOFAIL); for (b = *bno, mapi = 0; b < *bno + count; ) { c = (int)(*bno + count - b); nmap = min(XFS_BMAP_MAX_NMAP, c); diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index 54915a302e96..370d67300455 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -333,7 +333,7 @@ xfs_dir_cilookup_result( !(args->op_flags & XFS_DA_OP_CILOOKUP)) return -EEXIST; - args->value = kmem_alloc(len, KM_NOFS | KM_MAYFAIL); + args->value = kmalloc(len, GFP_NOFS | __GFP_RETRY_MAYFAIL); if (!args->value) return -ENOMEM; diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c index 3c256d4cc40b..506c65caaec5 100644 --- a/fs/xfs/libxfs/xfs_dir2_block.c +++ b/fs/xfs/libxfs/xfs_dir2_block.c @@ -1108,7 +1108,7 @@ xfs_dir2_sf_to_block( * Copy the directory into a temporary buffer. * Then pitch the incore inode data so we can make extents. */ - sfp = kmem_alloc(ifp->if_bytes, 0); + sfp = kmalloc(ifp->if_bytes, GFP_KERNEL | __GFP_NOFAIL); memcpy(sfp, oldsfp, ifp->if_bytes); xfs_idata_realloc(dp, -ifp->if_bytes, XFS_DATA_FORK); diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c index e1f83fc7b6ad..7b1f41cff9e0 100644 --- a/fs/xfs/libxfs/xfs_dir2_sf.c +++ b/fs/xfs/libxfs/xfs_dir2_sf.c @@ -276,7 +276,7 @@ xfs_dir2_block_to_sf( * format the data into. Once we have formatted the data, we can free * the block and copy the formatted data into the inode literal area. */ - sfp = kmem_alloc(mp->m_sb.sb_inodesize, 0); + sfp = kmalloc(mp->m_sb.sb_inodesize, GFP_KERNEL | __GFP_NOFAIL); memcpy(sfp, sfhp, xfs_dir2_sf_hdr_size(sfhp->i8count)); /* @@ -524,7 +524,7 @@ xfs_dir2_sf_addname_hard( * Copy the old directory to the stack buffer. */ old_isize = (int)dp->i_disk_size; - buf = kmem_alloc(old_isize, 0); + buf = kmalloc(old_isize, GFP_KERNEL | __GFP_NOFAIL); oldsfp = (xfs_dir2_sf_hdr_t *)buf; memcpy(oldsfp, dp->i_df.if_data, old_isize); /* @@ -1151,7 +1151,7 @@ xfs_dir2_sf_toino4( * Don't want xfs_idata_realloc copying the data here. */ oldsize = dp->i_df.if_bytes; - buf = kmem_alloc(oldsize, 0); + buf = kmalloc(oldsize, GFP_KERNEL | __GFP_NOFAIL); ASSERT(oldsfp->i8count == 1); memcpy(buf, oldsfp, oldsize); /* @@ -1223,7 +1223,7 @@ xfs_dir2_sf_toino8( * Don't want xfs_idata_realloc copying the data here. */ oldsize = dp->i_df.if_bytes; - buf = kmem_alloc(oldsize, 0); + buf = kmalloc(oldsize, GFP_KERNEL | __GFP_NOFAIL); ASSERT(oldsfp->i8count == 0); memcpy(buf, oldsfp, oldsize); /* diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index f4569e18a8d0..f3cf7f933e15 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -50,7 +50,7 @@ xfs_init_local_fork( mem_size++; if (size) { - char *new_data = kmem_alloc(mem_size, KM_NOFS); + char *new_data = kmalloc(mem_size, GFP_NOFS | __GFP_NOFAIL); memcpy(new_data, data, size); if (zero_terminate) @@ -77,7 +77,7 @@ xfs_iformat_local( /* * If the size is unreasonable, then something * is wrong and we just bail out rather than crash in - * kmem_alloc() or memcpy() below. + * kmalloc() or memcpy() below. */ if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) { xfs_warn(ip->i_mount, @@ -116,7 +116,7 @@ xfs_iformat_extents( /* * If the number of extents is unreasonable, then something is wrong and - * we just bail out rather than crash in kmem_alloc() or memcpy() below. + * we just bail out rather than crash in kmalloc() or memcpy() below. 
*/ if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, mp, whichfork))) { xfs_warn(ip->i_mount, "corrupt inode %llu ((a)extents = %llu).", @@ -205,7 +205,7 @@ xfs_iformat_btree( } ifp->if_broot_bytes = size; - ifp->if_broot = kmem_alloc(size, KM_NOFS); + ifp->if_broot = kmalloc(size, GFP_NOFS | __GFP_NOFAIL); ASSERT(ifp->if_broot != NULL); /* * Copy and convert from the on-disk structure @@ -399,7 +399,8 @@ xfs_iroot_realloc( */ if (ifp->if_broot_bytes == 0) { new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, rec_diff); - ifp->if_broot = kmem_alloc(new_size, KM_NOFS); + ifp->if_broot = kmalloc(new_size, + GFP_NOFS | __GFP_NOFAIL); ifp->if_broot_bytes = (int)new_size; return; } @@ -440,7 +441,7 @@ xfs_iroot_realloc( else new_size = 0; if (new_size > 0) { - new_broot = kmem_alloc(new_size, KM_NOFS); + new_broot = kmalloc(new_size, GFP_NOFS | __GFP_NOFAIL); /* * First copy over the btree block header. */ @@ -488,7 +489,7 @@ xfs_iroot_realloc( * * If the amount of space needed has decreased below the size of the * inline buffer, then switch to using the inline buffer. Otherwise, - * use kmem_realloc() or kmem_alloc() to adjust the size of the buffer + * use krealloc() or kmalloc() to adjust the size of the buffer * to what is needed. * * ip -- the inode whose if_data area is changing diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c index e368ad671e26..5f7a44d21cc9 100644 --- a/fs/xfs/xfs_attr_list.c +++ b/fs/xfs/xfs_attr_list.c @@ -109,7 +109,7 @@ xfs_attr_shortform_list( * It didn't all fit, so we have to sort everything on hashval. */ sbsize = sf->count * sizeof(*sbuf); - sbp = sbuf = kmem_alloc(sbsize, KM_NOFS); + sbp = sbuf = kmalloc(sbsize, GFP_NOFS | __GFP_NOFAIL); /* * Scan the attribute list for the rest of the entries, storing diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 5672665150ea..6f53eb2d3de0 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -325,14 +325,14 @@ xfs_buf_alloc_kmem( struct xfs_buf *bp, xfs_buf_flags_t flags) { - xfs_km_flags_t kmflag_mask = KM_NOFS; + gfp_t gfp_mask = GFP_NOFS | __GFP_NOFAIL; size_t size = BBTOB(bp->b_length); /* Assure zeroed buffer for non-read cases. */ if (!(flags & XBF_READ)) - kmflag_mask |= KM_ZERO; + gfp_mask |= __GFP_ZERO; - bp->b_addr = kmem_alloc(size, kmflag_mask); + bp->b_addr = kmalloc(size, gfp_mask); if (!bp->b_addr) return -ENOMEM; diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c index 43167f543afc..34776f4c05ac 100644 --- a/fs/xfs/xfs_buf_item_recover.c +++ b/fs/xfs/xfs_buf_item_recover.c @@ -85,7 +85,7 @@ xlog_add_buffer_cancelled( return false; } - bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), 0); + bcp = kmalloc(sizeof(struct xfs_buf_cancel), GFP_KERNEL | __GFP_NOFAIL); bcp->bc_blkno = blkno; bcp->bc_len = len; bcp->bc_refcount = 1; diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index 2fc98d313708..e2a3c8d3fe4f 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c @@ -313,7 +313,7 @@ xfs_filestream_create_association( * we return a referenced AG, the allocation can still go ahead just * fine. 
*/ - item = kmem_alloc(sizeof(*item), KM_MAYFAIL); + item = kmalloc(sizeof(*item), GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!item) goto out_put_fstrms; diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c index 144198a6b270..5d7b937179a0 100644 --- a/fs/xfs/xfs_inode_item_recover.c +++ b/fs/xfs/xfs_inode_item_recover.c @@ -291,7 +291,8 @@ xlog_recover_inode_commit_pass2( if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) { in_f = item->ri_buf[0].i_addr; } else { - in_f = kmem_alloc(sizeof(struct xfs_inode_log_format), 0); + in_f = kmalloc(sizeof(struct xfs_inode_log_format), + GFP_KERNEL | __GFP_NOFAIL); need_free = 1; error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f); if (error) diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c index 8dbb7c054b28..5dd622aa54c5 100644 --- a/fs/xfs/xfs_iwalk.c +++ b/fs/xfs/xfs_iwalk.c @@ -160,7 +160,7 @@ xfs_iwalk_alloc( /* Allocate a prefetch buffer for inobt records. */ size = iwag->sz_recs * sizeof(struct xfs_inobt_rec_incore); - iwag->recs = kmem_alloc(size, KM_MAYFAIL); + iwag->recs = kmalloc(size, GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (iwag->recs == NULL) return -ENOMEM; diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 4a27ecdbb546..e3bd503edcab 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2161,7 +2161,7 @@ xlog_recover_add_to_trans( return 0; } - ptr = kmem_alloc(len, 0); + ptr = xlog_kvmalloc(len); memcpy(ptr, dp, len); in_f = (struct xfs_inode_log_format *)ptr; diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 9c2528418b18..6b6b96454449 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -997,7 +997,8 @@ xfs_qm_reset_dqcounts_buf( if (qip->i_nblocks == 0) return 0; - map = kmem_alloc(XFS_DQITER_MAP_SIZE * sizeof(*map), 0); + map = kmalloc(XFS_DQITER_MAP_SIZE * sizeof(*map), + GFP_KERNEL | __GFP_NOFAIL); lblkno = 0; maxlblkcnt = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 8649d981a097..8a8d6197203e 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -903,7 +903,7 @@ xfs_growfs_rt( /* * Allocate a new (fake) mount/sb. */ - nmp = kmem_alloc(sizeof(*nmp), 0); + nmp = kmalloc(sizeof(*nmp), GFP_KERNEL | __GFP_NOFAIL); /* * Loop over the bitmap blocks. * We will do everything one bitmap block at a time. 
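As the commit message notes, __GFP_NOFAIL must not be used for high order allocations, which is why the xlog_recover_add_to_trans() hunk above switches to xlog_kvmalloc() rather than kmalloc(). That helper tries a physically contiguous allocation without retrying and falls back to vmalloc; roughly (a sketch of the existing helper in fs/xfs/xfs_log_priv.h, which may not match the tree verbatim):

static inline void *
xlog_kvmalloc(
	size_t		buf_size)
{
	gfp_t		flags = GFP_KERNEL;
	void		*p;

	/* fail the kmalloc attempt fast rather than reclaiming/retrying */
	flags &= ~__GFP_DIRECT_RECLAIM;
	flags |= __GFP_NOWARN | __GFP_NORETRY;
	do {
		p = kmalloc(buf_size, flags);
		if (!p)
			p = vmalloc(buf_size);
	} while (!p);

	return p;
}

Memory allocated this way may be vmalloc backed, which is why the matching free sites are converted to kvfree() two patches later.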
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 5a2512d20bd0..d167ba00e7cf 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1987,7 +1987,7 @@ static int xfs_init_fs_context( { struct xfs_mount *mp; - mp = kmem_alloc(sizeof(struct xfs_mount), KM_ZERO); + mp = kzalloc(sizeof(struct xfs_mount), GFP_KERNEL | __GFP_NOFAIL); if (!mp) return -ENOMEM; diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 0984a1c884c7..c7e57efe0356 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -4040,31 +4040,6 @@ TRACE_EVENT(xfs_pwork_init, __entry->nr_threads, __entry->pid) ) -DECLARE_EVENT_CLASS(xfs_kmem_class, - TP_PROTO(ssize_t size, int flags, unsigned long caller_ip), - TP_ARGS(size, flags, caller_ip), - TP_STRUCT__entry( - __field(ssize_t, size) - __field(int, flags) - __field(unsigned long, caller_ip) - ), - TP_fast_assign( - __entry->size = size; - __entry->flags = flags; - __entry->caller_ip = caller_ip; - ), - TP_printk("size %zd flags 0x%x caller %pS", - __entry->size, - __entry->flags, - (char *)__entry->caller_ip) -) - -#define DEFINE_KMEM_EVENT(name) \ -DEFINE_EVENT(xfs_kmem_class, name, \ - TP_PROTO(ssize_t size, int flags, unsigned long caller_ip), \ - TP_ARGS(size, flags, caller_ip)) -DEFINE_KMEM_EVENT(kmem_alloc); - TRACE_EVENT(xfs_check_new_dalign, TP_PROTO(struct xfs_mount *mp, int new_dalign, xfs_ino_t calc_rootino), TP_ARGS(mp, new_dalign, calc_rootino), -- cgit v1.2.3 From afdc115559c57a48ad697219cec1d16962ad9e30 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 16 Jan 2024 09:59:41 +1100 Subject: xfs: move kmem_to_page() Move it to the general xfs linux wrapper header file so we can prepare to remove kmem.h Signed-off-by: Dave Chinner Reviewed-by: "Darrick J. Wong" Signed-off-by: Chandan Babu R --- fs/xfs/kmem.h | 11 ----------- fs/xfs/xfs_linux.h | 11 +++++++++++ 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h index 1343f1a6f99b..48e43f29f2a0 100644 --- a/fs/xfs/kmem.h +++ b/fs/xfs/kmem.h @@ -20,15 +20,4 @@ static inline void kmem_free(const void *ptr) kvfree(ptr); } -/* - * Zone interfaces - */ -static inline struct page * -kmem_to_page(void *addr) -{ - if (is_vmalloc_addr(addr)) - return vmalloc_to_page(addr); - return virt_to_page(addr); -} - #endif /* __XFS_SUPPORT_KMEM_H__ */ diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index d7873e0360f0..666618b463c9 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -269,4 +269,15 @@ int xfs_rw_bdev(struct block_device *bdev, sector_t sector, unsigned int count, # define PTR_FMT "%p" #endif +/* + * Helper for IO routines to grab backing pages from allocated kernel memory. + */ +static inline struct page * +kmem_to_page(void *addr) +{ + if (is_vmalloc_addr(addr)) + return vmalloc_to_page(addr); + return virt_to_page(addr); +} + #endif /* __XFS_LINUX__ */ -- cgit v1.2.3 From 49292576136fd2a6b58a51677c53151cf4877fa6 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 16 Jan 2024 09:59:42 +1100 Subject: xfs: convert kmem_free() for kvmalloc users to kvfree() Start getting rid of kmem_free() by converting all the cases where memory can come from vmalloc interfaces to calling kvfree() directly. Signed-off-by: Dave Chinner Reviewed-by: "Darrick J. 
Wong" Signed-off-by: Chandan Babu R --- fs/xfs/xfs_acl.c | 4 ++-- fs/xfs/xfs_attr_item.c | 4 ++-- fs/xfs/xfs_bmap_item.c | 4 ++-- fs/xfs/xfs_buf_item.c | 2 +- fs/xfs/xfs_dquot.c | 2 +- fs/xfs/xfs_extfree_item.c | 4 ++-- fs/xfs/xfs_icreate_item.c | 2 +- fs/xfs/xfs_inode_item.c | 2 +- fs/xfs/xfs_ioctl.c | 2 +- fs/xfs/xfs_log.c | 4 ++-- fs/xfs/xfs_log_cil.c | 2 +- fs/xfs/xfs_log_recover.c | 42 +++++++++++++++++++++--------------------- fs/xfs/xfs_refcount_item.c | 4 ++-- fs/xfs/xfs_rmap_item.c | 4 ++-- fs/xfs/xfs_rtalloc.c | 6 +++--- 15 files changed, 44 insertions(+), 44 deletions(-) diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index 6b840301817a..4bf69c9c088e 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -167,7 +167,7 @@ xfs_get_acl(struct inode *inode, int type, bool rcu) acl = ERR_PTR(error); } - kmem_free(args.value); + kvfree(args.value); return acl; } @@ -204,7 +204,7 @@ __xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) } error = xfs_attr_change(&args); - kmem_free(args.value); + kvfree(args.value); /* * If the attribute didn't exist to start with that's fine. diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index 2e454a0d6f19..f7ba80d575d4 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -108,7 +108,7 @@ STATIC void xfs_attri_item_free( struct xfs_attri_log_item *attrip) { - kmem_free(attrip->attri_item.li_lv_shadow); + kvfree(attrip->attri_item.li_lv_shadow); xfs_attri_log_nameval_put(attrip->attri_nameval); kmem_cache_free(xfs_attri_cache, attrip); } @@ -251,7 +251,7 @@ static inline struct xfs_attrd_log_item *ATTRD_ITEM(struct xfs_log_item *lip) STATIC void xfs_attrd_item_free(struct xfs_attrd_log_item *attrdp) { - kmem_free(attrdp->attrd_item.li_lv_shadow); + kvfree(attrdp->attrd_item.li_lv_shadow); kmem_cache_free(xfs_attrd_cache, attrdp); } diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index 52fb8a148b7d..029a6a8d0efd 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -40,7 +40,7 @@ STATIC void xfs_bui_item_free( struct xfs_bui_log_item *buip) { - kmem_free(buip->bui_item.li_lv_shadow); + kvfree(buip->bui_item.li_lv_shadow); kmem_cache_free(xfs_bui_cache, buip); } @@ -201,7 +201,7 @@ xfs_bud_item_release( struct xfs_bud_log_item *budp = BUD_ITEM(lip); xfs_bui_release(budp->bud_buip); - kmem_free(budp->bud_item.li_lv_shadow); + kvfree(budp->bud_item.li_lv_shadow); kmem_cache_free(xfs_bud_cache, budp); } diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index ec93d34188c8..545040c6ae87 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -1044,7 +1044,7 @@ xfs_buf_item_free( struct xfs_buf_log_item *bip) { xfs_buf_item_free_format(bip); - kmem_free(bip->bli_item.li_lv_shadow); + kvfree(bip->bli_item.li_lv_shadow); kmem_cache_free(xfs_buf_item_cache, bip); } diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index b4f20d9c8f98..e38fe4c8f25b 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -53,7 +53,7 @@ xfs_qm_dqdestroy( { ASSERT(list_empty(&dqp->q_lru)); - kmem_free(dqp->q_logitem.qli_item.li_lv_shadow); + kvfree(dqp->q_logitem.qli_item.li_lv_shadow); mutex_destroy(&dqp->q_qlock); XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot); diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 1d1185fca6a5..6062703a2723 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -40,7 +40,7 @@ STATIC void xfs_efi_item_free( struct xfs_efi_log_item *efip) { - kmem_free(efip->efi_item.li_lv_shadow); + kvfree(efip->efi_item.li_lv_shadow); if 
(efip->efi_format.efi_nextents > XFS_EFI_MAX_FAST_EXTENTS) kmem_free(efip); else @@ -229,7 +229,7 @@ static inline struct xfs_efd_log_item *EFD_ITEM(struct xfs_log_item *lip) STATIC void xfs_efd_item_free(struct xfs_efd_log_item *efdp) { - kmem_free(efdp->efd_item.li_lv_shadow); + kvfree(efdp->efd_item.li_lv_shadow); if (efdp->efd_format.efd_nextents > XFS_EFD_MAX_FAST_EXTENTS) kmem_free(efdp); else diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c index b05314d48176..4345db501714 100644 --- a/fs/xfs/xfs_icreate_item.c +++ b/fs/xfs/xfs_icreate_item.c @@ -63,7 +63,7 @@ STATIC void xfs_icreate_item_release( struct xfs_log_item *lip) { - kmem_free(ICR_ITEM(lip)->ic_item.li_lv_shadow); + kvfree(ICR_ITEM(lip)->ic_item.li_lv_shadow); kmem_cache_free(xfs_icreate_cache, ICR_ITEM(lip)); } diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 0aee97ba0be8..bfbeafc8e120 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -856,7 +856,7 @@ xfs_inode_item_destroy( ASSERT(iip->ili_item.li_buf == NULL); ip->i_itemp = NULL; - kmem_free(iip->ili_item.li_lv_shadow); + kvfree(iip->ili_item.li_lv_shadow); kmem_cache_free(xfs_ili_cache, iip); } diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index f02b6e558af5..45fb169bd819 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -493,7 +493,7 @@ xfs_attrmulti_attr_get( error = -EFAULT; out_kfree: - kmem_free(args.value); + kvfree(args.value); return error; } diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index d38cfaadc726..0009ffbec932 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1662,7 +1662,7 @@ out_destroy_workqueue: out_free_iclog: for (iclog = log->l_iclog; iclog; iclog = prev_iclog) { prev_iclog = iclog->ic_next; - kmem_free(iclog->ic_data); + kvfree(iclog->ic_data); kmem_free(iclog); if (prev_iclog == log->l_iclog) break; @@ -2119,7 +2119,7 @@ xlog_dealloc_log( iclog = log->l_iclog; for (i = 0; i < log->l_iclog_bufs; i++) { next_iclog = iclog->ic_next; - kmem_free(iclog->ic_data); + kvfree(iclog->ic_data); kmem_free(iclog); iclog = next_iclog; } diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 3c705f22b0ab..2c0512916cc9 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -339,7 +339,7 @@ xlog_cil_alloc_shadow_bufs( * the buffer, only the log vector header and the iovec * storage. */ - kmem_free(lip->li_lv_shadow); + kvfree(lip->li_lv_shadow); lv = xlog_kvmalloc(buf_size); memset(lv, 0, xlog_cil_iovec_space(niovecs)); diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index e3bd503edcab..295306ef6959 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -361,7 +361,7 @@ xlog_find_verify_cycle( *new_blk = -1; out: - kmem_free(buffer); + kvfree(buffer); return error; } @@ -477,7 +477,7 @@ xlog_find_verify_log_record( *last_blk = i; out: - kmem_free(buffer); + kvfree(buffer); return error; } @@ -731,7 +731,7 @@ validate_head: goto out_free_buffer; } - kmem_free(buffer); + kvfree(buffer); if (head_blk == log_bbnum) *return_head_blk = 0; else @@ -745,7 +745,7 @@ validate_head: return 0; out_free_buffer: - kmem_free(buffer); + kvfree(buffer); if (error) xfs_warn(log->l_mp, "failed to find log head"); return error; @@ -999,7 +999,7 @@ xlog_verify_tail( "Tail block (0x%llx) overwrite detected. 
Updated to 0x%llx", orig_tail, *tail_blk); out: - kmem_free(buffer); + kvfree(buffer); return error; } @@ -1046,7 +1046,7 @@ xlog_verify_head( error = xlog_rseek_logrec_hdr(log, *head_blk, *tail_blk, XLOG_MAX_ICLOGS, tmp_buffer, &tmp_rhead_blk, &tmp_rhead, &tmp_wrapped); - kmem_free(tmp_buffer); + kvfree(tmp_buffer); if (error < 0) return error; @@ -1365,7 +1365,7 @@ xlog_find_tail( error = xlog_clear_stale_blocks(log, tail_lsn); done: - kmem_free(buffer); + kvfree(buffer); if (error) xfs_warn(log->l_mp, "failed to locate log tail"); @@ -1399,6 +1399,7 @@ xlog_find_zeroed( xfs_daddr_t new_blk, last_blk, start_blk; xfs_daddr_t num_scan_bblks; int error, log_bbnum = log->l_logBBsize; + int ret = 1; *blk_no = 0; @@ -1413,8 +1414,7 @@ xlog_find_zeroed( first_cycle = xlog_get_cycle(offset); if (first_cycle == 0) { /* completely zeroed log */ *blk_no = 0; - kmem_free(buffer); - return 1; + goto out_free_buffer; } /* check partially zeroed log */ @@ -1424,8 +1424,8 @@ xlog_find_zeroed( last_cycle = xlog_get_cycle(offset); if (last_cycle != 0) { /* log completely written to */ - kmem_free(buffer); - return 0; + ret = 0; + goto out_free_buffer; } /* we have a partially zeroed log */ @@ -1471,10 +1471,10 @@ xlog_find_zeroed( *blk_no = last_blk; out_free_buffer: - kmem_free(buffer); + kvfree(buffer); if (error) return error; - return 1; + return ret; } /* @@ -1583,7 +1583,7 @@ xlog_write_log_records( } out_free_buffer: - kmem_free(buffer); + kvfree(buffer); return error; } @@ -2183,7 +2183,7 @@ xlog_recover_add_to_trans( "bad number of regions (%d) in inode log format", in_f->ilf_size); ASSERT(0); - kmem_free(ptr); + kvfree(ptr); return -EFSCORRUPTED; } @@ -2197,7 +2197,7 @@ xlog_recover_add_to_trans( "log item region count (%d) overflowed size (%d)", item->ri_cnt, item->ri_total); ASSERT(0); - kmem_free(ptr); + kvfree(ptr); return -EFSCORRUPTED; } @@ -2227,7 +2227,7 @@ xlog_recover_free_trans( /* Free the regions in the item. 
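These recovery buffers may come from either a slab allocation or a vmalloc-based one such as xlog_kvmalloc(), so the free side cannot assume slab memory; kvfree() handles both by checking the address type before dispatching. Conceptually (a sketch of the generic mm behaviour, not XFS code):

void kvfree(const void *addr)
{
	if (is_vmalloc_addr(addr))
		vfree(addr);
	else
		kfree(addr);
}

This is also what made the old kmem_free() wrapper a straight alias for kvfree(), and why the remaining heap-only callers can be converted to plain kfree() in the final patch.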
*/ list_del(&item->ri_list); for (i = 0; i < item->ri_cnt; i++) - kmem_free(item->ri_buf[i].i_addr); + kvfree(item->ri_buf[i].i_addr); /* Free the item itself */ kmem_free(item->ri_buf); kmem_free(item); @@ -3024,7 +3024,7 @@ xlog_do_recovery_pass( hblks = xlog_logrec_hblks(log, rhead); if (hblks != 1) { - kmem_free(hbp); + kvfree(hbp); hbp = xlog_alloc_buffer(log, hblks); } } else { @@ -3038,7 +3038,7 @@ xlog_do_recovery_pass( return -ENOMEM; dbp = xlog_alloc_buffer(log, BTOBB(h_size)); if (!dbp) { - kmem_free(hbp); + kvfree(hbp); return -ENOMEM; } @@ -3199,9 +3199,9 @@ xlog_do_recovery_pass( } bread_err2: - kmem_free(dbp); + kvfree(dbp); bread_err1: - kmem_free(hbp); + kvfree(hbp); /* * Submit buffers that have been added from the last record processed, diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index 78d0cda60abf..a9b322e23cfb 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -36,7 +36,7 @@ STATIC void xfs_cui_item_free( struct xfs_cui_log_item *cuip) { - kmem_free(cuip->cui_item.li_lv_shadow); + kvfree(cuip->cui_item.li_lv_shadow); if (cuip->cui_format.cui_nextents > XFS_CUI_MAX_FAST_EXTENTS) kmem_free(cuip); else @@ -207,7 +207,7 @@ xfs_cud_item_release( struct xfs_cud_log_item *cudp = CUD_ITEM(lip); xfs_cui_release(cudp->cud_cuip); - kmem_free(cudp->cud_item.li_lv_shadow); + kvfree(cudp->cud_item.li_lv_shadow); kmem_cache_free(xfs_cud_cache, cudp); } diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index 31a921fc34b2..489ca8c0e1dc 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -36,7 +36,7 @@ STATIC void xfs_rui_item_free( struct xfs_rui_log_item *ruip) { - kmem_free(ruip->rui_item.li_lv_shadow); + kvfree(ruip->rui_item.li_lv_shadow); if (ruip->rui_format.rui_nextents > XFS_RUI_MAX_FAST_EXTENTS) kmem_free(ruip); else @@ -206,7 +206,7 @@ xfs_rud_item_release( struct xfs_rud_log_item *rudp = RUD_ITEM(lip); xfs_rui_release(rudp->rud_ruip); - kmem_free(rudp->rud_item.li_lv_shadow); + kvfree(rudp->rud_item.li_lv_shadow); kmem_cache_free(xfs_rud_cache, rudp); } diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 8a8d6197203e..57ed9baaf156 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -1059,10 +1059,10 @@ out_free: */ if (rsum_cache != mp->m_rsum_cache) { if (error) { - kmem_free(mp->m_rsum_cache); + kvfree(mp->m_rsum_cache); mp->m_rsum_cache = rsum_cache; } else { - kmem_free(rsum_cache); + kvfree(rsum_cache); } } @@ -1233,7 +1233,7 @@ void xfs_rtunmount_inodes( struct xfs_mount *mp) { - kmem_free(mp->m_rsum_cache); + kvfree(mp->m_rsum_cache); if (mp->m_rbmip) xfs_irele(mp->m_rbmip); if (mp->m_rsumip) -- cgit v1.2.3 From d4c75a1b40cd036a84d98e2711db9cf30eaaaf5f Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 16 Jan 2024 09:59:43 +1100 Subject: xfs: convert remaining kmem_free() to kfree() The remaining callers of kmem_free() are freeing heap memory, so we can convert them directly to kfree() and get rid of kmem_free() altogether. This conversion was done with: $ for f in `git grep -l kmem_free fs/xfs`; do > sed -i s/kmem_free/kfree/ $f > done $ Signed-off-by: Dave Chinner Reviewed-by: "Darrick J. 
Wong" Signed-off-by: Chandan Babu R --- fs/xfs/kmem.h | 23 ----------------------- fs/xfs/libxfs/xfs_ag.c | 6 +++--- fs/xfs/libxfs/xfs_attr_leaf.c | 8 ++++---- fs/xfs/libxfs/xfs_btree.c | 2 +- fs/xfs/libxfs/xfs_btree_staging.c | 4 ++-- fs/xfs/libxfs/xfs_da_btree.c | 10 +++++----- fs/xfs/libxfs/xfs_defer.c | 4 ++-- fs/xfs/libxfs/xfs_dir2.c | 18 +++++++++--------- fs/xfs/libxfs/xfs_dir2_block.c | 4 ++-- fs/xfs/libxfs/xfs_dir2_sf.c | 8 ++++---- fs/xfs/libxfs/xfs_iext_tree.c | 8 ++++---- fs/xfs/libxfs/xfs_inode_fork.c | 6 +++--- fs/xfs/scrub/cow_repair.c | 2 +- fs/xfs/xfs_attr_item.c | 2 +- fs/xfs/xfs_attr_list.c | 4 ++-- fs/xfs/xfs_buf.c | 12 ++++++------ fs/xfs/xfs_buf_item.c | 2 +- fs/xfs/xfs_buf_item_recover.c | 6 +++--- fs/xfs/xfs_discard.c | 2 +- fs/xfs/xfs_error.c | 4 ++-- fs/xfs/xfs_extent_busy.c | 2 +- fs/xfs/xfs_extfree_item.c | 4 ++-- fs/xfs/xfs_filestream.c | 4 ++-- fs/xfs/xfs_inode.c | 4 ++-- fs/xfs/xfs_inode_item_recover.c | 2 +- fs/xfs/xfs_ioctl.c | 6 +++--- fs/xfs/xfs_iops.c | 2 +- fs/xfs/xfs_itable.c | 4 ++-- fs/xfs/xfs_iwalk.c | 4 ++-- fs/xfs/xfs_linux.h | 3 +-- fs/xfs/xfs_log.c | 8 ++++---- fs/xfs/xfs_log_cil.c | 14 +++++++------- fs/xfs/xfs_log_recover.c | 6 +++--- fs/xfs/xfs_mount.c | 2 +- fs/xfs/xfs_mru_cache.c | 8 ++++---- fs/xfs/xfs_qm.c | 6 +++--- fs/xfs/xfs_refcount_item.c | 2 +- fs/xfs/xfs_rmap_item.c | 2 +- fs/xfs/xfs_rtalloc.c | 2 +- fs/xfs/xfs_super.c | 2 +- fs/xfs/xfs_trans_ail.c | 4 ++-- 41 files changed, 101 insertions(+), 125 deletions(-) delete mode 100644 fs/xfs/kmem.h diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h deleted file mode 100644 index 48e43f29f2a0..000000000000 --- a/fs/xfs/kmem.h +++ /dev/null @@ -1,23 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright (c) 2000-2005 Silicon Graphics, Inc. - * All Rights Reserved. 
- */ -#ifndef __XFS_SUPPORT_KMEM_H__ -#define __XFS_SUPPORT_KMEM_H__ - -#include -#include -#include -#include - -/* - * General memory allocation interfaces - */ - -static inline void kmem_free(const void *ptr) -{ - kvfree(ptr); -} - -#endif /* __XFS_SUPPORT_KMEM_H__ */ diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index 96a6bfd58931..937ea48d5cc0 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -241,7 +241,7 @@ __xfs_free_perag( struct xfs_perag *pag = container_of(head, struct xfs_perag, rcu_head); ASSERT(!delayed_work_pending(&pag->pag_blockgc_work)); - kmem_free(pag); + kfree(pag); } /* @@ -353,7 +353,7 @@ xfs_free_unused_perag_range( break; xfs_buf_hash_destroy(pag); xfs_defer_drain_free(&pag->pag_intents_drain); - kmem_free(pag); + kfree(pag); } } @@ -453,7 +453,7 @@ out_remove_pag: radix_tree_delete(&mp->m_perag_tree, index); spin_unlock(&mp->m_perag_lock); out_free_pag: - kmem_free(pag); + kfree(pag); out_unwind_new_pags: /* unwind any prior newly initialized pags */ xfs_free_unused_perag_range(mp, first_initialised, agcount); diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 033382cf514d..192d9938a231 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -923,7 +923,7 @@ xfs_attr_shortform_to_leaf( } error = 0; out: - kmem_free(tmpbuffer); + kfree(tmpbuffer); return error; } @@ -1124,7 +1124,7 @@ xfs_attr3_leaf_to_shortform( error = 0; out: - kmem_free(tmpbuffer); + kfree(tmpbuffer); return error; } @@ -1570,7 +1570,7 @@ xfs_attr3_leaf_compact( */ xfs_trans_log_buf(trans, bp, 0, args->geo->blksize - 1); - kmem_free(tmpbuffer); + kfree(tmpbuffer); } /* @@ -2290,7 +2290,7 @@ xfs_attr3_leaf_unbalance( } memcpy(save_leaf, tmp_leaf, state->args->geo->blksize); savehdr = tmphdr; /* struct copy */ - kmem_free(tmp_leaf); + kfree(tmp_leaf); } xfs_attr3_leaf_hdr_to_disk(state->args->geo, save_leaf, &savehdr); diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index ea8d3659df20..1adfc35c99c9 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -451,7 +451,7 @@ xfs_btree_del_cursor( ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP || cur->bc_ino.allocated == 0 || xfs_is_shutdown(cur->bc_mp) || error != 0); if (unlikely(cur->bc_flags & XFS_BTREE_STAGING)) - kmem_free(cur->bc_ops); + kfree(cur->bc_ops); if (!(cur->bc_flags & XFS_BTREE_LONG_PTRS) && cur->bc_ag.pag) xfs_perag_put(cur->bc_ag.pag); kmem_cache_free(cur->bc_cache, cur); diff --git a/fs/xfs/libxfs/xfs_btree_staging.c b/fs/xfs/libxfs/xfs_btree_staging.c index 065e4a00a2f4..961f6b898f4b 100644 --- a/fs/xfs/libxfs/xfs_btree_staging.c +++ b/fs/xfs/libxfs/xfs_btree_staging.c @@ -171,7 +171,7 @@ xfs_btree_commit_afakeroot( trace_xfs_btree_commit_afakeroot(cur); - kmem_free((void *)cur->bc_ops); + kfree((void *)cur->bc_ops); cur->bc_ag.agbp = agbp; cur->bc_ops = ops; cur->bc_flags &= ~XFS_BTREE_STAGING; @@ -254,7 +254,7 @@ xfs_btree_commit_ifakeroot( trace_xfs_btree_commit_ifakeroot(cur); - kmem_free((void *)cur->bc_ops); + kfree((void *)cur->bc_ops); cur->bc_ino.ifake = NULL; cur->bc_ino.whichfork = whichfork; cur->bc_ops = ops; diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index 331b9251b185..3383b4525381 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -2220,7 +2220,7 @@ xfs_da_grow_inode_int( out_free_map: if (mapp != &map) - kmem_free(mapp); + kfree(mapp); return error; } @@ -2559,7 +2559,7 @@ xfs_dabuf_map( *nmaps = nirecs; out_free_irecs: if (irecs != &irec) 
- kmem_free(irecs); + kfree(irecs); return error; invalid_mapping: @@ -2615,7 +2615,7 @@ xfs_da_get_buf( out_free: if (mapp != &map) - kmem_free(mapp); + kfree(mapp); return error; } @@ -2656,7 +2656,7 @@ xfs_da_read_buf( *bpp = bp; out_free: if (mapp != &map) - kmem_free(mapp); + kfree(mapp); return error; } @@ -2687,7 +2687,7 @@ xfs_da_reada_buf( out_free: if (mapp != &map) - kmem_free(mapp); + kfree(mapp); return error; } diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index 07d318b1f807..75689c151a54 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -1038,7 +1038,7 @@ xfs_defer_ops_capture_abort( for (i = 0; i < dfc->dfc_held.dr_inos; i++) xfs_irele(dfc->dfc_held.dr_ip[i]); - kmem_free(dfc); + kfree(dfc); } /* @@ -1114,7 +1114,7 @@ xfs_defer_ops_continue( list_splice_init(&dfc->dfc_dfops, &tp->t_dfops); tp->t_flags |= dfc->dfc_tpflags; - kmem_free(dfc); + kfree(dfc); } /* Release the resources captured and continued during recovery. */ diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index 370d67300455..e60aa8f8d0a7 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -109,8 +109,8 @@ xfs_da_mount( mp->m_attr_geo = kzalloc(sizeof(struct xfs_da_geometry), GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!mp->m_dir_geo || !mp->m_attr_geo) { - kmem_free(mp->m_dir_geo); - kmem_free(mp->m_attr_geo); + kfree(mp->m_dir_geo); + kfree(mp->m_attr_geo); return -ENOMEM; } @@ -178,8 +178,8 @@ void xfs_da_unmount( struct xfs_mount *mp) { - kmem_free(mp->m_dir_geo); - kmem_free(mp->m_attr_geo); + kfree(mp->m_dir_geo); + kfree(mp->m_attr_geo); } /* @@ -244,7 +244,7 @@ xfs_dir_init( args->dp = dp; args->trans = tp; error = xfs_dir2_sf_create(args, pdp->i_ino); - kmem_free(args); + kfree(args); return error; } @@ -313,7 +313,7 @@ xfs_dir_createname( rval = xfs_dir2_node_addname(args); out_free: - kmem_free(args); + kfree(args); return rval; } @@ -419,7 +419,7 @@ out_check_rval: } out_free: xfs_iunlock(dp, lock_mode); - kmem_free(args); + kfree(args); return rval; } @@ -477,7 +477,7 @@ xfs_dir_removename( else rval = xfs_dir2_node_removename(args); out_free: - kmem_free(args); + kfree(args); return rval; } @@ -538,7 +538,7 @@ xfs_dir_replace( else rval = xfs_dir2_node_replace(args); out_free: - kmem_free(args); + kfree(args); return rval; } diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c index 506c65caaec5..fde46081a824 100644 --- a/fs/xfs/libxfs/xfs_dir2_block.c +++ b/fs/xfs/libxfs/xfs_dir2_block.c @@ -1253,7 +1253,7 @@ xfs_dir2_sf_to_block( sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep); } /* Done with the temporary buffer */ - kmem_free(sfp); + kfree(sfp); /* * Sort the leaf entries by hash value. */ @@ -1268,6 +1268,6 @@ xfs_dir2_sf_to_block( xfs_dir3_data_check(dp, bp); return 0; out_free: - kmem_free(sfp); + kfree(sfp); return error; } diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c index 7b1f41cff9e0..17a20384c8b7 100644 --- a/fs/xfs/libxfs/xfs_dir2_sf.c +++ b/fs/xfs/libxfs/xfs_dir2_sf.c @@ -350,7 +350,7 @@ xfs_dir2_block_to_sf( xfs_dir2_sf_check(args); out: xfs_trans_log_inode(args->trans, dp, logflags); - kmem_free(sfp); + kfree(sfp); return error; } @@ -576,7 +576,7 @@ xfs_dir2_sf_addname_hard( sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep); memcpy(sfep, oldsfep, old_isize - nbytes); } - kmem_free(buf); + kfree(buf); dp->i_disk_size = new_isize; xfs_dir2_sf_check(args); } @@ -1190,7 +1190,7 @@ xfs_dir2_sf_toino4( /* * Clean up the inode. 
*/ - kmem_free(buf); + kfree(buf); dp->i_disk_size = newsize; xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA); } @@ -1262,7 +1262,7 @@ xfs_dir2_sf_toino8( /* * Clean up the inode. */ - kmem_free(buf); + kfree(buf); dp->i_disk_size = newsize; xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA); } diff --git a/fs/xfs/libxfs/xfs_iext_tree.c b/fs/xfs/libxfs/xfs_iext_tree.c index 4522f3c7a23f..16f18b08fe4c 100644 --- a/fs/xfs/libxfs/xfs_iext_tree.c +++ b/fs/xfs/libxfs/xfs_iext_tree.c @@ -747,7 +747,7 @@ xfs_iext_remove_node( again: ASSERT(node->ptrs[pos]); ASSERT(node->ptrs[pos] == victim); - kmem_free(victim); + kfree(victim); nr_entries = xfs_iext_node_nr_entries(node, pos) - 1; offset = node->keys[0]; @@ -793,7 +793,7 @@ again: ASSERT(node == ifp->if_data); ifp->if_data = node->ptrs[0]; ifp->if_height--; - kmem_free(node); + kfree(node); } } @@ -867,7 +867,7 @@ xfs_iext_free_last_leaf( struct xfs_ifork *ifp) { ifp->if_height--; - kmem_free(ifp->if_data); + kfree(ifp->if_data); ifp->if_data = NULL; } @@ -1048,7 +1048,7 @@ xfs_iext_destroy_node( } } - kmem_free(node); + kfree(node); } void diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index f3cf7f933e15..f6d5b86b608d 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -471,7 +471,7 @@ xfs_iroot_realloc( (int)new_size); memcpy(np, op, new_max * (uint)sizeof(xfs_fsblock_t)); } - kmem_free(ifp->if_broot); + kfree(ifp->if_broot); ifp->if_broot = new_broot; ifp->if_broot_bytes = (int)new_size; if (ifp->if_broot) @@ -525,13 +525,13 @@ xfs_idestroy_fork( struct xfs_ifork *ifp) { if (ifp->if_broot != NULL) { - kmem_free(ifp->if_broot); + kfree(ifp->if_broot); ifp->if_broot = NULL; } switch (ifp->if_format) { case XFS_DINODE_FMT_LOCAL: - kmem_free(ifp->if_data); + kfree(ifp->if_data); ifp->if_data = NULL; break; case XFS_DINODE_FMT_EXTENTS: diff --git a/fs/xfs/scrub/cow_repair.c b/fs/xfs/scrub/cow_repair.c index 1e82c727af8e..4de3f0f40f48 100644 --- a/fs/xfs/scrub/cow_repair.c +++ b/fs/xfs/scrub/cow_repair.c @@ -609,6 +609,6 @@ xrep_bmap_cow( out_bitmap: xfsb_bitmap_destroy(&xc->old_cowfork_fsblocks); xoff_bitmap_destroy(&xc->bad_fileoffs); - kmem_free(xc); + kfree(xc); return error; } diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index f7ba80d575d4..2a142cefdc3d 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -386,7 +386,7 @@ xfs_attr_free_item( xfs_da_state_free(attr->xattri_da_state); xfs_attri_log_nameval_put(attr->xattri_nameval); if (attr->xattri_da_args->op_flags & XFS_DA_OP_RECOVERY) - kmem_free(attr); + kfree(attr); else kmem_cache_free(xfs_attr_intent_cache, attr); } diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c index 5f7a44d21cc9..0318d768520a 100644 --- a/fs/xfs/xfs_attr_list.c +++ b/fs/xfs/xfs_attr_list.c @@ -124,7 +124,7 @@ xfs_attr_shortform_list( XFS_ERRLEVEL_LOW, context->dp->i_mount, sfe, sizeof(*sfe)); - kmem_free(sbuf); + kfree(sbuf); return -EFSCORRUPTED; } @@ -188,7 +188,7 @@ xfs_attr_shortform_list( cursor->offset++; } out: - kmem_free(sbuf); + kfree(sbuf); return error; } diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 6f53eb2d3de0..10bbde3ce7cc 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -204,7 +204,7 @@ xfs_buf_free_maps( struct xfs_buf *bp) { if (bp->b_maps != &bp->__b_map) { - kmem_free(bp->b_maps); + kfree(bp->b_maps); bp->b_maps = NULL; } } @@ -289,7 +289,7 @@ xfs_buf_free_pages( mm_account_reclaimed_pages(bp->b_page_count); if (bp->b_pages != 
bp->b_page_array) - kmem_free(bp->b_pages); + kfree(bp->b_pages); bp->b_pages = NULL; bp->b_flags &= ~_XBF_PAGES; } @@ -315,7 +315,7 @@ xfs_buf_free( if (bp->b_flags & _XBF_PAGES) xfs_buf_free_pages(bp); else if (bp->b_flags & _XBF_KMEM) - kmem_free(bp->b_addr); + kfree(bp->b_addr); call_rcu(&bp->b_rcu, xfs_buf_free_callback); } @@ -339,7 +339,7 @@ xfs_buf_alloc_kmem( if (((unsigned long)(bp->b_addr + size - 1) & PAGE_MASK) != ((unsigned long)bp->b_addr & PAGE_MASK)) { /* b_addr spans two pages - use alloc_page instead */ - kmem_free(bp->b_addr); + kfree(bp->b_addr); bp->b_addr = NULL; return -ENOMEM; } @@ -1953,7 +1953,7 @@ xfs_free_buftarg( if (btp->bt_bdev != btp->bt_mount->m_super->s_bdev) bdev_release(btp->bt_bdev_handle); - kmem_free(btp); + kfree(btp); } int @@ -2045,7 +2045,7 @@ error_pcpu: error_lru: list_lru_destroy(&btp->bt_lru); error_free: - kmem_free(btp); + kfree(btp); return NULL; } diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 545040c6ae87..43031842341a 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -814,7 +814,7 @@ xfs_buf_item_free_format( struct xfs_buf_log_item *bip) { if (bip->bli_formats != &bip->__bli_format) { - kmem_free(bip->bli_formats); + kfree(bip->bli_formats); bip->bli_formats = NULL; } } diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c index 34776f4c05ac..09e893cf563c 100644 --- a/fs/xfs/xfs_buf_item_recover.c +++ b/fs/xfs/xfs_buf_item_recover.c @@ -129,7 +129,7 @@ xlog_put_buffer_cancelled( if (--bcp->bc_refcount == 0) { list_del(&bcp->bc_list); - kmem_free(bcp); + kfree(bcp); } return true; } @@ -1062,10 +1062,10 @@ xlog_free_buf_cancel_table( &log->l_buf_cancel_table[i], struct xfs_buf_cancel, bc_list))) { list_del(&bc->bc_list); - kmem_free(bc); + kfree(bc); } } - kmem_free(log->l_buf_cancel_table); + kfree(log->l_buf_cancel_table); log->l_buf_cancel_table = NULL; } diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index d5787991bb5b..8539f5c9a774 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -79,7 +79,7 @@ xfs_discard_endio_work( container_of(work, struct xfs_busy_extents, endio_work); xfs_extent_busy_clear(extents->mount, &extents->extent_list, false); - kmem_free(extents->owner); + kfree(extents->owner); } /* diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index 456520d60cd0..7ad0e92c6b5b 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -248,7 +248,7 @@ xfs_errortag_init( ret = xfs_sysfs_init(&mp->m_errortag_kobj, &xfs_errortag_ktype, &mp->m_kobj, "errortag"); if (ret) - kmem_free(mp->m_errortag); + kfree(mp->m_errortag); return ret; } @@ -257,7 +257,7 @@ xfs_errortag_del( struct xfs_mount *mp) { xfs_sysfs_del(&mp->m_errortag_kobj); - kmem_free(mp->m_errortag); + kfree(mp->m_errortag); } static bool diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c index b90c3dd43e03..56cfa1498571 100644 --- a/fs/xfs/xfs_extent_busy.c +++ b/fs/xfs/xfs_extent_busy.c @@ -531,7 +531,7 @@ xfs_extent_busy_clear_one( } list_del_init(&busyp->list); - kmem_free(busyp); + kfree(busyp); } static void diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 6062703a2723..8c382f092332 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -42,7 +42,7 @@ xfs_efi_item_free( { kvfree(efip->efi_item.li_lv_shadow); if (efip->efi_format.efi_nextents > XFS_EFI_MAX_FAST_EXTENTS) - kmem_free(efip); + kfree(efip); else kmem_cache_free(xfs_efi_cache, efip); } @@ -231,7 +231,7 @@ xfs_efd_item_free(struct xfs_efd_log_item *efdp) { 
kvfree(efdp->efd_item.li_lv_shadow); if (efdp->efd_format.efd_nextents > XFS_EFD_MAX_FAST_EXTENTS) - kmem_free(efdp); + kfree(efdp); else kmem_cache_free(xfs_efd_cache, efdp); } diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index e2a3c8d3fe4f..e3aaa0555597 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c @@ -44,7 +44,7 @@ xfs_fstrm_free_func( atomic_dec(&pag->pagf_fstrms); xfs_perag_rele(pag); - kmem_free(item); + kfree(item); } /* @@ -326,7 +326,7 @@ xfs_filestream_create_association( out_free_item: xfs_perag_rele(item->pag); - kmem_free(item); + kfree(item); out_put_fstrms: atomic_dec(&args->pag->pagf_fstrms); return 0; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 1fd94958aa97..37ec247edc13 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -671,7 +671,7 @@ xfs_lookup( out_free_name: if (ci_name) - kmem_free(ci_name->name); + kfree(ci_name->name); out_unlock: *ipp = NULL; return error; @@ -2378,7 +2378,7 @@ xfs_ifree( * already been freed by xfs_attr_inactive. */ if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL) { - kmem_free(ip->i_df.if_data); + kfree(ip->i_df.if_data); ip->i_df.if_data = NULL; ip->i_df.if_bytes = 0; } diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c index 5d7b937179a0..dbdab4ce7c44 100644 --- a/fs/xfs/xfs_inode_item_recover.c +++ b/fs/xfs/xfs_inode_item_recover.c @@ -554,7 +554,7 @@ out_release: xfs_buf_relse(bp); error: if (need_free) - kmem_free(in_f); + kfree(in_f); return error; } diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 45fb169bd819..7eeebcb6b925 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -435,7 +435,7 @@ xfs_ioc_attr_list( copy_to_user(ucursor, &context.cursor, sizeof(context.cursor))) error = -EFAULT; out_free: - kmem_free(buffer); + kfree(buffer); return error; } @@ -1506,7 +1506,7 @@ xfs_ioc_getbmap( error = 0; out_free_buf: - kmem_free(buf); + kfree(buf); return error; } @@ -1636,7 +1636,7 @@ xfs_ioc_getfsmap( } out_free: - kmem_free(recs); + kfree(recs); return error; } diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index a0d77f5f512e..be102fd49560 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -346,7 +346,7 @@ xfs_vn_ci_lookup( dname.name = ci_name.name; dname.len = ci_name.len; dentry = d_add_ci(dentry, VFS_I(ip), &dname); - kmem_free(ci_name.name); + kfree(ci_name.name); return dentry; } diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 14211174267a..95fc31b9f87d 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -214,7 +214,7 @@ xfs_bulkstat_one( breq->startino, &bc); xfs_trans_cancel(tp); out: - kmem_free(bc.buf); + kfree(bc.buf); /* * If we reported one inode to userspace then we abort because we hit @@ -309,7 +309,7 @@ xfs_bulkstat( xfs_bulkstat_iwalk, breq->icount, &bc); xfs_trans_cancel(tp); out: - kmem_free(bc.buf); + kfree(bc.buf); /* * We found some inodes, so clear the error status and return them. 
diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c index 5dd622aa54c5..6d2eb6364867 100644 --- a/fs/xfs/xfs_iwalk.c +++ b/fs/xfs/xfs_iwalk.c @@ -172,7 +172,7 @@ STATIC void xfs_iwalk_free( struct xfs_iwalk_ag *iwag) { - kmem_free(iwag->recs); + kfree(iwag->recs); iwag->recs = NULL; } @@ -627,7 +627,7 @@ xfs_iwalk_ag_work( xfs_iwalk_free(iwag); out: xfs_perag_put(iwag->pag); - kmem_free(iwag); + kfree(iwag); return error; } diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index 666618b463c9..caccb7f76690 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -20,8 +20,6 @@ typedef __u32 xfs_dev_t; typedef __u32 xfs_nlink_t; #include "xfs_types.h" - -#include "kmem.h" #include "mrlock.h" #include @@ -30,6 +28,7 @@ typedef __u32 xfs_nlink_t; #include #include #include +#include #include #include #include diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 0009ffbec932..ee39639bb92b 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1663,12 +1663,12 @@ out_free_iclog: for (iclog = log->l_iclog; iclog; iclog = prev_iclog) { prev_iclog = iclog->ic_next; kvfree(iclog->ic_data); - kmem_free(iclog); + kfree(iclog); if (prev_iclog == log->l_iclog) break; } out_free_log: - kmem_free(log); + kfree(log); out: return ERR_PTR(error); } /* xlog_alloc_log */ @@ -2120,13 +2120,13 @@ xlog_dealloc_log( for (i = 0; i < log->l_iclog_bufs; i++) { next_iclog = iclog->ic_next; kvfree(iclog->ic_data); - kmem_free(iclog); + kfree(iclog); iclog = next_iclog; } log->l_mp->m_log = NULL; destroy_workqueue(log->l_ioend_workqueue); - kmem_free(log); + kfree(log); } /* diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 2c0512916cc9..815a2181004c 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -703,7 +703,7 @@ xlog_cil_free_logvec( while (!list_empty(lv_chain)) { lv = list_first_entry(lv_chain, struct xfs_log_vec, lv_list); list_del_init(&lv->lv_list); - kmem_free(lv); + kfree(lv); } } @@ -753,7 +753,7 @@ xlog_cil_committed( return; } - kmem_free(ctx); + kfree(ctx); } void @@ -1339,7 +1339,7 @@ xlog_cil_push_work( out_skip: up_write(&cil->xc_ctx_lock); xfs_log_ticket_put(new_ctx->ticket); - kmem_free(new_ctx); + kfree(new_ctx); return; out_abort_free_ticket: @@ -1533,7 +1533,7 @@ xlog_cil_process_intents( set_bit(XFS_LI_WHITEOUT, &ilip->li_flags); trace_xfs_cil_whiteout_mark(ilip); len += ilip->li_lv->lv_bytes; - kmem_free(ilip->li_lv); + kfree(ilip->li_lv); ilip->li_lv = NULL; xfs_trans_del_item(lip); @@ -1786,7 +1786,7 @@ xlog_cil_init( out_destroy_wq: destroy_workqueue(cil->xc_push_wq); out_destroy_cil: - kmem_free(cil); + kfree(cil); return -ENOMEM; } @@ -1799,12 +1799,12 @@ xlog_cil_destroy( if (cil->xc_ctx) { if (cil->xc_ctx->ticket) xfs_log_ticket_put(cil->xc_ctx->ticket); - kmem_free(cil->xc_ctx); + kfree(cil->xc_ctx); } ASSERT(test_bit(XLOG_CIL_EMPTY, &cil->xc_flags)); free_percpu(cil->xc_pcp); destroy_workqueue(cil->xc_push_wq); - kmem_free(cil); + kfree(cil); } diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 295306ef6959..e9ed43a833af 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2229,11 +2229,11 @@ xlog_recover_free_trans( for (i = 0; i < item->ri_cnt; i++) kvfree(item->ri_buf[i].i_addr); /* Free the item itself */ - kmem_free(item->ri_buf); - kmem_free(item); + kfree(item->ri_buf); + kfree(item); } /* Free the transaction recover structure */ - kmem_free(trans); + kfree(trans); } /* diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index aabb25dc3efa..7328034d42ed 100644 --- a/fs/xfs/xfs_mount.c +++ 
b/fs/xfs/xfs_mount.c @@ -45,7 +45,7 @@ xfs_uuid_table_free(void) { if (xfs_uuid_table_size == 0) return; - kmem_free(xfs_uuid_table); + kfree(xfs_uuid_table); xfs_uuid_table = NULL; xfs_uuid_table_size = 0; } diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c index feae3115617b..ce496704748d 100644 --- a/fs/xfs/xfs_mru_cache.c +++ b/fs/xfs/xfs_mru_cache.c @@ -365,9 +365,9 @@ xfs_mru_cache_create( exit: if (err && mru && mru->lists) - kmem_free(mru->lists); + kfree(mru->lists); if (err && mru) - kmem_free(mru); + kfree(mru); return err; } @@ -407,8 +407,8 @@ xfs_mru_cache_destroy( xfs_mru_cache_flush(mru); - kmem_free(mru->lists); - kmem_free(mru); + kfree(mru->lists); + kfree(mru); } /* diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 6b6b96454449..6ad558ae853a 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -701,7 +701,7 @@ out_free_inos: out_free_lru: list_lru_destroy(&qinf->qi_lru); out_free_qinf: - kmem_free(qinf); + kfree(qinf); mp->m_quotainfo = NULL; return error; } @@ -725,7 +725,7 @@ xfs_qm_destroy_quotainfo( xfs_qm_destroy_quotainos(qi); mutex_destroy(&qi->qi_tree_lock); mutex_destroy(&qi->qi_quotaofflock); - kmem_free(qi); + kfree(qi); mp->m_quotainfo = NULL; } @@ -1060,7 +1060,7 @@ xfs_qm_reset_dqcounts_buf( } while (nmaps > 0); out: - kmem_free(map); + kfree(map); return error; } diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index a9b322e23cfb..d850b9685f7f 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -38,7 +38,7 @@ xfs_cui_item_free( { kvfree(cuip->cui_item.li_lv_shadow); if (cuip->cui_format.cui_nextents > XFS_CUI_MAX_FAST_EXTENTS) - kmem_free(cuip); + kfree(cuip); else kmem_cache_free(xfs_cui_cache, cuip); } diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index 489ca8c0e1dc..a40b92ac81e8 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -38,7 +38,7 @@ xfs_rui_item_free( { kvfree(ruip->rui_item.li_lv_shadow); if (ruip->rui_format.rui_nextents > XFS_RUI_MAX_FAST_EXTENTS) - kmem_free(ruip); + kfree(ruip); else kmem_cache_free(xfs_rui_cache, ruip); } diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 57ed9baaf156..2f85567f3d75 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -1050,7 +1050,7 @@ out_free: /* * Free the fake mp structure. */ - kmem_free(nmp); + kfree(nmp); /* * If we had to allocate a new rsum_cache, we either need to free the diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index d167ba00e7cf..6ce1e6deb7ec 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -761,7 +761,7 @@ xfs_mount_free( debugfs_remove(mp->m_debugfs); kfree(mp->m_rtname); kfree(mp->m_logname); - kmem_free(mp); + kfree(mp); } STATIC int diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 5f206cdb40ff..e4c343096f95 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -922,7 +922,7 @@ xfs_trans_ail_init( return 0; out_free_ailp: - kmem_free(ailp); + kfree(ailp); return -ENOMEM; } @@ -933,5 +933,5 @@ xfs_trans_ail_destroy( struct xfs_ail *ailp = mp->m_ail; kthread_stop(ailp->ail_task); - kmem_free(ailp); + kfree(ailp); } -- cgit v1.2.3 From 178231af2bdc14954ee300b3b1bd08e06a1333ea Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 16 Jan 2024 09:59:44 +1100 Subject: xfs: use an empty transaction for fstrim We currently use a btree walk in the fstrim code. This requires a btree cursor and btree cursors are only used inside transactions except for the fstrim code. 
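As a concrete reference for that, here is a minimal sketch of the pre-patch pattern being described, condensed by the editor from the xfs_trim_gather_extents() hunk below (error handling abbreviated; mp, pag, agbp and cur are the function's existing locals):

	/* read the AGF and walk the by-count btree with no transaction */
	error = xfs_alloc_read_agf(pag, NULL, 0, &agbp);
	if (error)
		return error;
	cur = xfs_allocbt_init_cursor(mp, NULL, agbp, pag, XFS_BTNUM_CNT);
	/* ... the walk allocates btree infrastructure memory as it goes ... */
	xfs_btree_del_cursor(cur, error);
	xfs_buf_relse(agbp);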
This means that all the btree operations that allocate memory operate in both GFP_KERNEL and GFP_NOFS contexts. This causes problems with lockdep being unable to determine the difference between objects that are safe to lock both above and below memory reclaim. Free space btree buffers are definitely locked both above and below reclaim and that means we have to mark all btree infrastructure allocations with GFP_NOFS to avoid potential lockdep false positives. If we wrap this btree walk in an empty transaction, all btree walks are now done under transaction context and so all allocations inherit GFP_NOFS context from the transaction. This enables us to move all the btree allocations to GFP_KERNEL context and hence help remove the explicit use of GFP_NOFS in XFS. Signed-off-by: Dave Chinner Reviewed-by: "Darrick J. Wong" Signed-off-by: Chandan Babu R --- fs/xfs/xfs_discard.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index 8539f5c9a774..299b8f907292 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -8,6 +8,7 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" +#include "xfs_trans.h" #include "xfs_mount.h" #include "xfs_btree.h" #include "xfs_alloc_btree.h" @@ -120,7 +121,7 @@ xfs_discard_extents( error = __blkdev_issue_discard(mp->m_ddev_targp->bt_bdev, XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno), XFS_FSB_TO_BB(mp, busyp->length), - GFP_NOFS, &bio); + GFP_KERNEL, &bio); if (error && error != -EOPNOTSUPP) { xfs_info(mp, "discard failed for extent [0x%llx,%u], error %d", @@ -155,6 +156,7 @@ xfs_trim_gather_extents( uint64_t *blocks_trimmed) { struct xfs_mount *mp = pag->pag_mount; + struct xfs_trans *tp; struct xfs_btree_cur *cur; struct xfs_buf *agbp; int error; @@ -168,11 +170,15 @@ xfs_trim_gather_extents( */ xfs_log_force(mp, XFS_LOG_SYNC); - error = xfs_alloc_read_agf(pag, NULL, 0, &agbp); + error = xfs_trans_alloc_empty(mp, &tp); if (error) return error; - cur = xfs_allocbt_init_cursor(mp, NULL, agbp, pag, XFS_BTNUM_CNT); + error = xfs_alloc_read_agf(pag, tp, 0, &agbp); + if (error) + goto out_trans_cancel; + + cur = xfs_allocbt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_CNT); /* * Look up the extent length requested in the AGF and start with it. @@ -279,7 +285,8 @@ next_extent: xfs_extent_busy_clear(mp, &extents->extent_list, false); out_del_cursor: xfs_btree_del_cursor(cur, error); - xfs_buf_relse(agbp); +out_trans_cancel: + xfs_trans_cancel(tp); return error; } -- cgit v1.2.3 From 94a69db2367efcd7e0eeb5d4603340aff1d3c340 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 16 Jan 2024 09:59:45 +1100 Subject: xfs: use __GFP_NOLOCKDEP instead of GFP_NOFS In the past we've had problems with lockdep false positives stemming from inode locking occurring in memory reclaim contexts (e.g. from superblock shrinkers). Lockdep doesn't know that inodes accessed from above memory reclaim cannot be accessed from below memory reclaim (and vice versa) but there has never been a good solution to solving this problem with lockdep annotations. This situation isn't unique to inode locks - buffers are also locked above and below memory reclaim, and we have to maintain lock ordering for them - and against inodes - appropriately. IOWs, the same code paths and locks are taken both above and below memory reclaim and so we always need to make sure the lock orders are consistent.
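To make that concrete, consider this editor's sketch of the allocation pattern lockdep objects to (a hypothetical call site; ip, size and ptr are illustrative and not taken from the series):

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	ptr = kmalloc(size, GFP_KERNEL);	/* may enter direct reclaim */
	/*
	 * Direct reclaim can evict other XFS inodes, and inode reclaim
	 * takes the ILOCK of the inode being reclaimed. Lockdep cannot
	 * tell the reclaimed inode's ILOCK apart from the ILOCK held
	 * here, so it reports a possible recursive deadlock even though
	 * an actively referenced inode can never be reclaimed below us.
	 */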
We are spared the lockdep problems this might cause by the fact that semaphores and bit locks aren't covered by lockdep. In general, this sort of lockdep false positive detection is caused by code that runs GFP_KERNEL memory allocation with an actively referenced inode locked. When it is run from a transaction, memory allocation is automatically GFP_NOFS, so we don't have reclaim recursion issues. So in the places where we do memory allocation with inodes locked outside of a transaction, we have explicitly set them to use GFP_NOFS allocations to prevent lockdep false positives from being reported if the allocation dips into direct memory reclaim. More recently, __GFP_NOLOCKDEP was added to the memory allocation flags to tell lockdep not to track that particular allocation for the purposes of reclaim recursion detection. This is a much better way of preventing false positives - it allows us to use GFP_KERNEL context outside of transactions, and allows direct memory reclaim to proceed normally without throwing out false positive deadlock warnings. The obvious places that lock inodes and do memory allocation are the lookup paths and inode extent list initialisation. These occur in non-transactional GFP_KERNEL contexts, and so can run direct reclaim and lock inodes. This patch makes a first pass through all the explicit GFP_NOFS allocations in XFS and converts the obvious ones to GFP_KERNEL | __GFP_NOLOCKDEP as a first step towards removing explicit GFP_NOFS allocations from the XFS code. Signed-off-by: Dave Chinner Reviewed-by: "Darrick J. Wong" Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_ag.c | 2 +- fs/xfs/libxfs/xfs_btree.h | 4 +++- fs/xfs/libxfs/xfs_da_btree.c | 8 +++++--- fs/xfs/libxfs/xfs_dir2.c | 14 ++++---------- fs/xfs/libxfs/xfs_iext_tree.c | 22 +++++++++++++--------- fs/xfs/libxfs/xfs_inode_fork.c | 8 +++++--- fs/xfs/xfs_icache.c | 5 ++--- fs/xfs/xfs_qm.c | 6 +++--- 8 files changed, 36 insertions(+), 33 deletions(-) diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index 937ea48d5cc0..036f4ee43fd3 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -389,7 +389,7 @@ xfs_initialize_perag( pag->pag_agno = index; pag->pag_mount = mp; - error = radix_tree_preload(GFP_NOFS); + error = radix_tree_preload(GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (error) goto out_free_pag; diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index d906324e25c8..75a0e2c8e115 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -725,7 +725,9 @@ xfs_btree_alloc_cursor( { struct xfs_btree_cur *cur; - cur = kmem_cache_zalloc(cache, GFP_NOFS | __GFP_NOFAIL); + /* BMBT allocations can come through from non-transactional context. 
*/ + cur = kmem_cache_zalloc(cache, + GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL); cur->bc_tp = tp; cur->bc_mp = mp; cur->bc_btnum = btnum; diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index 3383b4525381..444ec1560f43 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -85,7 +85,8 @@ xfs_da_state_alloc( { struct xfs_da_state *state; - state = kmem_cache_zalloc(xfs_da_state_cache, GFP_NOFS | __GFP_NOFAIL); + state = kmem_cache_zalloc(xfs_da_state_cache, + GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL); state->args = args; state->mp = args->dp->i_mount; return state; @@ -2519,7 +2520,8 @@ xfs_dabuf_map( int error = 0, nirecs, i; if (nfsb > 1) - irecs = kzalloc(sizeof(irec) * nfsb, GFP_NOFS | __GFP_NOFAIL); + irecs = kzalloc(sizeof(irec) * nfsb, + GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL); nirecs = nfsb; error = xfs_bmapi_read(dp, bno, nfsb, irecs, &nirecs, @@ -2533,7 +2535,7 @@ xfs_dabuf_map( */ if (nirecs > 1) { map = kzalloc(nirecs * sizeof(struct xfs_buf_map), - GFP_NOFS | __GFP_NOFAIL); + GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL); if (!map) { error = -ENOMEM; goto out_free_irecs; diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index e60aa8f8d0a7..728f72f0d078 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -333,7 +333,8 @@ xfs_dir_cilookup_result( !(args->op_flags & XFS_DA_OP_CILOOKUP)) return -EEXIST; - args->value = kmalloc(len, GFP_NOFS | __GFP_RETRY_MAYFAIL); + args->value = kmalloc(len, + GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_RETRY_MAYFAIL); if (!args->value) return -ENOMEM; @@ -364,15 +365,8 @@ xfs_dir_lookup( ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); XFS_STATS_INC(dp->i_mount, xs_dir_lookup); - /* - * We need to use KM_NOFS here so that lockdep will not throw false - * positive deadlock warnings on a non-transactional lookup path. It is - * safe to recurse into inode recalim in that case, but lockdep can't - * easily be taught about it. Hence KM_NOFS avoids having to add more - * lockdep Doing this avoids having to add a bunch of lockdep class - * annotations into the reclaim path for the ilock. 
- */ - args = kzalloc(sizeof(*args), GFP_NOFS | __GFP_NOFAIL); + args = kzalloc(sizeof(*args), + GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL); args->geo = dp->i_mount->m_dir_geo; args->name = name->name; args->namelen = name->len; diff --git a/fs/xfs/libxfs/xfs_iext_tree.c b/fs/xfs/libxfs/xfs_iext_tree.c index 16f18b08fe4c..8796f2b3e534 100644 --- a/fs/xfs/libxfs/xfs_iext_tree.c +++ b/fs/xfs/libxfs/xfs_iext_tree.c @@ -394,12 +394,18 @@ xfs_iext_leaf_key( return leaf->recs[n].lo & XFS_IEXT_STARTOFF_MASK; } +static inline void * +xfs_iext_alloc_node( + int size) +{ + return kzalloc(size, GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL); +} + static void xfs_iext_grow( struct xfs_ifork *ifp) { - struct xfs_iext_node *node = kzalloc(NODE_SIZE, - GFP_NOFS | __GFP_NOFAIL); + struct xfs_iext_node *node = xfs_iext_alloc_node(NODE_SIZE); int i; if (ifp->if_height == 1) { @@ -455,8 +461,7 @@ xfs_iext_split_node( int *nr_entries) { struct xfs_iext_node *node = *nodep; - struct xfs_iext_node *new = kzalloc(NODE_SIZE, - GFP_NOFS | __GFP_NOFAIL); + struct xfs_iext_node *new = xfs_iext_alloc_node(NODE_SIZE); const int nr_move = KEYS_PER_NODE / 2; int nr_keep = nr_move + (KEYS_PER_NODE & 1); int i = 0; @@ -544,8 +549,7 @@ xfs_iext_split_leaf( int *nr_entries) { struct xfs_iext_leaf *leaf = cur->leaf; - struct xfs_iext_leaf *new = kzalloc(NODE_SIZE, - GFP_NOFS | __GFP_NOFAIL); + struct xfs_iext_leaf *new = xfs_iext_alloc_node(NODE_SIZE); const int nr_move = RECS_PER_LEAF / 2; int nr_keep = nr_move + (RECS_PER_LEAF & 1); int i; @@ -586,8 +590,7 @@ xfs_iext_alloc_root( { ASSERT(ifp->if_bytes == 0); - ifp->if_data = kzalloc(sizeof(struct xfs_iext_rec), - GFP_NOFS | __GFP_NOFAIL); + ifp->if_data = xfs_iext_alloc_node(sizeof(struct xfs_iext_rec)); ifp->if_height = 1; /* now that we have a node step into it */ @@ -607,7 +610,8 @@ xfs_iext_realloc_root( if (new_size / sizeof(struct xfs_iext_rec) == RECS_PER_LEAF) new_size = NODE_SIZE; - new = krealloc(ifp->if_data, new_size, GFP_NOFS | __GFP_NOFAIL); + new = krealloc(ifp->if_data, new_size, + GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL); memset(new + ifp->if_bytes, 0, new_size - ifp->if_bytes); ifp->if_data = new; cur->leaf = new; diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index f6d5b86b608d..709fda3d742f 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -50,7 +50,8 @@ xfs_init_local_fork( mem_size++; if (size) { - char *new_data = kmalloc(mem_size, GFP_NOFS | __GFP_NOFAIL); + char *new_data = kmalloc(mem_size, + GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL); memcpy(new_data, data, size); if (zero_terminate) @@ -205,7 +206,8 @@ xfs_iformat_btree( } ifp->if_broot_bytes = size; - ifp->if_broot = kmalloc(size, GFP_NOFS | __GFP_NOFAIL); + ifp->if_broot = kmalloc(size, + GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL); ASSERT(ifp->if_broot != NULL); /* * Copy and convert from the on-disk structure @@ -690,7 +692,7 @@ xfs_ifork_init_cow( return; ip->i_cowfp = kmem_cache_zalloc(xfs_ifork_cache, - GFP_NOFS | __GFP_NOFAIL); + GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL); ip->i_cowfp->if_format = XFS_DINODE_FMT_EXTENTS; } diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index dba514a2c84d..06046827b5fe 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -659,10 +659,9 @@ xfs_iget_cache_miss( /* * Preload the radix tree so we can insert safely under the * write spinlock. Note that we cannot sleep inside the preload - * region. 
Since we can be called from transaction context, don't - * recurse into the file system. + * region. */ - if (radix_tree_preload(GFP_NOFS)) { + if (radix_tree_preload(GFP_KERNEL | __GFP_NOLOCKDEP)) { error = -EAGAIN; goto out_destroy; } diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 6ad558ae853a..5bcd67cae16a 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -643,9 +643,9 @@ xfs_qm_init_quotainfo( if (error) goto out_free_lru; - INIT_RADIX_TREE(&qinf->qi_uquota_tree, GFP_NOFS); - INIT_RADIX_TREE(&qinf->qi_gquota_tree, GFP_NOFS); - INIT_RADIX_TREE(&qinf->qi_pquota_tree, GFP_NOFS); + INIT_RADIX_TREE(&qinf->qi_uquota_tree, GFP_KERNEL); + INIT_RADIX_TREE(&qinf->qi_gquota_tree, GFP_KERNEL); + INIT_RADIX_TREE(&qinf->qi_pquota_tree, GFP_KERNEL); mutex_init(&qinf->qi_tree_lock); /* mutex used to serialize quotaoffs */ -- cgit v1.2.3 From 0b3a76e955ebe3d71a2bcd5990404ed522b40e17 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 16 Jan 2024 09:59:46 +1100 Subject: xfs: use GFP_KERNEL in pure transaction contexts When running in a transaction context, memory allocations are scoped to GFP_NOFS. Hence we don't need to use GFP_NOFS contexts in pure transaction context allocations - GFP_KERNEL will automatically get converted to GFP_NOFS as appropriate. Go through the code and convert all the obvious GFP_NOFS allocations in transaction context to use GFP_KERNEL. This further reduces the explicit use of GFP_NOFS in XFS. Signed-off-by: Dave Chinner Reviewed-by: "Darrick J. Wong" Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_attr.c | 3 ++- fs/xfs/libxfs/xfs_bmap.c | 2 +- fs/xfs/libxfs/xfs_defer.c | 6 +++--- fs/xfs/libxfs/xfs_dir2.c | 8 ++++---- fs/xfs/libxfs/xfs_inode_fork.c | 8 ++++---- fs/xfs/libxfs/xfs_refcount.c | 2 +- fs/xfs/libxfs/xfs_rmap.c | 2 +- fs/xfs/xfs_attr_item.c | 4 ++-- fs/xfs/xfs_bmap_util.c | 2 +- fs/xfs/xfs_buf.c | 28 +++++++++++++++++----------- fs/xfs/xfs_log.c | 3 ++- fs/xfs/xfs_mru_cache.c | 2 +- 12 files changed, 39 insertions(+), 31 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index e965a48e7db9..82ab559f1da1 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -891,7 +891,8 @@ xfs_attr_defer_add( struct xfs_attr_intent *new; - new = kmem_cache_zalloc(xfs_attr_intent_cache, GFP_NOFS | __GFP_NOFAIL); + new = kmem_cache_zalloc(xfs_attr_intent_cache, + GFP_KERNEL | __GFP_NOFAIL); new->xattri_op_flags = op_flags; new->xattri_da_args = args; diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index f362345467fa..b525524a2da4 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -6098,7 +6098,7 @@ __xfs_bmap_add( bmap->br_blockcount, bmap->br_state); - bi = kmem_cache_alloc(xfs_bmap_intent_cache, GFP_NOFS | __GFP_NOFAIL); + bi = kmem_cache_alloc(xfs_bmap_intent_cache, GFP_KERNEL | __GFP_NOFAIL); INIT_LIST_HEAD(&bi->bi_list); bi->bi_type = type; bi->bi_owner = ip; diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index 75689c151a54..8ae4401f6810 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -825,7 +825,7 @@ xfs_defer_alloc( struct xfs_defer_pending *dfp; dfp = kmem_cache_zalloc(xfs_defer_pending_cache, - GFP_NOFS | __GFP_NOFAIL); + GFP_KERNEL | __GFP_NOFAIL); dfp->dfp_ops = ops; INIT_LIST_HEAD(&dfp->dfp_work); list_add_tail(&dfp->dfp_list, &tp->t_dfops); @@ -888,7 +888,7 @@ xfs_defer_start_recovery( struct xfs_defer_pending *dfp; dfp = kmem_cache_zalloc(xfs_defer_pending_cache, - GFP_NOFS | __GFP_NOFAIL); + GFP_KERNEL | __GFP_NOFAIL); 
dfp->dfp_ops = ops; dfp->dfp_intent = lip; INIT_LIST_HEAD(&dfp->dfp_work); @@ -979,7 +979,7 @@ xfs_defer_ops_capture( return ERR_PTR(error); /* Create an object to capture the defer ops. */ - dfc = kzalloc(sizeof(*dfc), GFP_NOFS | __GFP_NOFAIL); + dfc = kzalloc(sizeof(*dfc), GFP_KERNEL | __GFP_NOFAIL); INIT_LIST_HEAD(&dfc->dfc_list); INIT_LIST_HEAD(&dfc->dfc_dfops); diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index 728f72f0d078..8c9403b33191 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -236,7 +236,7 @@ xfs_dir_init( if (error) return error; - args = kzalloc(sizeof(*args), GFP_NOFS | __GFP_NOFAIL); + args = kzalloc(sizeof(*args), GFP_KERNEL | __GFP_NOFAIL); if (!args) return -ENOMEM; @@ -273,7 +273,7 @@ xfs_dir_createname( XFS_STATS_INC(dp->i_mount, xs_dir_create); } - args = kzalloc(sizeof(*args), GFP_NOFS | __GFP_NOFAIL); + args = kzalloc(sizeof(*args), GFP_KERNEL | __GFP_NOFAIL); if (!args) return -ENOMEM; @@ -435,7 +435,7 @@ xfs_dir_removename( ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); XFS_STATS_INC(dp->i_mount, xs_dir_remove); - args = kzalloc(sizeof(*args), GFP_NOFS | __GFP_NOFAIL); + args = kzalloc(sizeof(*args), GFP_KERNEL | __GFP_NOFAIL); if (!args) return -ENOMEM; @@ -496,7 +496,7 @@ xfs_dir_replace( if (rval) return rval; - args = kzalloc(sizeof(*args), GFP_NOFS | __GFP_NOFAIL); + args = kzalloc(sizeof(*args), GFP_KERNEL | __GFP_NOFAIL); if (!args) return -ENOMEM; diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 709fda3d742f..136d5d7b9de9 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -402,7 +402,7 @@ xfs_iroot_realloc( if (ifp->if_broot_bytes == 0) { new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, rec_diff); ifp->if_broot = kmalloc(new_size, - GFP_NOFS | __GFP_NOFAIL); + GFP_KERNEL | __GFP_NOFAIL); ifp->if_broot_bytes = (int)new_size; return; } @@ -417,7 +417,7 @@ xfs_iroot_realloc( new_max = cur_max + rec_diff; new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max); ifp->if_broot = krealloc(ifp->if_broot, new_size, - GFP_NOFS | __GFP_NOFAIL); + GFP_KERNEL | __GFP_NOFAIL); op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, ifp->if_broot_bytes); np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, @@ -443,7 +443,7 @@ xfs_iroot_realloc( else new_size = 0; if (new_size > 0) { - new_broot = kmalloc(new_size, GFP_NOFS | __GFP_NOFAIL); + new_broot = kmalloc(new_size, GFP_KERNEL | __GFP_NOFAIL); /* * First copy over the btree block header. 
*/ @@ -512,7 +512,7 @@ xfs_idata_realloc( if (byte_diff) { ifp->if_data = krealloc(ifp->if_data, new_size, - GFP_NOFS | __GFP_NOFAIL); + GFP_KERNEL | __GFP_NOFAIL); if (new_size == 0) ifp->if_data = NULL; ifp->if_bytes = new_size; diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 6709a7f8bad5..7df52daa22cf 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -1449,7 +1449,7 @@ __xfs_refcount_add( blockcount); ri = kmem_cache_alloc(xfs_refcount_intent_cache, - GFP_NOFS | __GFP_NOFAIL); + GFP_KERNEL | __GFP_NOFAIL); INIT_LIST_HEAD(&ri->ri_list); ri->ri_type = type; ri->ri_startblock = startblock; diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index 76bf7f48cb5a..0bd1f47b2c2b 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -2559,7 +2559,7 @@ __xfs_rmap_add( bmap->br_blockcount, bmap->br_state); - ri = kmem_cache_alloc(xfs_rmap_intent_cache, GFP_NOFS | __GFP_NOFAIL); + ri = kmem_cache_alloc(xfs_rmap_intent_cache, GFP_KERNEL | __GFP_NOFAIL); INIT_LIST_HEAD(&ri->ri_list); ri->ri_type = type; ri->ri_owner = owner; diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index 2a142cefdc3d..0bf25a2ba3b6 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -226,7 +226,7 @@ xfs_attri_init( { struct xfs_attri_log_item *attrip; - attrip = kmem_cache_zalloc(xfs_attri_cache, GFP_NOFS | __GFP_NOFAIL); + attrip = kmem_cache_zalloc(xfs_attri_cache, GFP_KERNEL | __GFP_NOFAIL); /* * Grab an extra reference to the name/value buffer for this log item. @@ -666,7 +666,7 @@ xfs_attr_create_done( attrip = ATTRI_ITEM(intent); - attrdp = kmem_cache_zalloc(xfs_attrd_cache, GFP_NOFS | __GFP_NOFAIL); + attrdp = kmem_cache_zalloc(xfs_attrd_cache, GFP_KERNEL | __GFP_NOFAIL); xfs_log_item_init(tp->t_mountp, &attrdp->attrd_item, XFS_LI_ATTRD, &xfs_attrd_item_ops); diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index c2531c28905c..cb2a4b940292 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -66,7 +66,7 @@ xfs_zero_extent( return blkdev_issue_zeroout(target->bt_bdev, block << (mp->m_super->s_blocksize_bits - 9), count_fsb << (mp->m_super->s_blocksize_bits - 9), - GFP_NOFS, 0); + GFP_KERNEL, 0); } /* diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 10bbde3ce7cc..ab7546a9dfd7 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -190,7 +190,7 @@ xfs_buf_get_maps( } bp->b_maps = kzalloc(map_count * sizeof(struct xfs_buf_map), - GFP_NOFS | __GFP_NOFAIL); + GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL); if (!bp->b_maps) return -ENOMEM; return 0; @@ -222,7 +222,8 @@ _xfs_buf_alloc( int i; *bpp = NULL; - bp = kmem_cache_zalloc(xfs_buf_cache, GFP_NOFS | __GFP_NOFAIL); + bp = kmem_cache_zalloc(xfs_buf_cache, + GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL); /* * We don't want certain flags to appear in b_flags unless they are @@ -325,7 +326,7 @@ xfs_buf_alloc_kmem( struct xfs_buf *bp, xfs_buf_flags_t flags) { - gfp_t gfp_mask = GFP_NOFS | __GFP_NOFAIL; + gfp_t gfp_mask = GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL; size_t size = BBTOB(bp->b_length); /* Assure zeroed buffer for non-read cases. 
*/ @@ -356,13 +357,11 @@ xfs_buf_alloc_pages( struct xfs_buf *bp, xfs_buf_flags_t flags) { - gfp_t gfp_mask = __GFP_NOWARN; + gfp_t gfp_mask = GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOWARN; long filled = 0; if (flags & XBF_READ_AHEAD) gfp_mask |= __GFP_NORETRY; - else - gfp_mask |= GFP_NOFS; /* Make sure that we have a page list */ bp->b_page_count = DIV_ROUND_UP(BBTOB(bp->b_length), PAGE_SIZE); @@ -429,11 +428,18 @@ _xfs_buf_map_pages( /* * vm_map_ram() will allocate auxiliary structures (e.g. - * pagetables) with GFP_KERNEL, yet we are likely to be under - * GFP_NOFS context here. Hence we need to tell memory reclaim - * that we are in such a context via PF_MEMALLOC_NOFS to prevent - * memory reclaim re-entering the filesystem here and - * potentially deadlocking. + * pagetables) with GFP_KERNEL, yet we often under a scoped nofs + * context here. Mixing GFP_KERNEL with GFP_NOFS allocations + * from the same call site that can be run from both above and + * below memory reclaim causes lockdep false positives. Hence we + * always need to force this allocation to nofs context because + * we can't pass __GFP_NOLOCKDEP down to auxillary structures to + * prevent false positive lockdep reports. + * + * XXX(dgc): I think dquot reclaim is the only place we can get + * to this function from memory reclaim context now. If we fix + * that like we've fixed inode reclaim to avoid writeback from + * reclaim, this nofs wrapping can go away. */ nofs_flag = memalloc_nofs_save(); do { diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index ee39639bb92b..1f68569e62ca 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -3518,7 +3518,8 @@ xlog_ticket_alloc( struct xlog_ticket *tic; int unit_res; - tic = kmem_cache_zalloc(xfs_log_ticket_cache, GFP_NOFS | __GFP_NOFAIL); + tic = kmem_cache_zalloc(xfs_log_ticket_cache, + GFP_KERNEL | __GFP_NOFAIL); unit_res = xlog_calc_unit_res(log, unit_bytes, &tic->t_iclog_hdrs); diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c index ce496704748d..7443debaffd6 100644 --- a/fs/xfs/xfs_mru_cache.c +++ b/fs/xfs/xfs_mru_cache.c @@ -428,7 +428,7 @@ xfs_mru_cache_insert( if (!mru || !mru->lists) return -EINVAL; - if (radix_tree_preload(GFP_NOFS)) + if (radix_tree_preload(GFP_KERNEL)) return -ENOMEM; INIT_LIST_HEAD(&elem->list_node); -- cgit v1.2.3 From 2c1e31ed5c88fe5dd9d8b6398cb1ec9cfe0f4d9a Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 16 Jan 2024 09:59:47 +1100 Subject: xfs: place intent recovery under NOFS allocation context When recovery starts processing intents, all of the initial intent allocations are done outside of transaction contexts. That means they need to specifically use GFP_NOFS as we do not want memory reclaim to attempt to run direct reclaim of filesystem objects while we have lots of objects added into deferred operations. Rather than use GFP_NOFS for these specific allocations, just place the entire intent recovery process under NOFS context and we can then just use GFP_KERNEL for these allocations. Signed-off-by: Dave Chinner Reviewed-by: "Darrick J. 
Wong" Signed-off-by: Chandan Babu R --- fs/xfs/xfs_attr_item.c | 2 +- fs/xfs/xfs_bmap_item.c | 3 ++- fs/xfs/xfs_log_recover.c | 18 ++++++++++++++---- fs/xfs/xfs_refcount_item.c | 2 +- fs/xfs/xfs_rmap_item.c | 2 +- 5 files changed, 19 insertions(+), 8 deletions(-) diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index 0bf25a2ba3b6..e14e229fc712 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -513,7 +513,7 @@ xfs_attri_recover_work( return ERR_PTR(error); attr = kzalloc(sizeof(struct xfs_attr_intent) + - sizeof(struct xfs_da_args), GFP_NOFS | __GFP_NOFAIL); + sizeof(struct xfs_da_args), GFP_KERNEL | __GFP_NOFAIL); args = (struct xfs_da_args *)(attr + 1); attr->xattri_da_args = args; diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index 029a6a8d0efd..e3c58090e976 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -445,7 +445,8 @@ xfs_bui_recover_work( if (error) return ERR_PTR(error); - bi = kmem_cache_zalloc(xfs_bmap_intent_cache, GFP_NOFS | __GFP_NOFAIL); + bi = kmem_cache_zalloc(xfs_bmap_intent_cache, + GFP_KERNEL | __GFP_NOFAIL); bi->bi_whichfork = (map->me_flags & XFS_BMAP_EXTENT_ATTR_FORK) ? XFS_ATTR_FORK : XFS_DATA_FORK; bi->bi_type = map->me_flags & XFS_BMAP_EXTENT_TYPE_MASK; diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index e9ed43a833af..8c1d260bb9e1 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -3443,12 +3443,19 @@ xlog_recover( * part of recovery so that the root and real-time bitmap inodes can be read in * from disk in between the two stages. This is necessary so that we can free * space in the real-time portion of the file system. + * + * We run this whole process under GFP_NOFS allocation context. We do a + * combination of non-transactional and transactional work, yet we really don't + * want to recurse into the filesystem from direct reclaim during any of this + * processing. This allows all the recovery code run here not to care about the + * memory allocation context it is running in. */ int xlog_recover_finish( struct xlog *log) { - int error; + unsigned int nofs_flags = memalloc_nofs_save(); + int error; error = xlog_recover_process_intents(log); if (error) { @@ -3462,7 +3469,7 @@ xlog_recover_finish( xlog_recover_cancel_intents(log); xfs_alert(log->l_mp, "Failed to recover intents"); xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); - return error; + goto out_error; } /* @@ -3483,7 +3490,7 @@ xlog_recover_finish( if (error < 0) { xfs_alert(log->l_mp, "Failed to clear log incompat features on recovery"); - return error; + goto out_error; } } @@ -3508,9 +3515,12 @@ xlog_recover_finish( * and AIL. 
*/ xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); + goto out_error; } - return 0; +out_error: + memalloc_nofs_restore(nofs_flags); + return error; } void diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index d850b9685f7f..14919b33e4fe 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -425,7 +425,7 @@ xfs_cui_recover_work( struct xfs_refcount_intent *ri; ri = kmem_cache_alloc(xfs_refcount_intent_cache, - GFP_NOFS | __GFP_NOFAIL); + GFP_KERNEL | __GFP_NOFAIL); ri->ri_type = pmap->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK; ri->ri_startblock = pmap->pe_startblock; ri->ri_blockcount = pmap->pe_len; diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index a40b92ac81e8..e473124e29cc 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -455,7 +455,7 @@ xfs_rui_recover_work( { struct xfs_rmap_intent *ri; - ri = kmem_cache_alloc(xfs_rmap_intent_cache, GFP_NOFS | __GFP_NOFAIL); + ri = kmem_cache_alloc(xfs_rmap_intent_cache, GFP_KERNEL | __GFP_NOFAIL); switch (map->me_flags & XFS_RMAP_EXTENT_TYPE_MASK) { case XFS_RMAP_EXTENT_MAP: -- cgit v1.2.3 From c704ecb2410e3496314136fa3eb748177936d867 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 16 Jan 2024 09:59:48 +1100 Subject: xfs: place the CIL under nofs allocation context This is core code that needs to run in low memory conditions and can be triggered from memory reclaim. While it runs in a workqueue, it really shouldn't be recursing back into the filesystem during any memory allocation it needs to function. Signed-off-by: Dave Chinner Reviewed-by: "Darrick J. Wong" Signed-off-by: Chandan Babu R --- fs/xfs/xfs_log_cil.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 815a2181004c..8c3b09777006 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -100,7 +100,7 @@ xlog_cil_ctx_alloc(void) { struct xfs_cil_ctx *ctx; - ctx = kzalloc(sizeof(*ctx), GFP_NOFS | __GFP_NOFAIL); + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL | __GFP_NOFAIL); INIT_LIST_HEAD(&ctx->committing); INIT_LIST_HEAD(&ctx->busy_extents.extent_list); INIT_LIST_HEAD(&ctx->log_items); @@ -1116,11 +1116,18 @@ xlog_cil_cleanup_whiteouts( * same sequence twice. If we get a race between multiple pushes for the same * sequence they will block on the first one and then abort, hence avoiding * needless pushes. + * + * This runs from a workqueue so it does not inherent any specific memory + * allocation context. However, we do not want to block on memory reclaim + * recursing back into the filesystem because this push may have been triggered + * by memory reclaim itself. Hence we really need to run under full GFP_NOFS + * contraints here. 
*/ static void xlog_cil_push_work( struct work_struct *work) { + unsigned int nofs_flags = memalloc_nofs_save(); struct xfs_cil_ctx *ctx = container_of(work, struct xfs_cil_ctx, push_work); struct xfs_cil *cil = ctx->cil; @@ -1334,12 +1341,14 @@ xlog_cil_push_work( spin_unlock(&log->l_icloglock); xlog_cil_cleanup_whiteouts(&whiteouts); xfs_log_ticket_ungrant(log, ticket); + memalloc_nofs_restore(nofs_flags); return; out_skip: up_write(&cil->xc_ctx_lock); xfs_log_ticket_put(new_ctx->ticket); kfree(new_ctx); + memalloc_nofs_restore(nofs_flags); return; out_abort_free_ticket: @@ -1348,6 +1357,7 @@ out_abort_free_ticket: if (!ctx->commit_iclog) { xfs_log_ticket_ungrant(log, ctx->ticket); xlog_cil_committed(ctx); + memalloc_nofs_restore(nofs_flags); return; } spin_lock(&log->l_icloglock); @@ -1356,6 +1366,7 @@ out_abort_free_ticket: /* Not safe to reference ctx now! */ spin_unlock(&log->l_icloglock); xfs_log_ticket_ungrant(log, ticket); + memalloc_nofs_restore(nofs_flags); } /* -- cgit v1.2.3 From 204fae32d5f7b9ac673d3d4f636dcef8697db2f0 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 16 Jan 2024 09:59:49 +1100 Subject: xfs: clean up remaining GFP_NOFS users These few remaining GFP_NOFS callers do not need to use GFP_NOFS at all. They are only called from a non-transactional context or cannot be accessed from memory reclaim due to other constraints. Hence they can just use GFP_KERNEL. Signed-off-by: Dave Chinner Reviewed-by: "Darrick J. Wong" Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_btree_staging.c | 4 ++-- fs/xfs/xfs_attr_list.c | 2 +- fs/xfs/xfs_buf.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/xfs/libxfs/xfs_btree_staging.c b/fs/xfs/libxfs/xfs_btree_staging.c index 961f6b898f4b..f0c69f9bb169 100644 --- a/fs/xfs/libxfs/xfs_btree_staging.c +++ b/fs/xfs/libxfs/xfs_btree_staging.c @@ -139,7 +139,7 @@ xfs_btree_stage_afakeroot( ASSERT(!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)); ASSERT(cur->bc_tp == NULL); - nops = kmalloc(sizeof(struct xfs_btree_ops), GFP_NOFS | __GFP_NOFAIL); + nops = kmalloc(sizeof(struct xfs_btree_ops), GFP_KERNEL | __GFP_NOFAIL); memcpy(nops, cur->bc_ops, sizeof(struct xfs_btree_ops)); nops->alloc_block = xfs_btree_fakeroot_alloc_block; nops->free_block = xfs_btree_fakeroot_free_block; @@ -220,7 +220,7 @@ xfs_btree_stage_ifakeroot( ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE); ASSERT(cur->bc_tp == NULL); - nops = kmalloc(sizeof(struct xfs_btree_ops), GFP_NOFS | __GFP_NOFAIL); + nops = kmalloc(sizeof(struct xfs_btree_ops), GFP_KERNEL | __GFP_NOFAIL); memcpy(nops, cur->bc_ops, sizeof(struct xfs_btree_ops)); nops->alloc_block = xfs_btree_fakeroot_alloc_block; nops->free_block = xfs_btree_fakeroot_free_block; diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c index 0318d768520a..47453510c0ab 100644 --- a/fs/xfs/xfs_attr_list.c +++ b/fs/xfs/xfs_attr_list.c @@ -109,7 +109,7 @@ xfs_attr_shortform_list( * It didn't all fit, so we have to sort everything on hashval. 
*/ sbsize = sf->count * sizeof(*sbuf); - sbp = sbuf = kmalloc(sbsize, GFP_NOFS | __GFP_NOFAIL); + sbp = sbuf = kmalloc(sbsize, GFP_KERNEL | __GFP_NOFAIL); /* * Scan the attribute list for the rest of the entries, storing diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index ab7546a9dfd7..fb80cae0b2e5 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -2008,7 +2008,7 @@ xfs_alloc_buftarg( #if defined(CONFIG_FS_DAX) && defined(CONFIG_MEMORY_FAILURE) ops = &xfs_dax_holder_operations; #endif - btp = kzalloc(sizeof(*btp), GFP_NOFS | __GFP_NOFAIL); + btp = kzalloc(sizeof(*btp), GFP_KERNEL | __GFP_NOFAIL); btp->bt_mount = mp; btp->bt_bdev_handle = bdev_handle; -- cgit v1.2.3 From 57b98393b812ddaf9cf33a0d57d70b25cabfed66 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 16 Jan 2024 09:59:50 +1100 Subject: xfs: use xfs_defer_alloc a bit more Noticed by inspection, simple factoring allows the same allocation routine to be used for both transaction and recovery contexts. Signed-off-by: Dave Chinner Reviewed-by: "Darrick J. Wong" Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_defer.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index 8ae4401f6810..6ed3a5fda081 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -819,7 +819,7 @@ xfs_defer_can_append( /* Create a new pending item at the end of the transaction list. */ static inline struct xfs_defer_pending * xfs_defer_alloc( - struct xfs_trans *tp, + struct list_head *dfops, const struct xfs_defer_op_type *ops) { struct xfs_defer_pending *dfp; @@ -828,7 +828,7 @@ xfs_defer_alloc( GFP_KERNEL | __GFP_NOFAIL); dfp->dfp_ops = ops; INIT_LIST_HEAD(&dfp->dfp_work); - list_add_tail(&dfp->dfp_list, &tp->t_dfops); + list_add_tail(&dfp->dfp_list, dfops); return dfp; } @@ -846,7 +846,7 @@ xfs_defer_add( dfp = xfs_defer_find_last(tp, ops); if (!dfp || !xfs_defer_can_append(dfp, ops)) - dfp = xfs_defer_alloc(tp, ops); + dfp = xfs_defer_alloc(&tp->t_dfops, ops); xfs_defer_add_item(dfp, li); trace_xfs_defer_add_item(tp->t_mountp, dfp, li); @@ -870,7 +870,7 @@ xfs_defer_add_barrier( if (dfp) return; - xfs_defer_alloc(tp, &xfs_barrier_defer_type); + xfs_defer_alloc(&tp->t_dfops, &xfs_barrier_defer_type); trace_xfs_defer_add_item(tp->t_mountp, dfp, NULL); } @@ -885,14 +885,9 @@ xfs_defer_start_recovery( struct list_head *r_dfops, const struct xfs_defer_op_type *ops) { - struct xfs_defer_pending *dfp; + struct xfs_defer_pending *dfp = xfs_defer_alloc(r_dfops, ops); - dfp = kmem_cache_zalloc(xfs_defer_pending_cache, - GFP_KERNEL | __GFP_NOFAIL); - dfp->dfp_ops = ops; dfp->dfp_intent = lip; - INIT_LIST_HEAD(&dfp->dfp_work); - list_add_tail(&dfp->dfp_list, r_dfops); } /* -- cgit v1.2.3 From 1149314a16f7972743c66988b2ac19767009325e Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 31 Jan 2024 11:47:14 -0800 Subject: xfs: disable sparse inode chunk alignment check when there is no alignment While testing a 64k-blocksize filesystem, I noticed that xfs/709 fails to rebuild the inode btree with a bunch of "Corruption remains" messages. It turns out that when the inode chunk size is smaller than a single filesystem block, no block alignment constraints are necessary for inode chunk allocations, and sb_spino_align is zero. Hence we can skip the check. Fixes: dbfbf3bdf639 ("xfs: repair inode btrees") Signed-off-by: "Darrick J. 
Wong" Reviewed-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Chandan Babu R --- fs/xfs/scrub/ialloc_repair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/scrub/ialloc_repair.c b/fs/xfs/scrub/ialloc_repair.c index b3f7182dd2f5..e94f10800082 100644 --- a/fs/xfs/scrub/ialloc_repair.c +++ b/fs/xfs/scrub/ialloc_repair.c @@ -369,7 +369,7 @@ xrep_ibt_check_inode_ext( * On a sparse inode fs, this cluster could be part of a sparse chunk. * Sparse clusters must be aligned to sparse chunk alignment. */ - if (xfs_has_sparseinodes(mp) && + if (xfs_has_sparseinodes(mp) && mp->m_sb.sb_spino_align && (!IS_ALIGNED(agbno, mp->m_sb.sb_spino_align) || !IS_ALIGNED(agbno + len, mp->m_sb.sb_spino_align))) return -EFSCORRUPTED; -- cgit v1.2.3 From 0164defd0d8665d1579dd802457da5e22d35559f Mon Sep 17 00:00:00 2001 From: Shrikanth Hegde Date: Mon, 22 Jan 2024 23:19:02 +0530 Subject: xfs: remove duplicate ifdefs when a ifdef is used in the below manner, second one could be considered as duplicate. ifdef DEFINE_A ...code block... ifdef DEFINE_A ...code block... endif ...code block... endif In the xfs code two such patterns were seen. Hence removing these ifdefs. No functional change is intended here. It only aims to improve code readability. Reviewed-by: "Darrick J. Wong" Signed-off-by: Shrikanth Hegde Signed-off-by: Chandan Babu R --- fs/xfs/xfs_sysfs.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c index 17485666b672..d2391eec37fe 100644 --- a/fs/xfs/xfs_sysfs.c +++ b/fs/xfs/xfs_sysfs.c @@ -193,7 +193,6 @@ always_cow_show( } XFS_SYSFS_ATTR_RW(always_cow); -#ifdef DEBUG /* * Override how many threads the parallel work queue is allowed to create. * This has to be a debug-only global (instead of an errortag) because one of @@ -260,7 +259,6 @@ larp_show( return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.larp); } XFS_SYSFS_ATTR_RW(larp); -#endif /* DEBUG */ STATIC ssize_t bload_leaf_slack_store( @@ -319,10 +317,8 @@ static struct attribute *xfs_dbg_attrs[] = { ATTR_LIST(log_recovery_delay), ATTR_LIST(mount_delay), ATTR_LIST(always_cow), -#ifdef DEBUG ATTR_LIST(pwork_threads), ATTR_LIST(larp), -#endif ATTR_LIST(bload_leaf_slack), ATTR_LIST(bload_node_slack), NULL, -- cgit v1.2.3 From e4c3b72a6ea93ed9c1815c74312eee9305638852 Mon Sep 17 00:00:00 2001 From: Long Li Date: Wed, 17 Jan 2024 20:31:26 +0800 Subject: xfs: ensure submit buffers on LSN boundaries in error handlers While performing the IO fault injection test, I caught the following data corruption report: XFS (dm-0): Internal error ltbno + ltlen > bno at line 1957 of file fs/xfs/libxfs/xfs_alloc.c. Caller xfs_free_ag_extent+0x79c/0x1130 CPU: 3 PID: 33 Comm: kworker/3:0 Not tainted 6.5.0-rc7-next-20230825-00001-g7f8666926889 #214 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20190727_073836-buildvm-ppc64le-16.ppc.fedoraproject.org-3.fc31 04/01/2014 Workqueue: xfs-inodegc/dm-0 xfs_inodegc_worker Call Trace: dump_stack_lvl+0x50/0x70 xfs_corruption_error+0x134/0x150 xfs_free_ag_extent+0x7d3/0x1130 __xfs_free_extent+0x201/0x3c0 xfs_trans_free_extent+0x29b/0xa10 xfs_extent_free_finish_item+0x2a/0xb0 xfs_defer_finish_noroll+0x8d1/0x1b40 xfs_defer_finish+0x21/0x200 xfs_itruncate_extents_flags+0x1cb/0x650 xfs_free_eofblocks+0x18f/0x250 xfs_inactive+0x485/0x570 xfs_inodegc_worker+0x207/0x530 process_scheduled_works+0x24a/0xe10 worker_thread+0x5ac/0xc60 kthread+0x2cd/0x3c0 ret_from_fork+0x4a/0x80 ret_from_fork_asm+0x11/0x20 XFS (dm-0): Corruption detected. 
Unmount and run xfs_repair

After analyzing the disk image, it was found that the corruption was triggered by an extent being recorded in both the inode data fork and the AGF btree blocks. After a long time of reproduction and analysis, we found that the reason for the free space btree corruption was that the AGF btree was not recovered correctly. Consider the following situation: Checkpoint A and Checkpoint B are in the same record and share the same start LSN1, and buf items for the same object (an AGF btree block) are included in both Checkpoint A and Checkpoint B. If the buf item in Checkpoint A has been recovered and updates the metadata LSN permanently, then the buf item in Checkpoint B cannot be recovered, because log recovery skips items with a metadata LSN >= the current LSN of the recovery item. If there is still an inode item in Checkpoint B that records Extent X, then Extent X will be recorded in both the inode data fork and the AGF btree block after Checkpoint B is recovered. Such a transaction can be seen when allocating an extent for an inode bmap: it records both the addition of the extent to the inode extent list and the removal of the extent from the AGF.

|------------Record (LSN1)------------------|---Record (LSN2)---|
|-------Checkpoint A----------|----------Checkpoint B-----------|
|   Buf Item(Extent X)        | Buf Item / Inode item(Extent X) |
|   Extent X is freed         |  Extent X is allocated          |

After commit 12818d24db8a ("xfs: rework log recovery to submit buffers on LSN boundaries") was introduced, we submit buffers on LSN boundaries during log recovery. The above problem can be avoided under normal paths, but it's not guaranteed under abnormal paths. Consider the following process: if an error is encountered after recovering the buf item in Checkpoint A and before recovering the buf item in Checkpoint B, buffers that have been added to the buffer_list will still be submitted, which violates the rule that buffers are submitted on LSN boundaries. So the buf item in Checkpoint B cannot be recovered on the next mount, because the current LSN of the transaction equals the metadata LSN on disk. The detailed process of the problem is as follows.

First Mount:
xlog_do_recovery_pass
  error = xlog_recover_process
    xlog_recover_process_data
      xlog_recover_process_ophdr
        xlog_recovery_process_trans
          ...
          /* recover buf item in Checkpoint A */
          xlog_recover_buf_commit_pass2
            xlog_recover_do_reg_buffer
              /* add buffer of agf btree block to buffer_list */
              xfs_buf_delwri_queue(bp, buffer_list)
    ...
    ==> Encounter read IO error and return
  /* submit buffers regardless of error */
  if (!list_empty(&buffer_list))
    xfs_buf_delwri_submit(&buffer_list);

Second Mount:
xlog_do_recovery_pass
  error = xlog_recover_process
    xlog_recover_process_data
      xlog_recover_process_ophdr
        xlog_recovery_process_trans
          ...
          /* recover buf item in Checkpoint B */
          xlog_recover_buf_commit_pass2
            /* buffer of agf btree block wouldn't be added to
               buffer_list because its lsn equals current_lsn */
            if (XFS_LSN_CMP(lsn, current_lsn) >= 0)
              goto out_release

To make sure that buffers are submitted on LSN boundaries in the abnormal paths too, we need to check the error status before submitting the buffers that have been added from the last record processed. If an error status exists, the buffers in the buffer_list should not be written to disk. Canceling the buffers in the buffer_list directly isn't correct: unlike every other place where a write list is canceled, these buffers have been initialized by xfs_buf_item_init() during recovery and are held by their buf items, and the buf items will not be released in xfs_buf_delwri_cancel(), so it's not easy to solve. 
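To make the skip that strands Checkpoint B concrete, this is roughly the shape of the metadata LSN check in xlog_recover_buf_commit_pass2() referenced in the trace above (editor's sketch; the surrounding validation code is omitted):

	lsn = xlog_recover_get_buf_lsn(mp, bp, buf_f);
	if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
		trace_xfs_log_recover_buf_skip(log, buf_f);
		goto out_release;	/* Checkpoint B's copy is never replayed */
	}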
If the filesystem has been shut down, then delwri list submission will error out all buffers on the list via IO submission/completion and do all the correct cleanup automatically. So shutting down the filesystem prevents buffers in the buffer_list from being written to disk. Fixes: 50d5c8d8e938 ("xfs: check LSN ordering for v5 superblocks during recovery") Signed-off-by: Long Li Reviewed-by: "Darrick J. Wong" Signed-off-by: Chandan Babu R --- fs/xfs/xfs_log_recover.c | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 8c1d260bb9e1..cd134830a695 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -3204,11 +3204,28 @@ xlog_do_recovery_pass( kvfree(hbp); /* - * Submit buffers that have been added from the last record processed, - * regardless of error status. + * Submit buffers that have been dirtied by the last record recovered. */ - if (!list_empty(&buffer_list)) + if (!list_empty(&buffer_list)) { + if (error) { + /* + * If there has been an item recovery error then we + * cannot allow partial checkpoint writeback to + * occur. We might have multiple checkpoints with the + * same start LSN in this buffer list, and partial + * writeback of a checkpoint in this situation can + * prevent future recovery of all the changes in the + * checkpoints at this start LSN. + * + * Note: Shutting down the filesystem will result in the + * delwri submission marking all the buffers stale, + * completing them and cleaning up _XBF_LOGRECOVERY + * state without doing any IO. + */ + xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); + } error2 = xfs_buf_delwri_submit(&buffer_list); + } if (error && first_bad) *first_bad = rhead_blk; -- cgit v1.2.3 From f70405afc99b1e5a3a1e60b6c05456fde2dbe622 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 19 Feb 2024 15:41:11 +0000 Subject: locking: Add rwsem_assert_held() and rwsem_assert_held_write() Modelled after lockdep_assert_held() and lockdep_assert_held_write(), but are always active, even when lockdep is disabled. Of course, they don't test that _this_ thread is the owner, but it's sufficient to catch many bugs and doesn't incur the same performance penalty as lockdep. Acked-by: "Peter Zijlstra (Intel)" Acked-by: Waiman Long Acked-by: "Darrick J. 
Wong" Reviewed-by: Dave Chinner Signed-off-by: "Matthew Wilcox (Oracle)" Signed-off-by: Chandan Babu R --- include/linux/rwbase_rt.h | 9 +++++++-- include/linux/rwsem.h | 46 +++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 48 insertions(+), 7 deletions(-) diff --git a/include/linux/rwbase_rt.h b/include/linux/rwbase_rt.h index 1d264dd08625..29c4e4f243e4 100644 --- a/include/linux/rwbase_rt.h +++ b/include/linux/rwbase_rt.h @@ -26,12 +26,17 @@ struct rwbase_rt { } while (0) -static __always_inline bool rw_base_is_locked(struct rwbase_rt *rwb) +static __always_inline bool rw_base_is_locked(const struct rwbase_rt *rwb) { return atomic_read(&rwb->readers) != READER_BIAS; } -static __always_inline bool rw_base_is_contended(struct rwbase_rt *rwb) +static inline void rw_base_assert_held_write(const struct rwbase_rt *rwb) +{ + WARN_ON(atomic_read(&rwb->readers) != WRITER_BIAS); +} + +static __always_inline bool rw_base_is_contended(const struct rwbase_rt *rwb) { return atomic_read(&rwb->readers) > 0; } diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index 9c29689ff505..4f1c18992f76 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -66,14 +66,24 @@ struct rw_semaphore { #endif }; -/* In all implementations count != 0 means locked */ +#define RWSEM_UNLOCKED_VALUE 0UL +#define RWSEM_WRITER_LOCKED (1UL << 0) +#define __RWSEM_COUNT_INIT(name) .count = ATOMIC_LONG_INIT(RWSEM_UNLOCKED_VALUE) + static inline int rwsem_is_locked(struct rw_semaphore *sem) { - return atomic_long_read(&sem->count) != 0; + return atomic_long_read(&sem->count) != RWSEM_UNLOCKED_VALUE; } -#define RWSEM_UNLOCKED_VALUE 0L -#define __RWSEM_COUNT_INIT(name) .count = ATOMIC_LONG_INIT(RWSEM_UNLOCKED_VALUE) +static inline void rwsem_assert_held_nolockdep(const struct rw_semaphore *sem) +{ + WARN_ON(atomic_long_read(&sem->count) == RWSEM_UNLOCKED_VALUE); +} + +static inline void rwsem_assert_held_write_nolockdep(const struct rw_semaphore *sem) +{ + WARN_ON(!(atomic_long_read(&sem->count) & RWSEM_WRITER_LOCKED)); +} /* Common initializer macros and functions */ @@ -152,11 +162,21 @@ do { \ __init_rwsem((sem), #sem, &__key); \ } while (0) -static __always_inline int rwsem_is_locked(struct rw_semaphore *sem) +static __always_inline int rwsem_is_locked(const struct rw_semaphore *sem) { return rw_base_is_locked(&sem->rwbase); } +static inline void rwsem_assert_held_nolockdep(const struct rw_semaphore *sem) +{ + WARN_ON(!rwsem_is_locked(sem)); +} + +static inline void rwsem_assert_held_write_nolockdep(const struct rw_semaphore *sem) +{ + rw_base_assert_held_write(sem); +} + static __always_inline int rwsem_is_contended(struct rw_semaphore *sem) { return rw_base_is_contended(&sem->rwbase); @@ -169,6 +189,22 @@ static __always_inline int rwsem_is_contended(struct rw_semaphore *sem) * the RT specific variant. 
*/ +static inline void rwsem_assert_held(const struct rw_semaphore *sem) +{ + if (IS_ENABLED(CONFIG_LOCKDEP)) + lockdep_assert_held(sem); + else + rwsem_assert_held_nolockdep(sem); +} + +static inline void rwsem_assert_held_write(const struct rw_semaphore *sem) +{ + if (IS_ENABLED(CONFIG_LOCKDEP)) + lockdep_assert_held_write(sem); + else + rwsem_assert_held_write_nolockdep(sem); +} + /* * lock for reading */ -- cgit v1.2.3 From 3fed24fffc76dd1a8105db558e98bc8355d60379 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 19 Feb 2024 15:41:12 +0000 Subject: xfs: Replace xfs_isilocked with xfs_assert_ilocked To use the new rwsem_assert_held()/rwsem_assert_held_write(), we can't use the existing ASSERT macro. Add a new xfs_assert_ilocked() and convert all the callers. Fix an apparent bug in xfs_isilocked(): If the caller specifies XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL, xfs_assert_ilocked() will check both the IOLOCK and the ILOCK are held for write. xfs_isilocked() only checked that the ILOCK was held for write. xfs_assert_ilocked() is always on, even if DEBUG or XFS_WARN aren't defined. It's a cheap check, so I don't think it's worth defining it away. Reviewed-by: "Darrick J. Wong" Reviewed-by: Dave Chinner Signed-off-by: "Matthew Wilcox (Oracle)" Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_attr.c | 2 +- fs/xfs/libxfs/xfs_attr_remote.c | 2 +- fs/xfs/libxfs/xfs_bmap.c | 21 ++++++------- fs/xfs/libxfs/xfs_defer.c | 2 +- fs/xfs/libxfs/xfs_inode_fork.c | 2 +- fs/xfs/libxfs/xfs_rtbitmap.c | 2 +- fs/xfs/libxfs/xfs_trans_inode.c | 6 ++-- fs/xfs/scrub/readdir.c | 4 +-- fs/xfs/xfs_attr_list.c | 2 +- fs/xfs/xfs_bmap_util.c | 10 +++--- fs/xfs/xfs_dir2_readdir.c | 2 +- fs/xfs/xfs_dquot.c | 4 +-- fs/xfs/xfs_file.c | 4 +-- fs/xfs/xfs_inode.c | 68 +++++++++++++---------------------------- fs/xfs/xfs_inode.h | 2 +- fs/xfs/xfs_inode_item.c | 4 +-- fs/xfs/xfs_iops.c | 3 +- fs/xfs/xfs_qm.c | 10 +++--- fs/xfs/xfs_reflink.c | 2 +- fs/xfs/xfs_rtalloc.c | 2 +- fs/xfs/xfs_symlink.c | 2 +- fs/xfs/xfs_trans.c | 2 +- fs/xfs/xfs_trans_dquot.c | 2 +- 23 files changed, 65 insertions(+), 95 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 82ab559f1da1..673a4b6d2e8d 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -224,7 +224,7 @@ int xfs_attr_get_ilocked( struct xfs_da_args *args) { - ASSERT(xfs_isilocked(args->dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); + xfs_assert_ilocked(args->dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL); if (!xfs_inode_hasattr(args->dp)) return -ENOATTR; diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c index d440393b40eb..1c007ebf153a 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.c +++ b/fs/xfs/libxfs/xfs_attr_remote.c @@ -545,7 +545,7 @@ xfs_attr_rmtval_stale( struct xfs_buf *bp; int error; - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); if (XFS_IS_CORRUPT(mp, map->br_startblock == DELAYSTARTBLOCK) || XFS_IS_CORRUPT(mp, map->br_startblock == HOLESTARTBLOCK)) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index b525524a2da4..68ab22a2a453 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -1189,7 +1189,7 @@ xfs_iread_extents( if (!xfs_need_iread_extents(ifp)) return 0; - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); ir.loaded = 0; xfs_iext_first(ifp, &ir.icur); @@ -3898,7 +3898,7 @@ xfs_bmapi_read( ASSERT(*nmap >= 1); ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK | XFS_BMAPI_ENTIRE))); - 
ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)); + xfs_assert_ilocked(ip, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL); if (WARN_ON_ONCE(!ifp)) return -EFSCORRUPTED; @@ -4369,7 +4369,7 @@ xfs_bmapi_write( ASSERT(tp != NULL); ASSERT(len > 0); ASSERT(ifp->if_format != XFS_DINODE_FMT_LOCAL); - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); ASSERT(!(flags & XFS_BMAPI_REMAP)); /* zeroing is for currently only for data extents, not metadata */ @@ -4666,7 +4666,7 @@ xfs_bmapi_remap( ifp = xfs_ifork_ptr(ip, whichfork); ASSERT(len > 0); ASSERT(len <= (xfs_filblks_t)XFS_MAX_BMBT_EXTLEN); - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK | XFS_BMAPI_PREALLOC | XFS_BMAPI_NORMAP))); ASSERT((flags & (XFS_BMAPI_ATTRFORK | XFS_BMAPI_PREALLOC)) != @@ -5291,7 +5291,7 @@ __xfs_bunmapi( if (xfs_is_shutdown(mp)) return -EIO; - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); ASSERT(len > 0); ASSERT(nexts >= 0); @@ -5635,8 +5635,7 @@ xfs_bmse_merge( blockcount = left->br_blockcount + got->br_blockcount; - ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); ASSERT(xfs_bmse_can_merge(left, got, shift)); new = *left; @@ -5764,7 +5763,7 @@ xfs_bmap_collapse_extents( if (xfs_is_shutdown(mp)) return -EIO; - ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL)); + xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); error = xfs_iread_extents(tp, ip, whichfork); if (error) @@ -5837,7 +5836,7 @@ xfs_bmap_can_insert_extents( int is_empty; int error = 0; - ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); + xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL); if (xfs_is_shutdown(ip->i_mount)) return -EIO; @@ -5879,7 +5878,7 @@ xfs_bmap_insert_extents( if (xfs_is_shutdown(mp)) return -EIO; - ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL)); + xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); error = xfs_iread_extents(tp, ip, whichfork); if (error) @@ -6257,7 +6256,7 @@ xfs_bunmapi_range( xfs_filblks_t unmap_len = endoff - startoff + 1; int error = 0; - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); while (unmap_len > 0) { ASSERT((*tpp)->t_highest_agno == NULLAGNUMBER); diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index 6ed3a5fda081..c13276095cc0 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -1006,7 +1006,7 @@ xfs_defer_ops_capture( * transaction. 
*/ for (i = 0; i < dfc->dfc_held.dr_inos; i++) { - ASSERT(xfs_isilocked(dfc->dfc_held.dr_ip[i], XFS_ILOCK_EXCL)); + xfs_assert_ilocked(dfc->dfc_held.dr_ip[i], XFS_ILOCK_EXCL); ihold(VFS_I(dfc->dfc_held.dr_ip[i])); } diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 136d5d7b9de9..60758f75182b 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -565,7 +565,7 @@ xfs_iextents_copy( struct xfs_bmbt_irec rec; int64_t copied = 0; - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED)); + xfs_assert_ilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED); ASSERT(ifp->if_bytes > 0); for_each_xfs_iext(ifp, &icur, &rec) { diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index e31663cb7b43..1f6e05fc359b 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -934,7 +934,7 @@ xfs_rtfree_extent( struct timespec64 atime; ASSERT(mp->m_rbmip->i_itemp != NULL); - ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL)); + xfs_assert_ilocked(mp->m_rbmip, XFS_ILOCK_EXCL); error = xfs_rtcheck_alloc_range(&args, start, len); if (error) diff --git a/fs/xfs/libxfs/xfs_trans_inode.c b/fs/xfs/libxfs/xfs_trans_inode.c index 70e97ea6eee7..69fc5b981352 100644 --- a/fs/xfs/libxfs/xfs_trans_inode.c +++ b/fs/xfs/libxfs/xfs_trans_inode.c @@ -31,7 +31,7 @@ xfs_trans_ijoin( { struct xfs_inode_log_item *iip; - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); if (ip->i_itemp == NULL) xfs_inode_item_init(ip, ip->i_mount); iip = ip->i_itemp; @@ -60,7 +60,7 @@ xfs_trans_ichgtime( struct timespec64 tv; ASSERT(tp); - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); tv = current_time(inode); @@ -90,7 +90,7 @@ xfs_trans_log_inode( struct inode *inode = VFS_I(ip); ASSERT(iip); - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); ASSERT(!xfs_iflags_test(ip, XFS_ISTALE)); tp->t_flags |= XFS_TRANS_DIRTY; diff --git a/fs/xfs/scrub/readdir.c b/fs/xfs/scrub/readdir.c index 16462332c897..dfdcb96b6c16 100644 --- a/fs/xfs/scrub/readdir.c +++ b/fs/xfs/scrub/readdir.c @@ -281,7 +281,7 @@ xchk_dir_walk( return -EIO; ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); - ASSERT(xfs_isilocked(dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); + xfs_assert_ilocked(dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL); if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) return xchk_dir_walk_sf(sc, dp, dirent_fn, priv); @@ -332,7 +332,7 @@ xchk_dir_lookup( return -EIO; ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); - ASSERT(xfs_isilocked(dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); + xfs_assert_ilocked(dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL); if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) { error = xfs_dir2_sf_lookup(&args); diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c index 47453510c0ab..56021403f447 100644 --- a/fs/xfs/xfs_attr_list.c +++ b/fs/xfs/xfs_attr_list.c @@ -504,7 +504,7 @@ xfs_attr_list_ilocked( { struct xfs_inode *dp = context->dp; - ASSERT(xfs_isilocked(dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); + xfs_assert_ilocked(dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL); /* * Decide on what work routines to call based on the inode size. diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index cb2a4b940292..2fe306fd71fc 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -508,8 +508,8 @@ xfs_can_free_eofblocks( * Caller must either hold the exclusive io lock; or be inactivating * the inode, which guarantees there are no other users of the inode. 
*/ - ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL) || - (VFS_I(ip)->i_state & I_FREEING)); + if (!(VFS_I(ip)->i_state & I_FREEING)) + xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL); /* prealloc/delalloc exists only on regular files */ if (!S_ISREG(VFS_I(ip)->i_mode)) @@ -965,8 +965,7 @@ xfs_collapse_file_space( xfs_fileoff_t shift_fsb = XFS_B_TO_FSB(mp, len); bool done = false; - ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); - ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL)); + xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL); trace_xfs_collapse_file_space(ip); @@ -1035,8 +1034,7 @@ xfs_insert_file_space( xfs_fileoff_t shift_fsb = XFS_B_TO_FSB(mp, len); bool done = false; - ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); - ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL)); + xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL); trace_xfs_insert_file_space(ip); diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c index cc6dc56f455d..e82dd5d65cde 100644 --- a/fs/xfs/xfs_dir2_readdir.c +++ b/fs/xfs/xfs_dir2_readdir.c @@ -522,7 +522,7 @@ xfs_readdir( return -EIO; ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); - ASSERT(xfs_isilocked(dp, XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); + xfs_assert_ilocked(dp, XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL); XFS_STATS_INC(dp->i_mount, xs_dir_getdents); args.dp = dp; diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index e38fe4c8f25b..308b5f33b712 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -950,7 +950,7 @@ xfs_qm_dqget_inode( if (error) return error; - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); ASSERT(xfs_inode_dquot(ip, type) == NULL); id = xfs_qm_id_for_quotatype(ip, type); @@ -1007,7 +1007,7 @@ restart: } dqret: - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); trace_xfs_dqget_miss(dqp); *O_dqpp = dqp; return 0; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index e33e5e13b95f..632653e00906 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -879,7 +879,7 @@ xfs_break_dax_layouts( { struct page *page; - ASSERT(xfs_isilocked(XFS_I(inode), XFS_MMAPLOCK_EXCL)); + xfs_assert_ilocked(XFS_I(inode), XFS_MMAPLOCK_EXCL); page = dax_layout_busy_page(inode->i_mapping); if (!page) @@ -900,7 +900,7 @@ xfs_break_layouts( bool retry; int error; - ASSERT(xfs_isilocked(XFS_I(inode), XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)); + xfs_assert_ilocked(XFS_I(inode), XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL); do { retry = false; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 37ec247edc13..634df4f8484d 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -328,52 +328,26 @@ xfs_ilock_demote( trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_); } -#if defined(DEBUG) || defined(XFS_WARN) -static inline bool -__xfs_rwsem_islocked( - struct rw_semaphore *rwsem, - bool shared) -{ - if (!debug_locks) - return rwsem_is_locked(rwsem); - - if (!shared) - return lockdep_is_held_type(rwsem, 0); - - /* - * We are checking that the lock is held at least in shared - * mode but don't care that it might be held exclusively - * (i.e. shared | excl). Hence we check if the lock is held - * in any mode rather than an explicit shared mode. 
- */ - return lockdep_is_held_type(rwsem, -1); -} - -bool -xfs_isilocked( +void +xfs_assert_ilocked( struct xfs_inode *ip, uint lock_flags) { - if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) { - if (!(lock_flags & XFS_ILOCK_SHARED)) - return !!ip->i_lock.mr_writer; - return rwsem_is_locked(&ip->i_lock.mr_lock); - } + if (lock_flags & XFS_ILOCK_SHARED) + rwsem_assert_held(&ip->i_lock.mr_lock); + else if (lock_flags & XFS_ILOCK_EXCL) + ASSERT(ip->i_lock.mr_writer); - if (lock_flags & (XFS_MMAPLOCK_EXCL|XFS_MMAPLOCK_SHARED)) { - return __xfs_rwsem_islocked(&VFS_I(ip)->i_mapping->invalidate_lock, - (lock_flags & XFS_MMAPLOCK_SHARED)); - } + if (lock_flags & XFS_MMAPLOCK_SHARED) + rwsem_assert_held(&VFS_I(ip)->i_mapping->invalidate_lock); + else if (lock_flags & XFS_MMAPLOCK_EXCL) + rwsem_assert_held_write(&VFS_I(ip)->i_mapping->invalidate_lock); - if (lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) { - return __xfs_rwsem_islocked(&VFS_I(ip)->i_rwsem, - (lock_flags & XFS_IOLOCK_SHARED)); - } - - ASSERT(0); - return false; + if (lock_flags & XFS_IOLOCK_SHARED) + rwsem_assert_held(&VFS_I(ip)->i_rwsem); + else if (lock_flags & XFS_IOLOCK_EXCL) + rwsem_assert_held_write(&VFS_I(ip)->i_rwsem); } -#endif /* * xfs_lockdep_subclass_ok() is only used in an ASSERT, so is only called when @@ -1342,9 +1316,9 @@ xfs_itruncate_extents_flags( xfs_fileoff_t first_unmap_block; int error = 0; - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - ASSERT(!atomic_read(&VFS_I(ip)->i_count) || - xfs_isilocked(ip, XFS_IOLOCK_EXCL)); + xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); + if (atomic_read(&VFS_I(ip)->i_count)) + xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL); ASSERT(new_size <= XFS_ISIZE(ip)); ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); ASSERT(ip->i_itemp != NULL); @@ -1596,7 +1570,7 @@ xfs_inactive_ifree( xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); error = xfs_ifree(tp, ip); - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); if (error) { /* * If we fail to free the inode, shut down. 
The cancel @@ -2350,7 +2324,7 @@ xfs_ifree( struct xfs_inode_log_item *iip = ip->i_itemp; int error; - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); ASSERT(VFS_I(ip)->i_nlink == 0); ASSERT(ip->i_df.if_nextents == 0); ASSERT(ip->i_disk_size == 0 || !S_ISREG(VFS_I(ip)->i_mode)); @@ -2419,7 +2393,7 @@ static void xfs_iunpin( struct xfs_inode *ip) { - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); + xfs_assert_ilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED); trace_xfs_inode_unpin_nowait(ip, _RET_IP_); @@ -3182,7 +3156,7 @@ xfs_iflush( struct xfs_mount *mp = ip->i_mount; int error; - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); + xfs_assert_ilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED); ASSERT(xfs_iflags_test(ip, XFS_IFLUSHING)); ASSERT(ip->i_df.if_format != XFS_DINODE_FMT_BTREE || ip->i_df.if_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)); diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 97f63bacd4c2..dcc818901a79 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -523,7 +523,7 @@ void xfs_ilock(xfs_inode_t *, uint); int xfs_ilock_nowait(xfs_inode_t *, uint); void xfs_iunlock(xfs_inode_t *, uint); void xfs_ilock_demote(xfs_inode_t *, uint); -bool xfs_isilocked(struct xfs_inode *, uint); +void xfs_assert_ilocked(struct xfs_inode *, uint); uint xfs_ilock_data_map_shared(struct xfs_inode *); uint xfs_ilock_attr_map_shared(struct xfs_inode *); diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index bfbeafc8e120..f28d653300d1 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -650,7 +650,7 @@ xfs_inode_item_pin( { struct xfs_inode *ip = INODE_ITEM(lip)->ili_inode; - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); ASSERT(lip->li_buf); trace_xfs_inode_pin(ip, _RET_IP_); @@ -756,7 +756,7 @@ xfs_inode_item_release( unsigned short lock_flags; ASSERT(ip->i_itemp != NULL); - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); lock_flags = iip->ili_lock_flags; iip->ili_lock_flags = 0; diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index be102fd49560..12510d20813b 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -796,8 +796,7 @@ xfs_setattr_size( uint lock_flags = 0; bool did_zeroing = false; - ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); - ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL)); + xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL); ASSERT(S_ISREG(inode->i_mode)); ASSERT((iattr->ia_valid & (ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET| ATTR_MTIME_SET|ATTR_TIMES_SET)) == 0); diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 5bcd67cae16a..b5b555698ae1 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -254,7 +254,7 @@ xfs_qm_dqattach_one( struct xfs_dquot *dqp; int error; - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); error = 0; /* @@ -322,7 +322,7 @@ xfs_qm_dqattach_locked( if (!xfs_qm_need_dqattach(ip)) return 0; - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); if (XFS_IS_UQUOTA_ON(mp) && !ip->i_udquot) { error = xfs_qm_dqattach_one(ip, XFS_DQTYPE_USER, @@ -353,7 +353,7 @@ done: * Don't worry about the dquots that we may have attached before any * error - they'll get detached later if it has not already been done. 
*/ - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); return error; } @@ -1811,7 +1811,7 @@ xfs_qm_vop_chown( XFS_TRANS_DQ_RTBCOUNT : XFS_TRANS_DQ_BCOUNT; - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); ASSERT(XFS_IS_QUOTA_ON(ip->i_mount)); /* old dquot */ @@ -1899,7 +1899,7 @@ xfs_qm_vop_create_dqattach( if (!XFS_IS_QUOTA_ON(mp)) return; - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); if (udqp && XFS_IS_UQUOTA_ON(mp)) { ASSERT(ip->i_udquot == NULL); diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index d5ca8bcae65b..e64ef2a293b6 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -527,7 +527,7 @@ xfs_reflink_allocate_cow( int error; bool found; - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); if (!ip->i_cowfp) { ASSERT(!xfs_is_reflink_inode(ip)); xfs_ifork_init_cow(ip); diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 2f85567f3d75..b62b5c34413e 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -1260,7 +1260,7 @@ xfs_rtpick_extent( uint64_t seq; /* sequence number of file creation */ struct timespec64 ts; /* timespec in inode */ - ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL)); + xfs_assert_ilocked(mp->m_rbmip, XFS_ILOCK_EXCL); ts = inode_get_atime(VFS_I(mp->m_rbmip)); if (!(mp->m_rbmip->i_diflags & XFS_DIFLAG_NEWRTBM)) { diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 92974a4414c8..c2dc8c501bdc 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -44,7 +44,7 @@ xfs_readlink_bmap_ilocked( int fsblocks = 0; int offset; - ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); + xfs_assert_ilocked(ip, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL); fsblocks = xfs_symlink_blocks(mp, pathlen); error = xfs_bmapi_read(ip, 0, fsblocks, mval, &nmaps, 0); diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 12d45e93f07d..7350640059cc 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -1273,7 +1273,7 @@ xfs_trans_reserve_more_inode( unsigned int rtx = xfs_extlen_to_rtxlen(mp, rblocks); int error; - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); error = xfs_trans_reserve(tp, &resv, dblocks, rtx); if (error) diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index aa00cf67ad72..9c159d016ecf 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -796,7 +796,7 @@ xfs_trans_reserve_quota_nblks( return 0; ASSERT(!xfs_is_quota_inode(&mp->m_sb, ip->i_ino)); - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); if (force) qflags |= XFS_QMOPT_FORCE_RES; -- cgit v1.2.3 From 785dd131525060c87994ff077ea3376f05c98304 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 19 Feb 2024 15:41:13 +0000 Subject: xfs: Remove mrlock wrapper mrlock was an rwsem wrapper that also recorded whether the lock was held for read or write. Now that we can ask the generic code whether the lock is held for read or write, we can remove this wrapper and use an rwsem directly. As the comment says, we can't use lockdep to assert that the ILOCK is held for write, because we might be in a workqueue, and we aren't able to tell lockdep that we do in fact own the lock. Reviewed-by: "Darrick J. 
Wong" Reviewed-by: Dave Chinner Signed-off-by: "Matthew Wilcox (Oracle)" Signed-off-by: Chandan Babu R --- fs/xfs/mrlock.h | 78 ------------------------------------------------------ fs/xfs/xfs_inode.c | 22 ++++++++------- fs/xfs/xfs_inode.h | 2 +- fs/xfs/xfs_iops.c | 4 +-- fs/xfs/xfs_linux.h | 2 +- fs/xfs/xfs_super.c | 4 +-- 6 files changed, 18 insertions(+), 94 deletions(-) delete mode 100644 fs/xfs/mrlock.h diff --git a/fs/xfs/mrlock.h b/fs/xfs/mrlock.h deleted file mode 100644 index 79155eec341b..000000000000 --- a/fs/xfs/mrlock.h +++ /dev/null @@ -1,78 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (c) 2000-2006 Silicon Graphics, Inc. - * All Rights Reserved. - */ -#ifndef __XFS_SUPPORT_MRLOCK_H__ -#define __XFS_SUPPORT_MRLOCK_H__ - -#include - -typedef struct { - struct rw_semaphore mr_lock; -#if defined(DEBUG) || defined(XFS_WARN) - int mr_writer; -#endif -} mrlock_t; - -#if defined(DEBUG) || defined(XFS_WARN) -#define mrinit(mrp, name) \ - do { (mrp)->mr_writer = 0; init_rwsem(&(mrp)->mr_lock); } while (0) -#else -#define mrinit(mrp, name) \ - do { init_rwsem(&(mrp)->mr_lock); } while (0) -#endif - -#define mrlock_init(mrp, t,n,s) mrinit(mrp, n) -#define mrfree(mrp) do { } while (0) - -static inline void mraccess_nested(mrlock_t *mrp, int subclass) -{ - down_read_nested(&mrp->mr_lock, subclass); -} - -static inline void mrupdate_nested(mrlock_t *mrp, int subclass) -{ - down_write_nested(&mrp->mr_lock, subclass); -#if defined(DEBUG) || defined(XFS_WARN) - mrp->mr_writer = 1; -#endif -} - -static inline int mrtryaccess(mrlock_t *mrp) -{ - return down_read_trylock(&mrp->mr_lock); -} - -static inline int mrtryupdate(mrlock_t *mrp) -{ - if (!down_write_trylock(&mrp->mr_lock)) - return 0; -#if defined(DEBUG) || defined(XFS_WARN) - mrp->mr_writer = 1; -#endif - return 1; -} - -static inline void mrunlock_excl(mrlock_t *mrp) -{ -#if defined(DEBUG) || defined(XFS_WARN) - mrp->mr_writer = 0; -#endif - up_write(&mrp->mr_lock); -} - -static inline void mrunlock_shared(mrlock_t *mrp) -{ - up_read(&mrp->mr_lock); -} - -static inline void mrdemote(mrlock_t *mrp) -{ -#if defined(DEBUG) || defined(XFS_WARN) - mrp->mr_writer = 0; -#endif - downgrade_write(&mrp->mr_lock); -} - -#endif /* __XFS_SUPPORT_MRLOCK_H__ */ diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 634df4f8484d..110077ca3d2a 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -203,9 +203,9 @@ xfs_ilock( } if (lock_flags & XFS_ILOCK_EXCL) - mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags)); + down_write_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags)); else if (lock_flags & XFS_ILOCK_SHARED) - mraccess_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags)); + down_read_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags)); } /* @@ -246,10 +246,10 @@ xfs_ilock_nowait( } if (lock_flags & XFS_ILOCK_EXCL) { - if (!mrtryupdate(&ip->i_lock)) + if (!down_write_trylock(&ip->i_lock)) goto out_undo_mmaplock; } else if (lock_flags & XFS_ILOCK_SHARED) { - if (!mrtryaccess(&ip->i_lock)) + if (!down_read_trylock(&ip->i_lock)) goto out_undo_mmaplock; } return 1; @@ -298,9 +298,9 @@ xfs_iunlock( up_read(&VFS_I(ip)->i_mapping->invalidate_lock); if (lock_flags & XFS_ILOCK_EXCL) - mrunlock_excl(&ip->i_lock); + up_write(&ip->i_lock); else if (lock_flags & XFS_ILOCK_SHARED) - mrunlock_shared(&ip->i_lock); + up_read(&ip->i_lock); trace_xfs_iunlock(ip, lock_flags, _RET_IP_); } @@ -319,7 +319,7 @@ xfs_ilock_demote( ~(XFS_IOLOCK_EXCL|XFS_MMAPLOCK_EXCL|XFS_ILOCK_EXCL)) == 0); if (lock_flags & XFS_ILOCK_EXCL) - mrdemote(&ip->i_lock); + 
downgrade_write(&ip->i_lock); if (lock_flags & XFS_MMAPLOCK_EXCL) downgrade_write(&VFS_I(ip)->i_mapping->invalidate_lock); if (lock_flags & XFS_IOLOCK_EXCL) @@ -333,10 +333,14 @@ xfs_assert_ilocked( struct xfs_inode *ip, uint lock_flags) { + /* + * Sometimes we assert the ILOCK is held exclusively, but we're in + * a workqueue, so lockdep doesn't know we're the owner. + */ if (lock_flags & XFS_ILOCK_SHARED) - rwsem_assert_held(&ip->i_lock.mr_lock); + rwsem_assert_held(&ip->i_lock); else if (lock_flags & XFS_ILOCK_EXCL) - ASSERT(ip->i_lock.mr_writer); + rwsem_assert_held_write_nolockdep(&ip->i_lock); if (lock_flags & XFS_MMAPLOCK_SHARED) rwsem_assert_held(&VFS_I(ip)->i_mapping->invalidate_lock); diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index dcc818901a79..796d11065fe2 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -39,7 +39,7 @@ typedef struct xfs_inode { /* Transaction and locking information. */ struct xfs_inode_log_item *i_itemp; /* logging information */ - mrlock_t i_lock; /* inode lock */ + struct rw_semaphore i_lock; /* inode lock */ atomic_t i_pincount; /* inode pin count */ struct llist_node i_gclist; /* deferred inactivation list */ diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 12510d20813b..66f8c47642e8 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -1284,9 +1284,9 @@ xfs_setup_inode( */ lockdep_set_class(&inode->i_rwsem, &inode->i_sb->s_type->i_mutex_dir_key); - lockdep_set_class(&ip->i_lock.mr_lock, &xfs_dir_ilock_class); + lockdep_set_class(&ip->i_lock, &xfs_dir_ilock_class); } else { - lockdep_set_class(&ip->i_lock.mr_lock, &xfs_nondir_ilock_class); + lockdep_set_class(&ip->i_lock, &xfs_nondir_ilock_class); } /* diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index caccb7f76690..75245932e0ad 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -20,7 +20,6 @@ typedef __u32 xfs_dev_t; typedef __u32 xfs_nlink_t; #include "xfs_types.h" -#include "mrlock.h" #include #include @@ -50,6 +49,7 @@ typedef __u32 xfs_nlink_t; #include #include #include +#include #include #include #include diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 6ce1e6deb7ec..b31652fa7004 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -716,9 +716,7 @@ xfs_fs_inode_init_once( /* xfs inode */ atomic_set(&ip->i_pincount, 0); spin_lock_init(&ip->i_flags_lock); - - mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER, - "xfsino", ip->i_ino); + init_rwsem(&ip->i_lock); } /* -- cgit v1.2.3 From 661723c3bdaffa7d970a59987e37026bc5ed5657 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 20 Feb 2024 11:09:23 +1100 Subject: xfs: use kvfree() in xfs_ioc_attr_list() Wrongly converted from kmem_free() to kfree(). 
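For context on this and the next fix, here is a short illustrative sketch of the allocator pairing rule (the wrapper functions below are hypothetical; kvmalloc() and kvfree() are the real kernel APIs): memory obtained with kvmalloc() may come from vmalloc space rather than the slab for large sizes, so it must be released with kvfree(), which dispatches to the correct free routine. Passing a vmalloc address to plain kfree() is the crash class these conversions reintroduced.

#include <linux/slab.h>	/* kvmalloc(), kvfree(), GFP_KERNEL */

/* Hypothetical helper: the buffer may be slab or vmalloc backed. */
static void *big_buf_alloc(size_t size)
{
	/* kvmalloc() falls back to vmalloc() when the slab cannot satisfy size. */
	return kvmalloc(size, GFP_KERNEL);
}

/* Hypothetical helper: the only safe counterpart is kvfree(), not kfree(). */
static void big_buf_free(void *buf)
{
	kvfree(buf);
}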
Reported-by: Matthew Wilcox Fixes: 49292576136f ("xfs: convert kmem_free() for kvmalloc users to kvfree()") Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Chandan Babu R --- fs/xfs/xfs_ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 7c35d7644097..9d6a75262045 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -435,7 +435,7 @@ xfs_ioc_attr_list( copy_to_user(ucursor, &context.cursor, sizeof(context.cursor))) error = -EFAULT; out_free: - kfree(buffer); + kvfree(buffer); return error; } -- cgit v1.2.3 From 7d5ba7ca6a4525b8908c285c2949bb946dfbbc54 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 20 Feb 2024 11:24:41 +1100 Subject: xfs: use kvfree in xfs_ioc_getfsmap() Another incorrect conversion to kfree() instead of kvfree(). Fixes: 49292576136f ("xfs: convert kmem_free() for kvmalloc users to kvfree()") Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Chandan Babu R --- fs/xfs/xfs_ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 9d6a75262045..d0e2cec6210d 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1636,7 +1636,7 @@ xfs_ioc_getfsmap( } out_free: - kfree(recs); + kvfree(recs); return error; } -- cgit v1.2.3 From b64e74e95aa6491b31477e9002aab1d8df3995bf Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 19 Feb 2024 07:27:09 +0100 Subject: mm: move mapping_set_update out of mapping_set_update is only used inside mm/. Move mapping_set_update to mm/internal.h and turn it into an inline function instead of a macro. Signed-off-by: Christoph Hellwig Reviewed-by: "Matthew Wilcox (Oracle)" Signed-off-by: Chandan Babu R --- include/linux/swap.h | 10 ---------- mm/filemap.c | 9 +++++++++ mm/internal.h | 4 ++++ mm/workingset.c | 1 + 4 files changed, 14 insertions(+), 10 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 4db00ddad261..755fc64ba48d 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -350,16 +350,6 @@ void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg); void workingset_refault(struct folio *folio, void *shadow); void workingset_activation(struct folio *folio); -/* Only track the nodes of mappings with shadow entries */ -void workingset_update_node(struct xa_node *node); -extern struct list_lru shadow_nodes; -#define mapping_set_update(xas, mapping) do { \ - if (!dax_mapping(mapping) && !shmem_mapping(mapping)) { \ - xas_set_update(xas, workingset_update_node); \ - xas_set_lru(xas, &shadow_nodes); \ - } \ -} while (0) - /* linux/mm/page_alloc.c */ extern unsigned long totalreserve_pages; diff --git a/mm/filemap.c b/mm/filemap.c index 750e779c23db..6c8b089f00d2 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -124,6 +124,15 @@ * ->private_lock (zap_pte_range->block_dirty_folio) */ +static void mapping_set_update(struct xa_state *xas, + struct address_space *mapping) +{ + if (dax_mapping(mapping) || shmem_mapping(mapping)) + return; + xas_set_update(xas, workingset_update_node); + xas_set_lru(xas, &shadow_nodes); +} + static void page_cache_delete(struct address_space *mapping, struct folio *folio, void *shadow) { diff --git a/mm/internal.h b/mm/internal.h index f309a010d50f..4398f572485f 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1266,4 +1266,8 @@ static inline void shrinker_debugfs_remove(struct dentry *debugfs_entry, } #endif /* CONFIG_SHRINKER_DEBUG */ +/* Only track the nodes of 
mappings with shadow entries */ +void workingset_update_node(struct xa_node *node); +extern struct list_lru shadow_nodes; + #endif /* __MM_INTERNAL_H */ diff --git a/mm/workingset.c b/mm/workingset.c index 226012974328..f2a0ecaf708d 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -16,6 +16,7 @@ #include #include #include +#include "internal.h" /* * Double CLOCK lists -- cgit v1.2.3 From aefacb2041f77784059b86c5fd151066859ad19a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 19 Feb 2024 07:27:10 +0100 Subject: shmem: move shmem_mapping out of line shmem_aops really should not be exported to the world. Move shmem_mapping and export it as internal for the one semi-legitimate modular user in udmabuf. This effectively reverts commit 30e6a51dbb05 ("mm/shmem.c: make shmem_mapping() inline"), which added a bogus shmem_aops non-GPL export for no reason whatsoever, as there was no shmem_mapping call outside of core MM code at that point. Signed-off-by: Christoph Hellwig Reviewed-by: "Matthew Wilcox (Oracle)" Signed-off-by: Chandan Babu R --- include/linux/shmem_fs.h | 6 +----- mm/shmem.c | 11 ++++++++--- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 2caa6b86106a..6b96a87e4bc8 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -97,11 +97,7 @@ extern unsigned long shmem_get_unmapped_area(struct file *, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); extern int shmem_lock(struct file *file, int lock, struct ucounts *ucounts); #ifdef CONFIG_SHMEM -extern const struct address_space_operations shmem_aops; -static inline bool shmem_mapping(struct address_space *mapping) -{ - return mapping->a_ops == &shmem_aops; -} +bool shmem_mapping(struct address_space *mapping); #else static inline bool shmem_mapping(struct address_space *mapping) { diff --git a/mm/shmem.c b/mm/shmem.c index d7c84ff62186..f607b0cab7e4 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -254,7 +254,7 @@ static void shmem_inode_unacct_blocks(struct inode *inode, long pages) } static const struct super_operations shmem_ops; -const struct address_space_operations shmem_aops; +static const struct address_space_operations shmem_aops; static const struct file_operations shmem_file_operations; static const struct inode_operations shmem_inode_operations; static const struct inode_operations shmem_dir_inode_operations; @@ -263,6 +263,12 @@ static const struct vm_operations_struct shmem_vm_ops; static const struct vm_operations_struct shmem_anon_vm_ops; static struct file_system_type shmem_fs_type; +bool shmem_mapping(struct address_space *mapping) +{ + return mapping->a_ops == &shmem_aops; +} +EXPORT_SYMBOL_GPL(shmem_mapping); + bool vma_is_anon_shmem(struct vm_area_struct *vma) { return vma->vm_ops == &shmem_anon_vm_ops; @@ -4466,7 +4472,7 @@ static int shmem_error_remove_folio(struct address_space *mapping, return 0; } -const struct address_space_operations shmem_aops = { +static const struct address_space_operations shmem_aops = { .writepage = shmem_writepage, .dirty_folio = noop_dirty_folio, #ifdef CONFIG_TMPFS @@ -4478,7 +4484,6 @@ static const struct address_space_operations shmem_aops = { #endif .error_remove_folio = shmem_error_remove_folio, }; -EXPORT_SYMBOL(shmem_aops); static const struct file_operations shmem_file_operations = { .mmap = shmem_mmap, -- cgit v1.2.3 From e11381d83d72198565f4545d9988b4720288eb64 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 19 Feb 2024 07:27:11 +0100 Subject:
shmem: set a_ops earlier in shmem_symlink Set the a_ops in shmem_symlink before reading a folio from the mapping to prepare for asserting that shmem_get_folio is only called on shmem mappings. Signed-off-by: Christoph Hellwig Reviewed-by: "Matthew Wilcox (Oracle)" Signed-off-by: Chandan Babu R --- mm/shmem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/shmem.c b/mm/shmem.c index f607b0cab7e4..1900916aa84d 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3506,10 +3506,10 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir, inode->i_op = &shmem_short_symlink_operations; } else { inode_nohighmem(inode); + inode->i_mapping->a_ops = &shmem_aops; error = shmem_get_folio(inode, 0, &folio, SGP_WRITE); if (error) goto out_remove_offset; - inode->i_mapping->a_ops = &shmem_aops; inode->i_op = &shmem_symlink_inode_operations; memcpy(folio_address(folio), symname, len); folio_mark_uptodate(folio); -- cgit v1.2.3 From 1cd81faaf61b42307e81f2dd173934005c220a64 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 19 Feb 2024 07:27:12 +0100 Subject: shmem: move the shmem_mapping assert into shmem_get_folio_gfp Move the check that the inode really is a shmemfs one from shmem_read_folio_gfp to shmem_get_folio_gfp given that shmem_get_folio can also be called from outside of shmem.c. Also turn it into a WARN_ON_ONCE and error return instead of BUG_ON to be less severe. Signed-off-by: Christoph Hellwig Reviewed-by: "Matthew Wilcox (Oracle)" Reviewed-by: "Darrick J. Wong" Signed-off-by: Chandan Babu R --- mm/shmem.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/shmem.c b/mm/shmem.c index 1900916aa84d..ad533b2f0721 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1972,6 +1972,9 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index, int error; bool alloced; + if (WARN_ON_ONCE(!shmem_mapping(inode->i_mapping))) + return -EINVAL; + if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT)) return -EFBIG; repeat: @@ -4915,7 +4918,6 @@ struct folio *shmem_read_folio_gfp(struct address_space *mapping, struct folio *folio; int error; - BUG_ON(!shmem_mapping(mapping)); error = shmem_get_folio_gfp(inode, index, &folio, SGP_CACHE, gfp, NULL, NULL); if (error) -- cgit v1.2.3 From d7468609ee0f90ceb24143a32edc47d433d1dbba Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 19 Feb 2024 07:27:13 +0100 Subject: shmem: export shmem_get_folio Export shmem_get_folio as a slightly lower-level variant of shmem_read_folio_gfp. This will be useful for XFS xfile use cases that want to pass SGP_NOALLOC or get a locked page, which the thin shmem_read_folio_gfp wrapper can't provide. Signed-off-by: Christoph Hellwig Reviewed-by: "Matthew Wilcox (Oracle)" Reviewed-by: "Darrick J. Wong" Signed-off-by: Chandan Babu R --- mm/shmem.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/mm/shmem.c b/mm/shmem.c index ad533b2f0721..aeb1fd19ea3f 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2137,12 +2137,32 @@ unlock: return error; } +/** + * shmem_get_folio - find, and lock a shmem folio. + * @inode: inode to search + * @index: the page index. + * @foliop: pointer to the folio if found + * @sgp: SGP_* flags to control behavior + * + * Looks up the page cache entry at @inode & @index. If a folio is + * present, it is returned locked with an increased refcount. 
+ * + * When no folio is found, the behavior depends on @sgp: + * - for SGP_READ, *foliop is %NULL and 0 is returned + * - for SGP_NOALLOC, *foliop is %NULL and -ENOENT is returned + * - for all other flags a new folio is allocated, inserted into the + * page cache and returned locked in @foliop. + * + * Context: May sleep. + * Return: 0 if successful, else a negative error code. + */ int shmem_get_folio(struct inode *inode, pgoff_t index, struct folio **foliop, enum sgp_type sgp) { return shmem_get_folio_gfp(inode, index, foliop, sgp, mapping_gfp_mask(inode->i_mapping), NULL, NULL); } +EXPORT_SYMBOL_GPL(shmem_get_folio); /* * This is like autoremove_wake_function, but it removes the wait queue -- cgit v1.2.3 From be9d93661d548a41bb8f135b9953191f0fb2e44b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 19 Feb 2024 07:27:14 +0100 Subject: shmem: export shmem_kernel_file_setup XFS wants to use this for its internal in-memory data structures and currently duplicates the functionality. Export shmem_kernel_file_setup to allow XFS to switch over to using the proper kernel API. Signed-off-by: Christoph Hellwig Reviewed-by: "Matthew Wilcox (Oracle)" Reviewed-by: "Darrick J. Wong" Signed-off-by: Chandan Babu R --- mm/shmem.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/shmem.c b/mm/shmem.c index aeb1fd19ea3f..95e70e9ea060 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -4861,6 +4861,7 @@ struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned lon { return __shmem_file_setup(shm_mnt, name, size, flags, S_PRIVATE); } +EXPORT_SYMBOL_GPL(shmem_kernel_file_setup); /** * shmem_file_setup - get an unlinked file living in tmpfs -- cgit v1.2.3 From 9d8b36744935f83c5553e6f242b9961f676628ed Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 19 Feb 2024 07:27:15 +0100 Subject: shmem: document how to "persist" data when using shmem_*file_setup Add a blurb that simply dirtying the folio will persist data for in-kernel shmem files. This is what most of the callers already do. Signed-off-by: Christoph Hellwig Reviewed-by: "Matthew Wilcox (Oracle)" Reviewed-by: "Darrick J. Wong" Signed-off-by: Chandan Babu R --- mm/shmem.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mm/shmem.c b/mm/shmem.c index 95e70e9ea060..fb76da93d369 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2147,6 +2147,10 @@ unlock: * Looks up the page cache entry at @inode & @index. If a folio is * present, it is returned locked with an increased refcount. * + * If the caller modifies data in the folio, it must call folio_mark_dirty() + * before unlocking the folio to ensure that the folio is not reclaimed. + * There is no need to reserve space before calling folio_mark_dirty(). + * * When no folio is found, the behavior depends on @sgp: * - for SGP_READ, *foliop is %NULL and 0 is returned * - for SGP_NOALLOC, *foliop is %NULL and -ENOENT is returned * - for all other flags a new folio is allocated, inserted into the * page cache and returned locked in @foliop. -- cgit v1.2.3 From b44c0eb8ae9c3e22cfa113774134673ac18ac848 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 19 Feb 2024 07:27:16 +0100 Subject: xfs: use VM_NORESERVE in xfile_create xfile_create creates a (potentially large) sparse file. Pass VM_NORESERVE to shmem_file_setup to not account for the entire file size at file creation time.
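For illustration, here is a hedged sketch of the call pattern this part of the series converges on; shmem_kernel_file_setup() and VM_NORESERVE are the real kernel APIs, but the wrapper function below is hypothetical.

#include <linux/err.h>
#include <linux/fs.h>
#include <linux/mm.h>		/* VM_NORESERVE */
#include <linux/shmem_fs.h>	/* shmem_kernel_file_setup() */

/* Hypothetical caller: create a large, sparse in-kernel tmpfs file. */
static struct file *sketch_create_xfile(loff_t isize)
{
	/*
	 * VM_NORESERVE avoids accounting the whole sparse size at
	 * creation time; memory is charged only as pages are actually
	 * instantiated.  The result is a file pointer or an ERR_PTR(),
	 * never NULL.
	 */
	return shmem_kernel_file_setup("xfile-sketch", isize, VM_NORESERVE);
}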
Reported-by: Hugh Dickins Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Chandan Babu R --- fs/xfs/scrub/xfile.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/scrub/xfile.c b/fs/xfs/scrub/xfile.c index 090c3ead43fd..1cf4b239bdbb 100644 --- a/fs/xfs/scrub/xfile.c +++ b/fs/xfs/scrub/xfile.c @@ -68,7 +68,7 @@ xfile_create( if (!xf) return -ENOMEM; - xf->file = shmem_file_setup(description, isize, 0); + xf->file = shmem_file_setup(description, isize, VM_NORESERVE); if (!xf->file) goto out_xfile; if (IS_ERR(xf->file)) { -- cgit v1.2.3 From 1b07ea2ab3dc0307f80c735cba8c3ef690bd9aab Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 19 Feb 2024 07:27:17 +0100 Subject: xfs: shmem_file_setup can't return NULL shmem_file_setup always returns a struct file pointer or an ERR_PTR, so remove the code to check for a NULL return. Signed-off-by: Christoph Hellwig Reviewed-by: "Darrick J. Wong" Signed-off-by: Chandan Babu R --- fs/xfs/scrub/xfile.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/xfs/scrub/xfile.c b/fs/xfs/scrub/xfile.c index 1cf4b239bdbb..e649558351bc 100644 --- a/fs/xfs/scrub/xfile.c +++ b/fs/xfs/scrub/xfile.c @@ -62,15 +62,13 @@ xfile_create( { struct inode *inode; struct xfile *xf; - int error = -ENOMEM; + int error; xf = kmalloc(sizeof(struct xfile), XCHK_GFP_FLAGS); if (!xf) return -ENOMEM; xf->file = shmem_file_setup(description, isize, VM_NORESERVE); - if (!xf->file) - goto out_xfile; if (IS_ERR(xf->file)) { error = PTR_ERR(xf->file); goto out_xfile; -- cgit v1.2.3 From efc9dc096399c692bd258ec9cdbe6b868a59545a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 19 Feb 2024 07:27:18 +0100 Subject: xfs: use shmem_kernel_file_setup in xfile_create shmem_kernel_file_setup is equivalent to shmem_file_setup except that it already sets the S_PRIVATE flag. Use it instead of open coding the logic. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Reviewed-by: "Darrick J. Wong" Signed-off-by: Chandan Babu R --- fs/xfs/scrub/xfile.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/xfs/scrub/xfile.c b/fs/xfs/scrub/xfile.c index e649558351bc..99a2b48f5662 100644 --- a/fs/xfs/scrub/xfile.c +++ b/fs/xfs/scrub/xfile.c @@ -68,7 +68,7 @@ xfile_create( if (!xf) return -ENOMEM; - xf->file = shmem_file_setup(description, isize, VM_NORESERVE); + xf->file = shmem_kernel_file_setup(description, isize, VM_NORESERVE); if (IS_ERR(xf->file)) { error = PTR_ERR(xf->file); goto out_xfile; @@ -85,7 +85,7 @@ xfile_create( FMODE_LSEEK; xf->file->f_flags |= O_RDWR | O_LARGEFILE | O_NOATIME; inode = file_inode(xf->file); - inode->i_flags |= S_PRIVATE | S_NOCMTIME | S_NOATIME; + inode->i_flags |= S_NOCMTIME | S_NOATIME; inode->i_mode &= ~0177; inode->i_uid = GLOBAL_ROOT_UID; inode->i_gid = GLOBAL_ROOT_GID; -- cgit v1.2.3 From a2078df025d92046cc98a9fce65220abd864781e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 19 Feb 2024 07:27:19 +0100 Subject: xfs: don't modify file and inode flags for shmem files shmem_file_setup is explicitly intended for a file that can be fully read and written by kernel users without restrictions. Don't poke into internals to change random flags in the file or inode. Signed-off-by: Christoph Hellwig Reviewed-by: "Darrick J. 
Wong" Signed-off-by: Chandan Babu R --- fs/xfs/scrub/xfile.c | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/fs/xfs/scrub/xfile.c b/fs/xfs/scrub/xfile.c index 99a2b48f5662..95250db81981 100644 --- a/fs/xfs/scrub/xfile.c +++ b/fs/xfs/scrub/xfile.c @@ -74,22 +74,7 @@ xfile_create( goto out_xfile; } - /* - * We want a large sparse file that we can pread, pwrite, and seek. - * xfile users are responsible for keeping the xfile hidden away from - * all other callers, so we skip timestamp updates and security checks. - * Make the inode only accessible by root, just in case the xfile ever - * escapes. - */ - xf->file->f_mode |= FMODE_PREAD | FMODE_PWRITE | FMODE_NOCMTIME | - FMODE_LSEEK; - xf->file->f_flags |= O_RDWR | O_LARGEFILE | O_NOATIME; inode = file_inode(xf->file); - inode->i_flags |= S_NOCMTIME | S_NOATIME; - inode->i_mode &= ~0177; - inode->i_uid = GLOBAL_ROOT_UID; - inode->i_gid = GLOBAL_ROOT_GID; - lockdep_set_class(&inode->i_rwsem, &xfile_i_mutex_key); trace_xfile_create(xf); -- cgit v1.2.3 From 0473635d46e2344eed897dcfca12aa4688285262 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 19 Feb 2024 07:27:20 +0100 Subject: xfs: remove xfile_stat vfs_getattr is needed to query inode attributes for unknown underlying file systems. But shmemfs is well known for users of shmem_file_setup and shmem_read_mapping_page_gfp that rely on it not needing specific inode revalidation and having a normal mapping. Remove the detour through the getattr method and an extra wrapper, and just read the inode size and i_bytes directly in the scrub tracing code. Signed-off-by: Christoph Hellwig Reviewed-by: "Darrick J. Wong" Signed-off-by: Chandan Babu R --- fs/xfs/scrub/trace.h | 34 ++++++++++------------------------ fs/xfs/scrub/xfile.c | 19 ------------------- fs/xfs/scrub/xfile.h | 7 ------- 3 files changed, 10 insertions(+), 50 deletions(-) diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index 6bbb4e8639dc..260b8fe0a802 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -861,18 +861,11 @@ TRACE_EVENT(xfile_destroy, __field(loff_t, size) ), TP_fast_assign( - struct xfile_stat statbuf; - int ret; - - ret = xfile_stat(xf, &statbuf); - if (!ret) { - __entry->bytes = statbuf.bytes; - __entry->size = statbuf.size; - } else { - __entry->bytes = -1; - __entry->size = -1; - } - __entry->ino = file_inode(xf->file)->i_ino; + struct inode *inode = file_inode(xf->file); + + __entry->ino = inode->i_ino; + __entry->bytes = inode->i_blocks << SECTOR_SHIFT; + __entry->size = i_size_read(inode); ), TP_printk("xfino 0x%lx mem_bytes 0x%llx isize 0x%llx", __entry->ino, @@ -891,19 +884,12 @@ DECLARE_EVENT_CLASS(xfile_class, __field(unsigned long long, bytecount) ), TP_fast_assign( - struct xfile_stat statbuf; - int ret; - - ret = xfile_stat(xf, &statbuf); - if (!ret) { - __entry->bytes_used = statbuf.bytes; - __entry->size = statbuf.size; - } else { - __entry->bytes_used = -1; - __entry->size = -1; - } - __entry->ino = file_inode(xf->file)->i_ino; + struct inode *inode = file_inode(xf->file); + + __entry->ino = inode->i_ino; + __entry->bytes_used = inode->i_blocks << SECTOR_SHIFT; __entry->pos = pos; + __entry->size = i_size_read(inode); __entry->bytecount = bytecount; ), TP_printk("xfino 0x%lx mem_bytes 0x%llx pos 0x%llx bytecount 0x%llx isize 0x%llx", diff --git a/fs/xfs/scrub/xfile.c b/fs/xfs/scrub/xfile.c index 95250db81981..b212d4a9c780 100644 --- a/fs/xfs/scrub/xfile.c +++ b/fs/xfs/scrub/xfile.c @@ -274,25 +274,6 @@ xfile_seek_data( return ret; } -/* Query stat 
information for an xfile. */ -int -xfile_stat( - struct xfile *xf, - struct xfile_stat *statbuf) -{ - struct kstat ks; - int error; - - error = vfs_getattr_nosec(&xf->file->f_path, &ks, - STATX_SIZE | STATX_BLOCKS, AT_STATX_DONT_SYNC); - if (error) - return error; - - statbuf->size = ks.size; - statbuf->bytes = ks.blocks << SECTOR_SHIFT; - return 0; -} - /* * Grab the (locked) page for a memory object. The object cannot span a page * boundary. Returns 0 (and a locked page) if successful, -ENOTBLK if we diff --git a/fs/xfs/scrub/xfile.h b/fs/xfs/scrub/xfile.h index d56643b0f429..c602d11560d8 100644 --- a/fs/xfs/scrub/xfile.h +++ b/fs/xfs/scrub/xfile.h @@ -63,13 +63,6 @@ xfile_obj_store(struct xfile *xf, const void *buf, size_t count, loff_t pos) loff_t xfile_seek_data(struct xfile *xf, loff_t pos); -struct xfile_stat { - loff_t size; - unsigned long long bytes; -}; - -int xfile_stat(struct xfile *xf, struct xfile_stat *statbuf); - int xfile_get_page(struct xfile *xf, loff_t offset, unsigned int len, struct xfile_page *xbuf); int xfile_put_page(struct xfile *xf, struct xfile_page *xbuf); -- cgit v1.2.3 From e47e2e0ba9103df7b3d25356421e6832c4d0e7be Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 19 Feb 2024 07:27:21 +0100 Subject: xfs: remove the xfile_pread/pwrite APIs All current and pending xfile users use the xfile_obj_load and xfile_obj_store API, so make those the actual implementation. Signed-off-by: Christoph Hellwig Reviewed-by: "Darrick J. Wong" Signed-off-by: Chandan Babu R --- .../filesystems/xfs/xfs-online-fsck-design.rst | 23 ++++----- fs/xfs/scrub/rtsummary.c | 6 +-- fs/xfs/scrub/trace.h | 4 +- fs/xfs/scrub/xfarray.c | 18 ++++---- fs/xfs/scrub/xfile.c | 54 ++++++++++------------ fs/xfs/scrub/xfile.h | 32 +------------ 6 files changed, 48 insertions(+), 89 deletions(-) diff --git a/Documentation/filesystems/xfs/xfs-online-fsck-design.rst b/Documentation/filesystems/xfs/xfs-online-fsck-design.rst index 352516feef6f..216c99ce511f 100644 --- a/Documentation/filesystems/xfs/xfs-online-fsck-design.rst +++ b/Documentation/filesystems/xfs/xfs-online-fsck-design.rst @@ -1915,19 +1915,13 @@ four of those five higher level data structures. The fifth use case is discussed in the :ref:`realtime summary ` case study. -The most general storage interface supported by the xfile enables the reading -and writing of arbitrary quantities of data at arbitrary offsets in the xfile. -This capability is provided by ``xfile_pread`` and ``xfile_pwrite`` functions, -which behave similarly to their userspace counterparts. XFS is very record-based, which suggests that the ability to load and store complete records is important. -To support these cases, a pair of ``xfile_obj_load`` and ``xfile_obj_store`` -functions are provided to read and persist objects into an xfile. -They are internally the same as pread and pwrite, except that they treat any -error as an out of memory error. -For online repair, squashing error conditions in this manner is an acceptable -behavior because the only reaction is to abort the operation back to userspace. -All five xfile usecases can be serviced by these four functions. +To support these cases, a pair of ``xfile_load`` and ``xfile_store`` +functions are provided to read and persist objects into an xfile that treat any +error as an out of memory error. For online repair, squashing error conditions +in this manner is an acceptable behavior because the only reaction is to abort +the operation back to userspace. 
However, no discussion of file access idioms is complete without answering the question, "But what about mmap?" @@ -1939,10 +1933,9 @@ tmpfs can only push a pagecache folio to the swap cache if the folio is neither pinned nor locked, which means the xfile must not pin too many folios. Short term direct access to xfile contents is done by locking the pagecache -folio and mapping it into kernel address space. -Programmatic access (e.g. pread and pwrite) uses this mechanism. -Folio locks are not supposed to be held for long periods of time, so long -term direct access to xfile contents is done by bumping the folio refcount, +folio and mapping it into kernel address space. Object load and store uses this +mechanism. Folio locks are not supposed to be held for long periods of time, so +long term direct access to xfile contents is done by bumping the folio refcount, mapping it into kernel address space, and dropping the folio lock. These long term users *must* be responsive to memory reclaim by hooking into the shrinker infrastructure to know when to release folios. diff --git a/fs/xfs/scrub/rtsummary.c b/fs/xfs/scrub/rtsummary.c index b1ff4f33324a..5055092bd9e8 100644 --- a/fs/xfs/scrub/rtsummary.c +++ b/fs/xfs/scrub/rtsummary.c @@ -119,7 +119,7 @@ xfsum_load( xfs_rtsumoff_t sumoff, union xfs_suminfo_raw *rawinfo) { - return xfile_obj_load(sc->xfile, rawinfo, + return xfile_load(sc->xfile, rawinfo, sizeof(union xfs_suminfo_raw), sumoff << XFS_WORDLOG); } @@ -130,7 +130,7 @@ xfsum_store( xfs_rtsumoff_t sumoff, const union xfs_suminfo_raw rawinfo) { - return xfile_obj_store(sc->xfile, &rawinfo, + return xfile_store(sc->xfile, &rawinfo, sizeof(union xfs_suminfo_raw), sumoff << XFS_WORDLOG); } @@ -142,7 +142,7 @@ xfsum_copyout( union xfs_suminfo_raw *rawinfo, unsigned int nr_words) { - return xfile_obj_load(sc->xfile, rawinfo, nr_words << XFS_WORDLOG, + return xfile_load(sc->xfile, rawinfo, nr_words << XFS_WORDLOG, sumoff << XFS_WORDLOG); } diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index 260b8fe0a802..0327cab606b0 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -903,8 +903,8 @@ DECLARE_EVENT_CLASS(xfile_class, DEFINE_EVENT(xfile_class, name, \ TP_PROTO(struct xfile *xf, loff_t pos, unsigned long long bytecount), \ TP_ARGS(xf, pos, bytecount)) -DEFINE_XFILE_EVENT(xfile_pread); -DEFINE_XFILE_EVENT(xfile_pwrite); +DEFINE_XFILE_EVENT(xfile_load); +DEFINE_XFILE_EVENT(xfile_store); DEFINE_XFILE_EVENT(xfile_seek_data); DEFINE_XFILE_EVENT(xfile_get_page); DEFINE_XFILE_EVENT(xfile_put_page); diff --git a/fs/xfs/scrub/xfarray.c b/fs/xfs/scrub/xfarray.c index f0f532c10a5a..95ac14bceead 100644 --- a/fs/xfs/scrub/xfarray.c +++ b/fs/xfs/scrub/xfarray.c @@ -136,7 +136,7 @@ xfarray_load( if (idx >= array->nr) return -ENODATA; - return xfile_obj_load(array->xfile, ptr, array->obj_size, + return xfile_load(array->xfile, ptr, array->obj_size, xfarray_pos(array, idx)); } @@ -152,7 +152,7 @@ xfarray_is_unset( if (array->unset_slots == 0) return false; - error = xfile_obj_load(array->xfile, temp, array->obj_size, pos); + error = xfile_load(array->xfile, temp, array->obj_size, pos); if (!error && xfarray_element_is_null(array, temp)) return true; @@ -184,7 +184,7 @@ xfarray_unset( return 0; memset(temp, 0, array->obj_size); - error = xfile_obj_store(array->xfile, temp, array->obj_size, pos); + error = xfile_store(array->xfile, temp, array->obj_size, pos); if (error) return error; @@ -209,7 +209,7 @@ xfarray_store( ASSERT(!xfarray_element_is_null(array, ptr)); - ret = 
xfile_obj_store(array->xfile, ptr, array->obj_size, + ret = xfile_store(array->xfile, ptr, array->obj_size, xfarray_pos(array, idx)); if (ret) return ret; @@ -245,12 +245,12 @@ xfarray_store_anywhere( for (pos = 0; pos < endpos && array->unset_slots > 0; pos += array->obj_size) { - error = xfile_obj_load(array->xfile, temp, array->obj_size, + error = xfile_load(array->xfile, temp, array->obj_size, pos); if (error || !xfarray_element_is_null(array, temp)) continue; - error = xfile_obj_store(array->xfile, ptr, array->obj_size, + error = xfile_store(array->xfile, ptr, array->obj_size, pos); if (error) return error; @@ -552,7 +552,7 @@ xfarray_isort( trace_xfarray_isort(si, lo, hi); xfarray_sort_bump_loads(si); - error = xfile_obj_load(si->array->xfile, scratch, len, lo_pos); + error = xfile_load(si->array->xfile, scratch, len, lo_pos); if (error) return error; @@ -560,7 +560,7 @@ xfarray_isort( sort(scratch, hi - lo + 1, si->array->obj_size, si->cmp_fn, NULL); xfarray_sort_bump_stores(si); - return xfile_obj_store(si->array->xfile, scratch, len, lo_pos); + return xfile_store(si->array->xfile, scratch, len, lo_pos); } /* Grab a page for sorting records. */ @@ -858,7 +858,7 @@ xfarray_sort_load_cached( if (xfarray_sort_terminated(si, &error)) return error; - return xfile_obj_load(si->array->xfile, ptr, + return xfile_load(si->array->xfile, ptr, si->array->obj_size, idx_pos); } diff --git a/fs/xfs/scrub/xfile.c b/fs/xfs/scrub/xfile.c index b212d4a9c780..c15cc8868880 100644 --- a/fs/xfs/scrub/xfile.c +++ b/fs/xfs/scrub/xfile.c @@ -101,13 +101,11 @@ xfile_destroy( } /* - * Read a memory object directly from the xfile's page cache. Unlike regular - * pread, we return -E2BIG and -EFBIG for reads that are too large or at too - * high an offset, instead of truncating the read. Otherwise, we return - * bytes read or an error code, like regular pread. + * Load an object. Since we're treating this file as "memory", any error or + * short IO is treated as a failure to allocate memory. */ -ssize_t -xfile_pread( +int +xfile_load( struct xfile *xf, void *buf, size_t count, @@ -116,16 +114,15 @@ xfile_pread( struct inode *inode = file_inode(xf->file); struct address_space *mapping = inode->i_mapping; struct page *page = NULL; - ssize_t read = 0; unsigned int pflags; int error = 0; if (count > MAX_RW_COUNT) - return -E2BIG; + return -ENOMEM; if (inode->i_sb->s_maxbytes - pos < count) - return -EFBIG; + return -ENOMEM; - trace_xfile_pread(xf, pos, count); + trace_xfile_load(xf, pos, count); pflags = memalloc_nofs_save(); while (count > 0) { @@ -143,8 +140,10 @@ xfile_pread( __GFP_NOWARN); if (IS_ERR(page)) { error = PTR_ERR(page); - if (error != -ENOMEM) + if (error != -ENOMEM) { + error = -ENOMEM; break; + } memset(buf, 0, len); goto advance; @@ -168,23 +167,18 @@ advance: count -= len; pos += len; buf += len; - read += len; } memalloc_nofs_restore(pflags); - if (read > 0) - return read; return error; } /* - * Write a memory object directly to the xfile's page cache. Unlike regular - * pwrite, we return -E2BIG and -EFBIG for writes that are too large or at too - * high an offset, instead of truncating the write. Otherwise, we return - * bytes written or an error code, like regular pwrite. + * Store an object. Since we're treating this file as "memory", any error or + * short IO is treated as a failure to allocate memory. 
*/ -ssize_t -xfile_pwrite( +int +xfile_store( struct xfile *xf, const void *buf, size_t count, @@ -194,16 +188,15 @@ xfile_pwrite( struct address_space *mapping = inode->i_mapping; const struct address_space_operations *aops = mapping->a_ops; struct page *page = NULL; - ssize_t written = 0; unsigned int pflags; int error = 0; if (count > MAX_RW_COUNT) - return -E2BIG; + return -ENOMEM; if (inode->i_sb->s_maxbytes - pos < count) - return -EFBIG; + return -ENOMEM; - trace_xfile_pwrite(xf, pos, count); + trace_xfile_store(xf, pos, count); pflags = memalloc_nofs_save(); while (count > 0) { @@ -222,8 +215,10 @@ xfile_pwrite( */ error = aops->write_begin(NULL, mapping, pos, len, &page, &fsdata); - if (error) + if (error) { + error = -ENOMEM; break; + } /* * xfile pages must never be mapped into userspace, so we skip @@ -242,13 +237,14 @@ xfile_pwrite( ret = aops->write_end(NULL, mapping, pos, len, len, page, fsdata); if (ret < 0) { - error = ret; + error = -ENOMEM; break; } - written += ret; - if (ret != len) + if (ret != len) { + error = -ENOMEM; break; + } count -= ret; pos += ret; @@ -256,8 +252,6 @@ xfile_pwrite( } memalloc_nofs_restore(pflags); - if (written > 0) - return written; return error; } diff --git a/fs/xfs/scrub/xfile.h b/fs/xfs/scrub/xfile.h index c602d11560d8..465b10f492b6 100644 --- a/fs/xfs/scrub/xfile.h +++ b/fs/xfs/scrub/xfile.h @@ -29,38 +29,10 @@ struct xfile { int xfile_create(const char *description, loff_t isize, struct xfile **xfilep); void xfile_destroy(struct xfile *xf); -ssize_t xfile_pread(struct xfile *xf, void *buf, size_t count, loff_t pos); -ssize_t xfile_pwrite(struct xfile *xf, const void *buf, size_t count, +int xfile_load(struct xfile *xf, void *buf, size_t count, loff_t pos); +int xfile_store(struct xfile *xf, const void *buf, size_t count, loff_t pos); -/* - * Load an object. Since we're treating this file as "memory", any error or - * short IO is treated as a failure to allocate memory. - */ -static inline int -xfile_obj_load(struct xfile *xf, void *buf, size_t count, loff_t pos) -{ - ssize_t ret = xfile_pread(xf, buf, count, pos); - - if (ret < 0 || ret != count) - return -ENOMEM; - return 0; -} - -/* - * Store an object. Since we're treating this file as "memory", any error or - * short IO is treated as a failure to allocate memory. - */ -static inline int -xfile_obj_store(struct xfile *xf, const void *buf, size_t count, loff_t pos) -{ - ssize_t ret = xfile_pwrite(xf, buf, count, pos); - - if (ret < 0 || ret != count) - return -ENOMEM; - return 0; -} - loff_t xfile_seek_data(struct xfile *xf, loff_t pos); int xfile_get_page(struct xfile *xf, loff_t offset, unsigned int len, -- cgit v1.2.3 From 0e2a24afb99258acc147f7fbe6c1e178a951247d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 19 Feb 2024 07:27:22 +0100 Subject: xfs: don't try to handle non-uptodate pages in xfile_obj_load shmem_read_mapping_page_gfp always returns an uptodate page or an ERR_PTR. Remove the code that tries to handle a non-uptodate page. Signed-off-by: Christoph Hellwig Reviewed-by: "Darrick J. Wong" Signed-off-by: Chandan Babu R --- fs/xfs/scrub/xfile.c | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/fs/xfs/scrub/xfile.c b/fs/xfs/scrub/xfile.c index c15cc8868880..4ec975977dcd 100644 --- a/fs/xfs/scrub/xfile.c +++ b/fs/xfs/scrub/xfile.c @@ -149,18 +149,14 @@ xfile_load( goto advance; } - if (PageUptodate(page)) { - /* - * xfile pages must never be mapped into userspace, so - * we skip the dcache flush.
- */ - kaddr = kmap_local_page(page); - p = kaddr + offset_in_page(pos); - memcpy(buf, p, len); - kunmap_local(kaddr); - } else { - memset(buf, 0, len); - } + /* + * xfile pages must never be mapped into userspace, so + * we skip the dcache flush. + */ + kaddr = kmap_local_page(page); + p = kaddr + offset_in_page(pos); + memcpy(buf, p, len); + kunmap_local(kaddr); put_page(page); advance: -- cgit v1.2.3 From e62e26acc9ab85e996eff660318109470eae2607 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 19 Feb 2024 07:27:23 +0100 Subject: xfs: don't allow highmem pages in xfile mappings XFS is generally used on 64-bit, non-highmem platforms and xfile mappings are accessed all the time. Reduce our pain by not allowing any highmem mappings in the xfile page cache and remove all the kmap calls for it. Signed-off-by: Christoph Hellwig Reviewed-by: "Darrick J. Wong" Signed-off-by: Chandan Babu R --- fs/xfs/scrub/xfarray.c | 3 +-- fs/xfs/scrub/xfile.c | 21 +++++++++------------ 2 files changed, 10 insertions(+), 14 deletions(-) diff --git a/fs/xfs/scrub/xfarray.c b/fs/xfs/scrub/xfarray.c index 95ac14bceead..d0f98a43b2ba 100644 --- a/fs/xfs/scrub/xfarray.c +++ b/fs/xfs/scrub/xfarray.c @@ -580,7 +580,7 @@ xfarray_sort_get_page( * xfile pages must never be mapped into userspace, so we skip the * dcache flush when mapping the page. */ - si->page_kaddr = kmap_local_page(si->xfpage.page); + si->page_kaddr = page_address(si->xfpage.page); return 0; } @@ -592,7 +592,6 @@ xfarray_sort_put_page( if (!si->page_kaddr) return 0; - kunmap_local(si->page_kaddr); si->page_kaddr = NULL; return xfile_put_page(si->array->xfile, &si->xfpage); diff --git a/fs/xfs/scrub/xfile.c b/fs/xfs/scrub/xfile.c index 4ec975977dcd..009a760cb690 100644 --- a/fs/xfs/scrub/xfile.c +++ b/fs/xfs/scrub/xfile.c @@ -77,6 +77,12 @@ xfile_create( inode = file_inode(xf->file); lockdep_set_class(&inode->i_rwsem, &xfile_i_mutex_key); + /* + * We don't want to bother with kmapping data during repair, so don't + * allow highmem pages to back this mapping. + */ + mapping_set_gfp_mask(inode->i_mapping, GFP_KERNEL); + trace_xfile_create(xf); *xfilep = xf; @@ -126,7 +132,6 @@ xfile_load( pflags = memalloc_nofs_save(); while (count > 0) { - void *p, *kaddr; unsigned int len; len = min_t(ssize_t, count, PAGE_SIZE - offset_in_page(pos)); @@ -153,10 +158,7 @@ xfile_load( * xfile pages must never be mapped into userspace, so * we skip the dcache flush. */ - kaddr = kmap_local_page(page); - p = kaddr + offset_in_page(pos); - memcpy(buf, p, len); - kunmap_local(kaddr); + memcpy(buf, page_address(page) + offset_in_page(pos), len); put_page(page); advance: @@ -221,14 +223,13 @@ xfile_store( * the dcache flush. If the page is not uptodate, zero it * before writing data. */ - kaddr = kmap_local_page(page); + kaddr = page_address(page); if (!PageUptodate(page)) { memset(kaddr, 0, PAGE_SIZE); SetPageUptodate(page); } p = kaddr + offset_in_page(pos); memcpy(p, buf, len); - kunmap_local(kaddr); ret = aops->write_end(NULL, mapping, pos, len, len, page, fsdata); @@ -314,11 +315,7 @@ xfile_get_page( * to the caller and make sure the backing store will hold on to them. 
*/ if (!PageUptodate(page)) { - void *kaddr; - - kaddr = kmap_local_page(page); - memset(kaddr, 0, PAGE_SIZE); - kunmap_local(kaddr); + memset(page_address(page), 0, PAGE_SIZE); SetPageUptodate(page); } -- cgit v1.2.3 From fd2634e2dd4539f96ab9e037f62ad2828f7a15eb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 19 Feb 2024 07:27:24 +0100 Subject: xfs: use shmem_get_folio in xfile_obj_store Switch to using shmem_get_folio and manually dirtying the page instead of abusing aops->write_begin and aops->write_end in xfile_store. This simplifies the code by avoiding indirect calls to not-actually-exported interfaces that don't really fit the use case very well, and happens to get us large folio support for free. Signed-off-by: Christoph Hellwig Reviewed-by: "Darrick J. Wong" Signed-off-by: Chandan Babu R --- fs/xfs/scrub/xfile.c | 73 +++++++++++++++++++--------------------------- 1 file changed, 27 insertions(+), 46 deletions(-) diff --git a/fs/xfs/scrub/xfile.c b/fs/xfs/scrub/xfile.c index 009a760cb690..a4480f4020ae 100644 --- a/fs/xfs/scrub/xfile.c +++ b/fs/xfs/scrub/xfile.c @@ -183,11 +183,7 @@ xfile_store( loff_t pos) { struct inode *inode = file_inode(xf->file); - struct address_space *mapping = inode->i_mapping; - const struct address_space_operations *aops = mapping->a_ops; - struct page *page = NULL; unsigned int pflags; - int error = 0; if (count > MAX_RW_COUNT) return -ENOMEM; @@ -196,60 +192,45 @@ xfile_store( trace_xfile_store(xf, pos, count); + /* + * Increase the file size first so that shmem_get_folio(..., SGP_CACHE) + * actually allocates a folio instead of erroring out. + */ + if (pos + count > i_size_read(inode)) + i_size_write(inode, pos + count); + pflags = memalloc_nofs_save(); while (count > 0) { - void *fsdata = NULL; - void *p, *kaddr; + struct folio *folio; unsigned int len; - int ret; + unsigned int offset; - len = min_t(ssize_t, count, PAGE_SIZE - offset_in_page(pos)); - - /* - * We call write_begin directly here to avoid all the freezer - * protection lock-taking that happens in the normal path. - * shmem doesn't support fs freeze, but lockdep doesn't know - * that and will trip over that. - */ - error = aops->write_begin(NULL, mapping, pos, len, &page, - &fsdata); - if (error) { - error = -ENOMEM; break; - } - - /* - * xfile pages must never be mapped into userspace, so we skip - * the dcache flush. If the page is not uptodate, zero it - * before writing data. - */ - kaddr = page_address(page); - if (!PageUptodate(page)) { - memset(kaddr, 0, PAGE_SIZE); - SetPageUptodate(page); - } - p = kaddr + offset_in_page(pos); - memcpy(p, buf, len); - - ret = aops->write_end(NULL, mapping, pos, len, len, page, - fsdata); - if (ret < 0) { - error = -ENOMEM; + if (shmem_get_folio(inode, pos >> PAGE_SHIFT, &folio, + SGP_CACHE) < 0) break; - } - - if (ret != len) { - error = -ENOMEM; - break; - } + if (filemap_check_wb_err(inode->i_mapping, 0)) { + folio_unlock(folio); + folio_put(folio); break; } + offset = offset_in_folio(folio, pos); + len = min_t(ssize_t, count, folio_size(folio) - offset); + memcpy(folio_address(folio) + offset, buf, len); + + folio_mark_dirty(folio); + folio_unlock(folio); + folio_put(folio); - count -= ret; - pos += ret; - buf += ret; + count -= len; + pos += len; + buf += len; } memalloc_nofs_restore(pflags); - return error; + if (count) + return -ENOMEM; + return 0; } /* Find the next written area in the xfile data for a given offset.
*/ -- cgit v1.2.3 From e97d70a57370817af59bb83f4219cd8aa63b81ed Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 19 Feb 2024 07:27:25 +0100 Subject: xfs: use shmem_get_folio in xfile_load Switch to using shmem_get_folio in xfile_load instead of using shmem_read_mapping_page_gfp. This gets us support for large folios and also optimized reading from unallocated space, as shmem_get_folio with SGP_READ won't allocate a page for them just to zero the content. Signed-off-by: Christoph Hellwig Reviewed-by: "Darrick J. Wong" Signed-off-by: Chandan Babu R --- fs/xfs/scrub/xfile.c | 61 ++++++++++++++++++++------------------------------ 1 file changed, 26 insertions(+), 35 deletions(-) diff --git a/fs/xfs/scrub/xfile.c b/fs/xfs/scrub/xfile.c index a4480f4020ae..2229f0b7f9ca 100644 --- a/fs/xfs/scrub/xfile.c +++ b/fs/xfs/scrub/xfile.c @@ -34,13 +34,6 @@ * xfiles assume that the caller will handle all required concurrency * management; standard vfs locks (freezer and inode) are not taken. Reads * and writes are satisfied directly from the page cache. - * - * NOTE: The current shmemfs implementation has a quirk that in-kernel reads - * of a hole cause a page to be mapped into the file. If you are going to - * create a sparse xfile, please be careful about reading from uninitialized - * parts of the file. These pages are !Uptodate and will eventually be - * reclaimed if not written, but in the short term this boosts memory - * consumption. */ /* @@ -118,10 +111,7 @@ xfile_load( loff_t pos) { struct inode *inode = file_inode(xf->file); - struct address_space *mapping = inode->i_mapping; - struct page *page = NULL; unsigned int pflags; - int error = 0; if (count > MAX_RW_COUNT) return -ENOMEM; @@ -132,43 +122,44 @@ xfile_load( pflags = memalloc_nofs_save(); while (count > 0) { + struct folio *folio; unsigned int len; + unsigned int offset; - len = min_t(ssize_t, count, PAGE_SIZE - offset_in_page(pos)); - - /* - * In-kernel reads of a shmem file cause it to allocate a page - * if the mapping shows a hole. Therefore, if we hit ENOMEM - * we can continue by zeroing the caller's buffer. - */ - page = shmem_read_mapping_page_gfp(mapping, pos >> PAGE_SHIFT, - __GFP_NOWARN); - if (IS_ERR(page)) { - error = PTR_ERR(page); - if (error != -ENOMEM) { - error = -ENOMEM; break; + if (shmem_get_folio(inode, pos >> PAGE_SHIFT, &folio, + SGP_READ) < 0) + break; + if (!folio) { + /* + * No data stored at this offset, just zero the output + * buffer until the next page boundary. + */ + len = min_t(ssize_t, count, + PAGE_SIZE - offset_in_page(pos)); + memset(buf, 0, len); + } else { + if (filemap_check_wb_err(inode->i_mapping, 0)) { + folio_unlock(folio); + folio_put(folio); break; } - memset(buf, 0, len); - goto advance; - } - - /* - * xfile pages must never be mapped into userspace, so - * we skip the dcache flush. - */ - memcpy(buf, page_address(page) + offset_in_page(pos), len); - put_page(page); + offset = offset_in_folio(folio, pos); + len = min_t(ssize_t, count, folio_size(folio) - offset); + memcpy(buf, folio_address(folio) + offset, len); -advance: + folio_unlock(folio); + folio_put(folio); + } count -= len; pos += len; buf += len; } memalloc_nofs_restore(pflags); - return error; + if (count) + return -ENOMEM; + return 0; } /* -- cgit v1.2.3 From 6907e3c00a4023d2456d17ba156d1e03aec4f185 Mon Sep 17 00:00:00 2001 From: "Darrick J.
Wong" Date: Mon, 19 Feb 2024 07:27:26 +0100 Subject: xfs: add file_{get,put}_folio Add helper similar to file_{get,set}_page, but which deal with folios and don't allocate new folio unless explicitly asked to, which map to shmem_get_folio instead of calling into the aops. Signed-off-by: "Darrick J. Wong" Signed-off-by: Christoph Hellwig Reviewed-by: Kent Overstreet Signed-off-by: Chandan Babu R --- fs/xfs/scrub/trace.h | 2 ++ fs/xfs/scrub/xfile.c | 74 ++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/xfs/scrub/xfile.h | 7 +++++ 3 files changed, 83 insertions(+) diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index 0327cab606b0..c61fa7a95ef5 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -908,6 +908,8 @@ DEFINE_XFILE_EVENT(xfile_store); DEFINE_XFILE_EVENT(xfile_seek_data); DEFINE_XFILE_EVENT(xfile_get_page); DEFINE_XFILE_EVENT(xfile_put_page); +DEFINE_XFILE_EVENT(xfile_get_folio); +DEFINE_XFILE_EVENT(xfile_put_folio); TRACE_EVENT(xfarray_create, TP_PROTO(struct xfarray *xfa, unsigned long long required_capacity), diff --git a/fs/xfs/scrub/xfile.c b/fs/xfs/scrub/xfile.c index 2229f0b7f9ca..0cab9d529365 100644 --- a/fs/xfs/scrub/xfile.c +++ b/fs/xfs/scrub/xfile.c @@ -340,3 +340,77 @@ xfile_put_page( return -EIO; return 0; } + +/* + * Grab the (locked) folio for a memory object. The object cannot span a folio + * boundary. Returns the locked folio if successful, NULL if there was no + * folio or it didn't cover the range requested, or an ERR_PTR on failure. + */ +struct folio * +xfile_get_folio( + struct xfile *xf, + loff_t pos, + size_t len, + unsigned int flags) +{ + struct inode *inode = file_inode(xf->file); + struct folio *folio = NULL; + unsigned int pflags; + int error; + + if (inode->i_sb->s_maxbytes - pos < len) + return ERR_PTR(-ENOMEM); + + trace_xfile_get_folio(xf, pos, len); + + /* + * Increase the file size first so that shmem_get_folio(..., SGP_CACHE), + * actually allocates a folio instead of erroring out. + */ + if ((flags & XFILE_ALLOC) && pos + len > i_size_read(inode)) + i_size_write(inode, pos + len); + + pflags = memalloc_nofs_save(); + error = shmem_get_folio(inode, pos >> PAGE_SHIFT, &folio, + (flags & XFILE_ALLOC) ? SGP_CACHE : SGP_READ); + memalloc_nofs_restore(pflags); + if (error) + return ERR_PTR(error); + + if (!folio) + return NULL; + + if (len > folio_size(folio) - offset_in_folio(folio, pos)) { + folio_unlock(folio); + folio_put(folio); + return NULL; + } + + if (filemap_check_wb_err(inode->i_mapping, 0)) { + folio_unlock(folio); + folio_put(folio); + return ERR_PTR(-EIO); + } + + /* + * Mark the folio dirty so that it won't be reclaimed once we drop the + * (potentially last) reference in xfile_put_folio. + */ + if (flags & XFILE_ALLOC) + folio_set_dirty(folio); + return folio; +} + +/* + * Release the (locked) folio for a memory object. 
+ */ +void +xfile_put_folio( + struct xfile *xf, + struct folio *folio) +{ + trace_xfile_put_folio(xf, folio_pos(folio), folio_size(folio)); + + folio_unlock(folio); + folio_put(folio); +} diff --git a/fs/xfs/scrub/xfile.h b/fs/xfs/scrub/xfile.h index 465b10f492b6..afb75e9fbaf2 100644 --- a/fs/xfs/scrub/xfile.h +++ b/fs/xfs/scrub/xfile.h @@ -39,4 +39,11 @@ int xfile_get_page(struct xfile *xf, loff_t offset, unsigned int len, struct xfile_page *xbuf); int xfile_put_page(struct xfile *xf, struct xfile_page *xbuf); +#define XFILE_MAX_FOLIO_SIZE (PAGE_SIZE << MAX_PAGECACHE_ORDER) + +#define XFILE_ALLOC (1 << 0) /* allocate folio if not present */ +struct folio *xfile_get_folio(struct xfile *xf, loff_t offset, size_t len, + unsigned int flags); +void xfile_put_folio(struct xfile *xf, struct folio *folio); + #endif /* __XFS_SCRUB_XFILE_H__ */ -- cgit v1.2.3 From fd3d46e630404202c1d577481fafd2c8993874f3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 19 Feb 2024 07:27:27 +0100 Subject: xfs: remove xfarray_sortinfo.page_kaddr Now that xfile pages don't need kmapping, there is no need to cache the kernel virtual address for them. Signed-off-by: Christoph Hellwig Reviewed-by: "Darrick J. Wong" Signed-off-by: Chandan Babu R --- fs/xfs/scrub/xfarray.c | 22 ++++------------------ fs/xfs/scrub/xfarray.h | 1 - 2 files changed, 4 insertions(+), 19 deletions(-) diff --git a/fs/xfs/scrub/xfarray.c b/fs/xfs/scrub/xfarray.c index d0f98a43b2ba..82b2a35a8e86 100644 --- a/fs/xfs/scrub/xfarray.c +++ b/fs/xfs/scrub/xfarray.c @@ -570,18 +570,7 @@ xfarray_sort_get_page( loff_t pos, uint64_t len) { - int error; - - error = xfile_get_page(si->array->xfile, pos, len, &si->xfpage); - if (error) - return error; - - /* - * xfile pages must never be mapped into userspace, so we skip the - * dcache flush when mapping the page. - */ - si->page_kaddr = page_address(si->xfpage.page); - return 0; + return xfile_get_page(si->array->xfile, pos, len, &si->xfpage); } /* Release a page we grabbed for sorting records. */ @@ -589,11 +578,8 @@ static inline int xfarray_sort_put_page( struct xfarray_sortinfo *si) { - if (!si->page_kaddr) + if (!xfile_page_cached(&si->xfpage)) return 0; - - si->page_kaddr = NULL; - return xfile_put_page(si->array->xfile, &si->xfpage); } @@ -636,7 +622,7 @@ xfarray_pagesort( return error; xfarray_sort_bump_heapsorts(si); - startp = si->page_kaddr + offset_in_page(lo_pos); + startp = page_address(si->xfpage.page) + offset_in_page(lo_pos); sort(startp, hi - lo + 1, si->array->obj_size, si->cmp_fn, NULL); xfarray_sort_bump_stores(si); @@ -883,7 +869,7 @@ xfarray_sort_load_cached( return error; } - memcpy(ptr, si->page_kaddr + offset_in_page(idx_pos), + memcpy(ptr, page_address(si->xfpage.page) + offset_in_page(idx_pos), si->array->obj_size); return 0; } diff --git a/fs/xfs/scrub/xfarray.h b/fs/xfs/scrub/xfarray.h index 62b9c506fdd1..6f2862054e19 100644 --- a/fs/xfs/scrub/xfarray.h +++ b/fs/xfs/scrub/xfarray.h @@ -107,7 +107,6 @@ struct xfarray_sortinfo { /* Cache a page here for faster access. */ struct xfile_page xfpage; - void *page_kaddr; #ifdef DEBUG /* Performance statistics. */ -- cgit v1.2.3 From b2fdfe19dfd70ba047ab720eb52f00651ef8a5cd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 19 Feb 2024 07:27:28 +0100 Subject: xfs: fix a comment in xfarray.c xfiles are shmem files, not memfds. Signed-off-by: Christoph Hellwig Reviewed-by: "Darrick J. 
Wong" Signed-off-by: Chandan Babu R --- fs/xfs/scrub/xfarray.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/scrub/xfarray.c b/fs/xfs/scrub/xfarray.c index 82b2a35a8e86..379e1db22269 100644 --- a/fs/xfs/scrub/xfarray.c +++ b/fs/xfs/scrub/xfarray.c @@ -16,7 +16,7 @@ * Large Arrays of Fixed-Size Records * ================================== * - * This memory array uses an xfile (which itself is a memfd "file") to store + * This memory array uses an xfile (which itself is a shmem file) to store * large numbers of fixed-size records in memory that can be paged out. This * puts less stress on the memory reclaim algorithms during an online repair * because we don't have to pin so much memory. However, array access is less -- cgit v1.2.3 From ee13fc67205b989c93c89a4800a4b1c84e448dde Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 19 Feb 2024 07:27:29 +0100 Subject: xfs: convert xfarray_pagesort to deal with large folios Convert xfarray_pagesort to handle large folios by introducing a new xfile_get_folio routine that can return a folio of arbitrary size, and using heapsort on the full folio. This also corrects an off-by-one bug in the calculation of len in xfarray_pagesort that was papered over by xfarray_want_pagesort. Signed-off-by: "Darrick J. Wong" Signed-off-by: Christoph Hellwig Reviewed-by: Kent Overstreet Signed-off-by: Chandan Babu R --- fs/xfs/scrub/trace.h | 43 ++++++++++- fs/xfs/scrub/xfarray.c | 201 +++++++++++++++++++++++-------------------------- fs/xfs/scrub/xfarray.h | 10 ++- 3 files changed, 143 insertions(+), 111 deletions(-) diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index c61fa7a95ef5..3a1a827828dc 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -956,7 +956,7 @@ TRACE_EVENT(xfarray_isort, __entry->hi - __entry->lo) ); -TRACE_EVENT(xfarray_pagesort, +TRACE_EVENT(xfarray_foliosort, TP_PROTO(struct xfarray_sortinfo *si, uint64_t lo, uint64_t hi), TP_ARGS(si, lo, hi), TP_STRUCT__entry( @@ -1027,6 +1027,47 @@ TRACE_EVENT(xfarray_sort, __entry->bytes) ); +TRACE_EVENT(xfarray_sort_scan, + TP_PROTO(struct xfarray_sortinfo *si, unsigned long long idx), + TP_ARGS(si, idx), + TP_STRUCT__entry( + __field(unsigned long, ino) + __field(unsigned long long, nr) + __field(size_t, obj_size) + __field(unsigned long long, idx) + __field(unsigned long long, folio_pos) + __field(unsigned long, folio_bytes) + __field(unsigned long long, first_idx) + __field(unsigned long long, last_idx) + ), + TP_fast_assign( + __entry->nr = si->array->nr; + __entry->obj_size = si->array->obj_size; + __entry->ino = file_inode(si->array->xfile->file)->i_ino; + __entry->idx = idx; + if (si->folio) { + __entry->folio_pos = folio_pos(si->folio); + __entry->folio_bytes = folio_size(si->folio); + __entry->first_idx = si->first_folio_idx; + __entry->last_idx = si->last_folio_idx; + } else { + __entry->folio_pos = 0; + __entry->folio_bytes = 0; + __entry->first_idx = 0; + __entry->last_idx = 0; + } + ), + TP_printk("xfino 0x%lx nr %llu objsz %zu idx %llu folio_pos 0x%llx folio_bytes 0x%lx first_idx %llu last_idx %llu", + __entry->ino, + __entry->nr, + __entry->obj_size, + __entry->idx, + __entry->folio_pos, + __entry->folio_bytes, + __entry->first_idx, + __entry->last_idx) +); + TRACE_EVENT(xfarray_sort_stats, TP_PROTO(struct xfarray_sortinfo *si, int error), TP_ARGS(si, error), diff --git a/fs/xfs/scrub/xfarray.c b/fs/xfs/scrub/xfarray.c index 379e1db22269..17c982a4821d 100644 --- a/fs/xfs/scrub/xfarray.c +++ b/fs/xfs/scrub/xfarray.c @@ -563,70 +563,42 @@ 
xfarray_isort( return xfile_store(si->array->xfile, scratch, len, lo_pos); } -/* Grab a page for sorting records. */ -static inline int -xfarray_sort_get_page( - struct xfarray_sortinfo *si, - loff_t pos, - uint64_t len) -{ - return xfile_get_page(si->array->xfile, pos, len, &si->xfpage); -} - -/* Release a page we grabbed for sorting records. */ -static inline int -xfarray_sort_put_page( - struct xfarray_sortinfo *si) -{ - if (!xfile_page_cached(&si->xfpage)) - return 0; - return xfile_put_page(si->array->xfile, &si->xfpage); -} - -/* Decide if these records are eligible for in-page sorting. */ -static inline bool -xfarray_want_pagesort( - struct xfarray_sortinfo *si, - xfarray_idx_t lo, - xfarray_idx_t hi) -{ - pgoff_t lo_page; - pgoff_t hi_page; - loff_t end_pos; - - /* We can only map one page at a time. */ - lo_page = xfarray_pos(si->array, lo) >> PAGE_SHIFT; - end_pos = xfarray_pos(si->array, hi) + si->array->obj_size - 1; - hi_page = end_pos >> PAGE_SHIFT; - - return lo_page == hi_page; -} - -/* Sort a bunch of records that all live in the same memory page. */ +/* + * Sort the records from lo to hi (inclusive) if they are all backed by the + * same memory folio. Returns 1 if it sorted, 0 if it did not, or a negative + * errno. + */ STATIC int -xfarray_pagesort( +xfarray_foliosort( struct xfarray_sortinfo *si, xfarray_idx_t lo, xfarray_idx_t hi) { + struct folio *folio; void *startp; loff_t lo_pos = xfarray_pos(si->array, lo); - uint64_t len = xfarray_pos(si->array, hi - lo); - int error = 0; + uint64_t len = xfarray_pos(si->array, hi - lo + 1); - trace_xfarray_pagesort(si, lo, hi); + /* No single folio could back this many records. */ + if (len > XFILE_MAX_FOLIO_SIZE) + return 0; xfarray_sort_bump_loads(si); - error = xfarray_sort_get_page(si, lo_pos, len); - if (error) - return error; + folio = xfile_get_folio(si->array->xfile, lo_pos, len, XFILE_ALLOC); + if (IS_ERR(folio)) + return PTR_ERR(folio); + if (!folio) + return 0; + + trace_xfarray_foliosort(si, lo, hi); xfarray_sort_bump_heapsorts(si); - startp = page_address(si->xfpage.page) + offset_in_page(lo_pos); + startp = folio_address(folio) + offset_in_folio(folio, lo_pos); sort(startp, hi - lo + 1, si->array->obj_size, si->cmp_fn, NULL); xfarray_sort_bump_stores(si); - return xfarray_sort_put_page(si); + xfile_put_folio(si->array->xfile, folio); + return 1; } /* Return a pointer to the xfarray pivot record within the sortinfo struct. */ @@ -814,63 +786,78 @@ xfarray_qsort_push( return 0; } +static inline void +xfarray_sort_scan_done( + struct xfarray_sortinfo *si) +{ + if (si->folio) + xfile_put_folio(si->array->xfile, si->folio); + si->folio = NULL; +} + /* - * Load an element from the array into the first scratchpad and cache the page, - * if possible. + * Cache the folio backing the start of the given array element. If the array + * element is contained entirely within the folio, return a pointer to the + * cached folio. Otherwise, load the element into the scratchpad and return a + * pointer to the scratchpad. */ static inline int -xfarray_sort_load_cached( +xfarray_sort_scan( struct xfarray_sortinfo *si, xfarray_idx_t idx, - void *ptr) + void **ptrp) { loff_t idx_pos = xfarray_pos(si->array, idx); - pgoff_t startpage; - pgoff_t endpage; int error = 0; - /* - * If this load would split a page, release the cached page, if any, - * and perform a traditional read. 
- */ - startpage = idx_pos >> PAGE_SHIFT; - endpage = (idx_pos + si->array->obj_size - 1) >> PAGE_SHIFT; - if (startpage != endpage) { - error = xfarray_sort_put_page(si); - if (error) - return error; + if (xfarray_sort_terminated(si, &error)) + return error; - if (xfarray_sort_terminated(si, &error)) - return error; + trace_xfarray_sort_scan(si, idx); - return xfile_load(si->array->xfile, ptr, - si->array->obj_size, idx_pos); - } + /* If the cached folio doesn't cover this index, release it. */ + if (si->folio && + (idx < si->first_folio_idx || idx > si->last_folio_idx)) + xfarray_sort_scan_done(si); - /* If the cached page is not the one we want, release it. */ - if (xfile_page_cached(&si->xfpage) && - xfile_page_index(&si->xfpage) != startpage) { - error = xfarray_sort_put_page(si); - if (error) - return error; + /* Grab the first folio that backs this array element. */ + if (!si->folio) { + loff_t next_pos; + + si->folio = xfile_get_folio(si->array->xfile, idx_pos, + si->array->obj_size, XFILE_ALLOC); + if (IS_ERR(si->folio)) + return PTR_ERR(si->folio); + + si->first_folio_idx = xfarray_idx(si->array, + folio_pos(si->folio) + si->array->obj_size - 1); + + next_pos = folio_pos(si->folio) + folio_size(si->folio); + si->last_folio_idx = xfarray_idx(si->array, next_pos - 1); + if (xfarray_pos(si->array, si->last_folio_idx + 1) > next_pos) + si->last_folio_idx--; + + trace_xfarray_sort_scan(si, idx); } /* - * If we don't have a cached page (and we know the load is contained - * in a single page) then grab it. + * If this folio still doesn't cover the desired element, it must cross + * a folio boundary. Read into the scratchpad and we're done. */ - if (!xfile_page_cached(&si->xfpage)) { - if (xfarray_sort_terminated(si, &error)) - return error; + if (idx < si->first_folio_idx || idx > si->last_folio_idx) { + void *temp = xfarray_scratch(si->array); - error = xfarray_sort_get_page(si, startpage << PAGE_SHIFT, - PAGE_SIZE); + error = xfile_load(si->array->xfile, temp, si->array->obj_size, + idx_pos); if (error) return error; + + *ptrp = temp; + return 0; } - memcpy(ptr, page_address(si->xfpage.page) + offset_in_page(idx_pos), - si->array->obj_size); + /* Otherwise return a pointer to the array element in the folio. */ + *ptrp = folio_address(si->folio) + offset_in_folio(si->folio, idx_pos); return 0; } @@ -937,6 +924,8 @@ xfarray_sort( pivot = xfarray_sortinfo_pivot(si); while (si->stack_depth >= 0) { + int ret; + lo = si_lo[si->stack_depth]; hi = si_hi[si->stack_depth]; @@ -949,13 +938,13 @@ xfarray_sort( } /* - * If directly mapping the page and sorting can solve our + * If directly mapping the folio and sorting can solve our * problems, we're done. */ - if (xfarray_want_pagesort(si, lo, hi)) { - error = xfarray_pagesort(si, lo, hi); - if (error) - goto out_free; + ret = xfarray_foliosort(si, lo, hi); + if (ret < 0) + goto out_free; + if (ret == 1) { si->stack_depth--; continue; } @@ -980,25 +969,24 @@ xfarray_sort( * than the pivot is on the right side of the range. */ while (lo < hi) { + void *p; + /* * Decrement hi until it finds an a[hi] less than the * pivot value. 
*/ - error = xfarray_sort_load_cached(si, hi, scratch); + error = xfarray_sort_scan(si, hi, &p); if (error) goto out_free; - while (xfarray_sort_cmp(si, scratch, pivot) >= 0 && - lo < hi) { + while (xfarray_sort_cmp(si, p, pivot) >= 0 && lo < hi) { hi--; - error = xfarray_sort_load_cached(si, hi, - scratch); + error = xfarray_sort_scan(si, hi, &p); if (error) goto out_free; } - error = xfarray_sort_put_page(si); - if (error) - goto out_free; - + if (p != scratch) + memcpy(scratch, p, si->array->obj_size); + xfarray_sort_scan_done(si); if (xfarray_sort_terminated(si, &error)) goto out_free; @@ -1013,21 +1001,18 @@ xfarray_sort( * Increment lo until it finds an a[lo] greater than * the pivot value. */ - error = xfarray_sort_load_cached(si, lo, scratch); + error = xfarray_sort_scan(si, lo, &p); if (error) goto out_free; - while (xfarray_sort_cmp(si, scratch, pivot) <= 0 && - lo < hi) { + while (xfarray_sort_cmp(si, p, pivot) <= 0 && lo < hi) { lo++; - error = xfarray_sort_load_cached(si, lo, - scratch); + error = xfarray_sort_scan(si, lo, &p); if (error) goto out_free; } - error = xfarray_sort_put_page(si); - if (error) - goto out_free; - + if (p != scratch) + memcpy(scratch, p, si->array->obj_size); + xfarray_sort_scan_done(si); if (xfarray_sort_terminated(si, &error)) goto out_free; diff --git a/fs/xfs/scrub/xfarray.h b/fs/xfs/scrub/xfarray.h index 6f2862054e19..ec643cc9fc14 100644 --- a/fs/xfs/scrub/xfarray.h +++ b/fs/xfs/scrub/xfarray.h @@ -105,8 +105,14 @@ struct xfarray_sortinfo { /* XFARRAY_SORT_* flags; see below. */ unsigned int flags; - /* Cache a page here for faster access. */ - struct xfile_page xfpage; + /* Cache a folio here for faster scanning for pivots */ + struct folio *folio; + + /* First array index in folio that is completely readable */ + xfarray_idx_t first_folio_idx; + + /* Last array index in folio that is completely readable */ + xfarray_idx_t last_folio_idx; #ifdef DEBUG /* Performance statistics. */ -- cgit v1.2.3 From e5a2f47cff812c01018652f11d5e861e2a6b462b Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 19 Feb 2024 07:27:30 +0100 Subject: xfs: remove xfile_{get,put}_page These functions aren't used anymore, so get rid of them. Signed-off-by: "Darrick J. Wong" Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Chandan Babu R --- .../filesystems/xfs/xfs-online-fsck-design.rst | 2 +- fs/xfs/scrub/trace.h | 2 - fs/xfs/scrub/xfile.c | 104 --------------------- fs/xfs/scrub/xfile.h | 20 ---- 4 files changed, 1 insertion(+), 127 deletions(-) diff --git a/Documentation/filesystems/xfs/xfs-online-fsck-design.rst b/Documentation/filesystems/xfs/xfs-online-fsck-design.rst index 216c99ce511f..d03480266e9b 100644 --- a/Documentation/filesystems/xfs/xfs-online-fsck-design.rst +++ b/Documentation/filesystems/xfs/xfs-online-fsck-design.rst @@ -1940,7 +1940,7 @@ mapping it into kernel address space, and dropping the folio lock. These long term users *must* be responsive to memory reclaim by hooking into the shrinker infrastructure to know when to release folios. -The ``xfile_get_page`` and ``xfile_put_page`` functions are provided to +The ``xfile_get_folio`` and ``xfile_put_folio`` functions are provided to retrieve the (locked) folio that backs part of an xfile and to release it. 
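A lease of this kind follows a simple pattern, sketched below with error handling simplified; ``xf``, ``pos``, and ``len`` are assumed to describe an object that fits inside a single folio, and ``XFILE_ALLOC`` asks for backing storage to be allocated if it is not already present::

    struct folio *folio;

    /* Grab and lock the folio backing the object. */
    folio = xfile_get_folio(xf, pos, len, XFILE_ALLOC);
    if (IS_ERR(folio))
            return PTR_ERR(folio);
    if (!folio)
            return -ENODATA;   /* no single folio covers the range */

    /* Operate on the object directly while the folio lock is held. */
    memset(folio_address(folio) + offset_in_folio(folio, pos), 0, len);

    /* Unlock and drop the reference. */
    xfile_put_folio(xf, folio);

This is the same discipline that ``xfarray_foliosort`` uses to sort a run of records in place.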
The only code to use these folio lease functions are the xfarray :ref:`sorting` algorithms and the :ref:`in-memory diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index 3a1a827828dc..ae6b2385a8cb 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -906,8 +906,6 @@ DEFINE_EVENT(xfile_class, name, \ DEFINE_XFILE_EVENT(xfile_load); DEFINE_XFILE_EVENT(xfile_store); DEFINE_XFILE_EVENT(xfile_seek_data); -DEFINE_XFILE_EVENT(xfile_get_page); -DEFINE_XFILE_EVENT(xfile_put_page); DEFINE_XFILE_EVENT(xfile_get_folio); DEFINE_XFILE_EVENT(xfile_put_folio); diff --git a/fs/xfs/scrub/xfile.c b/fs/xfs/scrub/xfile.c index 0cab9d529365..8cdd863db585 100644 --- a/fs/xfs/scrub/xfile.c +++ b/fs/xfs/scrub/xfile.c @@ -237,110 +237,6 @@ xfile_seek_data( return ret; } -/* - * Grab the (locked) page for a memory object. The object cannot span a page - * boundary. Returns 0 (and a locked page) if successful, -ENOTBLK if we - * cannot grab the page, or the usual negative errno. - */ -int -xfile_get_page( - struct xfile *xf, - loff_t pos, - unsigned int len, - struct xfile_page *xfpage) -{ - struct inode *inode = file_inode(xf->file); - struct address_space *mapping = inode->i_mapping; - const struct address_space_operations *aops = mapping->a_ops; - struct page *page = NULL; - void *fsdata = NULL; - loff_t key = round_down(pos, PAGE_SIZE); - unsigned int pflags; - int error; - - if (inode->i_sb->s_maxbytes - pos < len) - return -ENOMEM; - if (len > PAGE_SIZE - offset_in_page(pos)) - return -ENOTBLK; - - trace_xfile_get_page(xf, pos, len); - - pflags = memalloc_nofs_save(); - - /* - * We call write_begin directly here to avoid all the freezer - * protection lock-taking that happens in the normal path. shmem - * doesn't support fs freeze, but lockdep doesn't know that and will - * trip over that. - */ - error = aops->write_begin(NULL, mapping, key, PAGE_SIZE, &page, - &fsdata); - if (error) - goto out_pflags; - - /* We got the page, so make sure we push out EOF. */ - if (i_size_read(inode) < pos + len) - i_size_write(inode, pos + len); - - /* - * If the page isn't up to date, fill it with zeroes before we hand it - * to the caller and make sure the backing store will hold on to them. - */ - if (!PageUptodate(page)) { - memset(page_address(page), 0, PAGE_SIZE); - SetPageUptodate(page); - } - - /* - * Mark each page dirty so that the contents are written to some - * backing store when we drop this buffer, and take an extra reference - * to prevent the xfile page from being swapped or removed from the - * page cache by reclaim if the caller unlocks the page. - */ - set_page_dirty(page); - get_page(page); - - xfpage->page = page; - xfpage->fsdata = fsdata; - xfpage->pos = key; -out_pflags: - memalloc_nofs_restore(pflags); - return error; -} - -/* - * Release the (locked) page for a memory object. Returns 0 or a negative - * errno. - */ -int -xfile_put_page( - struct xfile *xf, - struct xfile_page *xfpage) -{ - struct inode *inode = file_inode(xf->file); - struct address_space *mapping = inode->i_mapping; - const struct address_space_operations *aops = mapping->a_ops; - unsigned int pflags; - int ret; - - trace_xfile_put_page(xf, xfpage->pos, PAGE_SIZE); - - /* Give back the reference that we took in xfile_get_page. 
*/ - put_page(xfpage->page); - - pflags = memalloc_nofs_save(); - ret = aops->write_end(NULL, mapping, xfpage->pos, PAGE_SIZE, PAGE_SIZE, - xfpage->page, xfpage->fsdata); - memalloc_nofs_restore(pflags); - memset(xfpage, 0, sizeof(struct xfile_page)); - - if (ret < 0) - return ret; - if (ret != PAGE_SIZE) - return -EIO; - return 0; -} - /* * Grab the (locked) folio for a memory object. The object cannot span a folio * boundary. Returns the locked folio if successful, NULL if there was no diff --git a/fs/xfs/scrub/xfile.h b/fs/xfs/scrub/xfile.h index afb75e9fbaf2..76d78dba7e34 100644 --- a/fs/xfs/scrub/xfile.h +++ b/fs/xfs/scrub/xfile.h @@ -6,22 +6,6 @@ #ifndef __XFS_SCRUB_XFILE_H__ #define __XFS_SCRUB_XFILE_H__ -struct xfile_page { - struct page *page; - void *fsdata; - loff_t pos; -}; - -static inline bool xfile_page_cached(const struct xfile_page *xfpage) -{ - return xfpage->page != NULL; -} - -static inline pgoff_t xfile_page_index(const struct xfile_page *xfpage) -{ - return xfpage->page->index; -} - struct xfile { struct file *file; }; @@ -35,10 +19,6 @@ int xfile_store(struct xfile *xf, const void *buf, size_t count, loff_t xfile_seek_data(struct xfile *xf, loff_t pos); -int xfile_get_page(struct xfile *xf, loff_t offset, unsigned int len, - struct xfile_page *xbuf); -int xfile_put_page(struct xfile *xf, struct xfile_page *xbuf); - #define XFILE_MAX_FOLIO_SIZE (PAGE_SIZE << MAX_PAGECACHE_ORDER) #define XFILE_ALLOC (1 << 0) /* allocate folio if not present */ -- cgit v1.2.3 From 4b2f459d86252619448455013f581836c8b1b7da Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 21 Feb 2024 09:49:28 +1100 Subject: xfs: fix SEEK_HOLE/DATA for regions with active COW extents A data corruption problem was reported by CoreOS image builders when using reflink based disk image copies and then converting them to qcow2 images. The converted images failed the conversion verification step, and it was isolated down to the fact that qemu-img uses SEEK_HOLE/SEEK_DATA to find the data it is supposed to copy. The reproducer allowed me to isolate the issue down to a region of the file that had overlapping data and COW fork extents, and the problem was that the COW fork extent was being reported in its entirety by xfs_seek_iomap_begin() and so was skipping over the real data fork extents in that range. This was somewhat hidden by the fact that 'xfs_bmap -vvp' reported all the extents correctly, and reading the file completely (i.e. not using seek to skip holes) would map the file correctly and all the correct data extents are read. Hence the problem is isolated to just the xfs_seek_iomap_begin() implementation. Instrumentation with trace_printk made the problem obvious: we are passing the wrong length to xfs_trim_extent() in xfs_seek_iomap_begin(). We are passing the end_fsb, not the maximum length of the extent we want to trim the map to. Hence the COW extent map never gets trimmed to the start of the next data fork extent, and so the seek code treats the entire COW fork extent as unwritten and skips entirely over the data fork extents in that range. Link: https://github.com/coreos/coreos-assembler/issues/3728 Fixes: 60271ab79d40 ("xfs: fix SEEK_DATA for speculative COW fork preallocation") Signed-off-by: Dave Chinner Reviewed-by: "Darrick J.
Wong" Reviewed-by: Christoph Hellwig Signed-off-by: Chandan Babu R --- fs/xfs/xfs_iomap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 18c8f168b153..055cdec2e9ad 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -1323,7 +1323,7 @@ xfs_seek_iomap_begin( if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) { if (data_fsb < cow_fsb + cmap.br_blockcount) end_fsb = min(end_fsb, data_fsb); - xfs_trim_extent(&cmap, offset_fsb, end_fsb); + xfs_trim_extent(&cmap, offset_fsb, end_fsb - offset_fsb); seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED); error = xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED, seq); @@ -1348,7 +1348,7 @@ xfs_seek_iomap_begin( imap.br_state = XFS_EXT_NORM; done: seq = xfs_iomap_inode_sequence(ip, 0); - xfs_trim_extent(&imap, offset_fsb, end_fsb); + xfs_trim_extent(&imap, offset_fsb, end_fsb - offset_fsb); error = xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0, seq); out_unlock: xfs_iunlock(ip, lockmode); -- cgit v1.2.3 From ae05eb117108428895e75a30e7fe3dbf1016cf93 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 22 Feb 2024 12:30:44 -0800 Subject: xfs: speed up xfs_iwalk_adjust_start a little bit Replace the open-coded loop that recomputes freecount with a single call to a bit weight function. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_iwalk.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c index 6d2eb6364867..c1e9c7bcb6a9 100644 --- a/fs/xfs/xfs_iwalk.c +++ b/fs/xfs/xfs_iwalk.c @@ -22,6 +22,7 @@ #include "xfs_trans.h" #include "xfs_pwork.h" #include "xfs_ag.h" +#include "xfs_bit.h" /* * Walking Inodes in the Filesystem @@ -131,21 +132,11 @@ xfs_iwalk_adjust_start( struct xfs_inobt_rec_incore *irec) /* btree record */ { int idx; /* index into inode chunk */ - int i; idx = agino - irec->ir_startino; - /* - * We got a right chunk with some left inodes allocated at it. Grab - * the chunk record. Mark all the uninteresting inodes free because - * they're before our start point. - */ - for (i = 0; i < idx; i++) { - if (XFS_INOBT_MASK(i) & ~irec->ir_free) - irec->ir_freecount++; - } - irec->ir_free |= xfs_inobt_maskn(0, idx); + irec->ir_freecount = hweight64(irec->ir_free); } /* Allocate memory for a walk. */ -- cgit v1.2.3 From 8660c7b74aeab67210064a8dc756960b2adfd613 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 22 Feb 2024 12:30:45 -0800 Subject: xfs: implement live inode scan for scrub This patch implements a live file scanner for online fsck functions that require the ability to walk a filesystem to gather metadata records and stay informed about metadata changes to files that have already been visited. The iscan structure consists of two inode number cursors: one to track which inode we want to visit next, and a second one to track which inodes have already been visited. This second cursor is key to capturing live updates to files previously scanned while the main thread continues scanning -- any inode greater than this value hasn't been scanned and can go on its way; any other update must be incorporated into the collected data. It is critical for the scanning thraad to hold exclusive access on the inode until after marking the inode visited. This new code is a separate patch from the patchsets adding callers for the sake of enabling the author to move patches around his tree with ease. 
The intended usage model for this code is roughly: xchk_iscan_start(sc, 0, 0, iscan); while ((error = xchk_iscan_iter(iscan, &ip)) == 1) { xfs_ilock(ip, ...); /* capture inode metadata */ xchk_iscan_mark_visited(iscan, ip); xfs_iunlock(ip, ...); xfs_irele(ip); } xchk_iscan_teardown(iscan); if (error) return error; Hook functions for live updates can then do: if (xchk_iscan_want_live_update(...)) /* update the captured inode metadata */ Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/Makefile | 1 + fs/xfs/scrub/iscan.c | 485 +++++++++++++++++++++++++++++++++++++++++++++++++++ fs/xfs/scrub/iscan.h | 63 +++++++ fs/xfs/scrub/trace.c | 1 + fs/xfs/scrub/trace.h | 106 +++++++++++ 5 files changed, 656 insertions(+) create mode 100644 fs/xfs/scrub/iscan.c create mode 100644 fs/xfs/scrub/iscan.h diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 35a23427055b..17af5bde5308 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -158,6 +158,7 @@ xfs-y += $(addprefix scrub/, \ health.o \ ialloc.o \ inode.o \ + iscan.o \ parent.o \ readdir.o \ refcount.o \ diff --git a/fs/xfs/scrub/iscan.c b/fs/xfs/scrub/iscan.c new file mode 100644 index 000000000000..d13fc3b60f2e --- /dev/null +++ b/fs/xfs/scrub/iscan.c @@ -0,0 +1,485 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2021-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_inode.h" +#include "xfs_btree.h" +#include "xfs_ialloc.h" +#include "xfs_ialloc_btree.h" +#include "xfs_ag.h" +#include "xfs_error.h" +#include "xfs_bit.h" +#include "xfs_icache.h" +#include "scrub/scrub.h" +#include "scrub/iscan.h" +#include "scrub/common.h" +#include "scrub/trace.h" + +/* + * Live File Scan + * ============== + * + * Live file scans walk every inode in a live filesystem. This is more or + * less like a regular iwalk, except that when we're advancing the scan cursor, + * we must ensure that inodes cannot be added or deleted anywhere between the + * old cursor value and the new cursor value. If we're advancing the cursor + * by one inode, the caller must hold that inode; if we're finding the next + * inode to scan, we must grab the AGI and hold it until we've updated the + * scan cursor. + * + * Callers are expected to use this code to scan all files in the filesystem to + * construct a new metadata index of some kind. The scan races against other + * live updates, which means there must be a provision to update the new index + * when updates are made to inodes that have already been scanned. The iscan + * lock can be used in live update hook code to stop the scan and protect this + * data structure. + * + * To keep the new index up to date with other metadata updates being made to + * the live filesystem, it is assumed that the caller will add hooks as needed + * to be notified when a metadata update occurs. The inode scanner must tell + * the hook code when an inode has been visited with xchk_iscan_mark_visited. + * Hook functions can use xchk_iscan_want_live_update to decide if the + * scanner's observations must be updated. + */ + +/* + * Set *cursor to the next allocated inode after whatever it's set to now. + * If there are no more inodes in this AG, cursor is set to NULLAGINO.
+ */ +STATIC int +xchk_iscan_find_next( + struct xchk_iscan *iscan, + struct xfs_buf *agi_bp, + struct xfs_perag *pag, + xfs_agino_t *cursor) +{ + struct xfs_scrub *sc = iscan->sc; + struct xfs_inobt_rec_incore rec; + struct xfs_btree_cur *cur; + struct xfs_mount *mp = sc->mp; + struct xfs_trans *tp = sc->tp; + xfs_agnumber_t agno = pag->pag_agno; + xfs_agino_t lastino = NULLAGINO; + xfs_agino_t first, last; + xfs_agino_t agino = *cursor; + int has_rec; + int error; + + /* If the cursor is beyond the end of this AG, move to the next one. */ + xfs_agino_range(mp, agno, &first, &last); + if (agino > last) { + *cursor = NULLAGINO; + return 0; + } + + /* + * Look up the inode chunk for the current cursor position. If there + * is no chunk here, we want the next one. + */ + cur = xfs_inobt_init_cursor(pag, tp, agi_bp, XFS_BTNUM_INO); + error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &has_rec); + if (!error && !has_rec) + error = xfs_btree_increment(cur, 0, &has_rec); + for (; !error; error = xfs_btree_increment(cur, 0, &has_rec)) { + xfs_inofree_t allocmask; + + /* + * If we've run out of inobt records in this AG, move the + * cursor on to the next AG and exit. The caller can try + * again with the next AG. + */ + if (!has_rec) { + *cursor = NULLAGINO; + break; + } + + error = xfs_inobt_get_rec(cur, &rec, &has_rec); + if (error) + break; + if (!has_rec) { + error = -EFSCORRUPTED; + break; + } + + /* Make sure that we always move forward. */ + if (lastino != NULLAGINO && + XFS_IS_CORRUPT(mp, lastino >= rec.ir_startino)) { + error = -EFSCORRUPTED; + break; + } + lastino = rec.ir_startino + XFS_INODES_PER_CHUNK - 1; + + /* + * If this record only covers inodes that come before the + * cursor, advance to the next record. + */ + if (rec.ir_startino + XFS_INODES_PER_CHUNK <= agino) + continue; + + /* + * If the incoming lookup put us in the middle of an inobt + * record, mark it and the previous inodes "free" so that the + * search for allocated inodes will start at the cursor. + * We don't care about ir_freecount here. + */ + if (agino >= rec.ir_startino) + rec.ir_free |= xfs_inobt_maskn(0, + agino + 1 - rec.ir_startino); + + /* + * If there are allocated inodes in this chunk, find them + * and update the scan cursor. + */ + allocmask = ~rec.ir_free; + if (hweight64(allocmask) > 0) { + int next = xfs_lowbit64(allocmask); + + ASSERT(next >= 0); + *cursor = rec.ir_startino + next; + break; + } + } + + xfs_btree_del_cursor(cur, error); + return error; +} + +/* + * Advance both the scan and the visited cursors. + * + * The inumber address space for a given filesystem is sparse, which means that + * the scan cursor can jump a long ways in a single iter() call. There are no + * inodes in these sparse areas, so we must move the visited cursor forward at + * the same time so that the scan user can receive live updates for inodes that + * may get created once we release the AGI buffer. + */ +static inline void +xchk_iscan_move_cursor( + struct xchk_iscan *iscan, + xfs_agnumber_t agno, + xfs_agino_t agino) +{ + struct xfs_scrub *sc = iscan->sc; + struct xfs_mount *mp = sc->mp; + + mutex_lock(&iscan->lock); + iscan->cursor_ino = XFS_AGINO_TO_INO(mp, agno, agino); + iscan->__visited_ino = iscan->cursor_ino - 1; + trace_xchk_iscan_move_cursor(iscan); + mutex_unlock(&iscan->lock); +} + +/* + * Prepare to return agno/agino to the iscan caller by moving the lastino + * cursor to the previous inode. Do this while we still hold the AGI so that + * no other threads can create or delete inodes in this AG. 
+ */ +static inline void +xchk_iscan_finish( + struct xchk_iscan *iscan) +{ + mutex_lock(&iscan->lock); + iscan->cursor_ino = NULLFSINO; + + /* All live updates will be applied from now on */ + iscan->__visited_ino = NULLFSINO; + + mutex_unlock(&iscan->lock); +} + +/* + * Advance ino to the next inode that the inobt thinks is allocated, being + * careful to jump to the next AG if we've reached the right end of this AG's + * inode btree. Advancing ino effectively means that we've pushed the inode + * scan forward, so set the iscan cursor to (ino - 1) so that our live update + * predicates will track inode allocations in that part of the inode number + * key space once we release the AGI buffer. + * + * Returns 1 if there's a new inode to examine, 0 if we've run out of inodes, + * -ECANCELED if the live scan aborted, or the usual negative errno. + */ +STATIC int +xchk_iscan_advance( + struct xchk_iscan *iscan, + struct xfs_perag **pagp, + struct xfs_buf **agi_bpp) +{ + struct xfs_scrub *sc = iscan->sc; + struct xfs_mount *mp = sc->mp; + struct xfs_buf *agi_bp; + struct xfs_perag *pag; + xfs_agnumber_t agno; + xfs_agino_t agino; + int ret; + + ASSERT(iscan->cursor_ino >= iscan->__visited_ino); + + do { + if (xchk_iscan_aborted(iscan)) + return -ECANCELED; + + agno = XFS_INO_TO_AGNO(mp, iscan->cursor_ino); + pag = xfs_perag_get(mp, agno); + if (!pag) + return -ECANCELED; + + ret = xfs_ialloc_read_agi(pag, sc->tp, &agi_bp); + if (ret) + goto out_pag; + + agino = XFS_INO_TO_AGINO(mp, iscan->cursor_ino); + ret = xchk_iscan_find_next(iscan, agi_bp, pag, &agino); + if (ret) + goto out_buf; + + if (agino != NULLAGINO) { + /* + * Found the next inode in this AG, so return it along + * with the AGI buffer and the perag structure to + * ensure it cannot go away. + */ + xchk_iscan_move_cursor(iscan, agno, agino); + *agi_bpp = agi_bp; + *pagp = pag; + return 1; + } + + /* + * Did not find any more inodes in this AG, move on to the next + * AG. + */ + xchk_iscan_move_cursor(iscan, ++agno, 0); + xfs_trans_brelse(sc->tp, agi_bp); + xfs_perag_put(pag); + + trace_xchk_iscan_advance_ag(iscan); + } while (agno < mp->m_sb.sb_agcount); + + xchk_iscan_finish(iscan); + return 0; + +out_buf: + xfs_trans_brelse(sc->tp, agi_bp); +out_pag: + xfs_perag_put(pag); + return ret; +} + +/* + * Grabbing the inode failed, so we need to back up the scan and ask the caller + * to try to _advance the scan again. Returns -EBUSY if we've run out of retry + * opportunities, -ECANCELED if the process has a fatal signal pending, or + * -EAGAIN if we should try again. + */ +STATIC int +xchk_iscan_iget_retry( + struct xchk_iscan *iscan, + bool wait) +{ + ASSERT(iscan->cursor_ino == iscan->__visited_ino + 1); + + if (!iscan->iget_timeout || + time_is_before_jiffies(iscan->__iget_deadline)) + return -EBUSY; + + if (wait) { + unsigned long relax; + + /* + * Sleep for a period of time to let the rest of the system + * catch up. If we return early, someone sent a kill signal to + * the calling process. + */ + relax = msecs_to_jiffies(iscan->iget_retry_delay); + trace_xchk_iscan_iget_retry_wait(iscan); + + if (schedule_timeout_killable(relax) || + xchk_iscan_aborted(iscan)) + return -ECANCELED; + } + + iscan->cursor_ino--; + return -EAGAIN; +} + +/* + * Grab an inode as part of an inode scan. While scanning this inode, the + * caller must ensure that no other threads can modify the inode until a call + * to xchk_iscan_mark_visited succeeds.
+ *
+ * Returns 0 and an incore inode; -EAGAIN if the caller should call
+ * xchk_iscan_advance again; -EBUSY if we couldn't grab an inode; -ECANCELED if
+ * there's a fatal signal pending; or some other negative errno.
+ */
+STATIC int
+xchk_iscan_iget(
+	struct xchk_iscan	*iscan,
+	struct xfs_perag	*pag,
+	struct xfs_buf		*agi_bp,
+	struct xfs_inode	**ipp)
+{
+	struct xfs_scrub	*sc = iscan->sc;
+	struct xfs_mount	*mp = sc->mp;
+	int			error;
+
+	error = xfs_iget(sc->mp, sc->tp, iscan->cursor_ino, XFS_IGET_NORETRY,
+			0, ipp);
+	xfs_trans_brelse(sc->tp, agi_bp);
+	xfs_perag_put(pag);
+
+	trace_xchk_iscan_iget(iscan, error);
+
+	if (error == -ENOENT || error == -EAGAIN) {
+		/*
+		 * It's possible that this inode has lost all of its links but
+		 * hasn't yet been inactivated.  If we don't have a transaction
+		 * or it's not writable, flush the inodegc workers and wait.
+		 */
+		xfs_inodegc_flush(mp);
+		return xchk_iscan_iget_retry(iscan, true);
+	}
+
+	if (error == -EINVAL) {
+		/*
+		 * We thought the inode was allocated, but the inode btree
+		 * lookup failed, which means that it was freed since the last
+		 * time we advanced the cursor.  Back up and try again.  This
+		 * should never happen since we still hold the AGI buffer from the
+		 * inobt check, but we need to be careful about infinite loops.
+		 */
+		return xchk_iscan_iget_retry(iscan, false);
+	}
+
+	return error;
+}
+
+/*
+ * Advance the inode scan cursor to the next allocated inode and return the
+ * incore inode structure associated with it.
+ *
+ * Returns 1 if there's a new inode to examine, 0 if we've run out of inodes,
+ * -ECANCELED if the live scan aborted, -EBUSY if the incore inode could not be
+ * grabbed, or the usual negative errno.
+ *
+ * If the function returns -EBUSY and the caller can handle skipping an inode,
+ * it may call this function again to continue the scan with the next allocated
+ * inode.
+ */
+int
+xchk_iscan_iter(
+	struct xchk_iscan	*iscan,
+	struct xfs_inode	**ipp)
+{
+	struct xfs_scrub	*sc = iscan->sc;
+	int			ret;
+
+	if (iscan->iget_timeout)
+		iscan->__iget_deadline = jiffies +
+				msecs_to_jiffies(iscan->iget_timeout);
+
+	do {
+		struct xfs_buf	*agi_bp = NULL;
+		struct xfs_perag	*pag = NULL;
+
+		ret = xchk_iscan_advance(iscan, &pag, &agi_bp);
+		if (ret != 1)
+			return ret;
+
+		if (xchk_iscan_aborted(iscan)) {
+			xfs_trans_brelse(sc->tp, agi_bp);
+			xfs_perag_put(pag);
+			ret = -ECANCELED;
+			break;
+		}
+
+		ret = xchk_iscan_iget(iscan, pag, agi_bp, ipp);
+	} while (ret == -EAGAIN);
+
+	if (!ret)
+		return 1;
+
+	return ret;
+}
+
+
+/* Mark this inode scan finished and release resources. */
+void
+xchk_iscan_teardown(
+	struct xchk_iscan	*iscan)
+{
+	xchk_iscan_finish(iscan);
+	mutex_destroy(&iscan->lock);
+}
+
+/*
+ * Set ourselves up to start an inode scan.  If the @iget_timeout and
+ * @iget_retry_delay parameters are set, the scan will try to iget each inode
+ * for @iget_timeout milliseconds.  If an iget call indicates that the inode is
+ * waiting to be inactivated, the CPU will relax for @iget_retry_delay
+ * milliseconds after pushing the inactivation workers.
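Taken together, a scan consumer is expected to look roughly like the following sketch; the per-inode examine_inode() callback is hypothetical, and the 5000/100 knob values mirror the usage that appears later in this series:

STATIC int
xchk_example_scan(
	struct xfs_scrub	*sc)
{
	struct xchk_iscan	iscan;
	struct xfs_inode	*ip;
	int			error;

	/* Retry each iget for up to 5s, napping 100ms between attempts. */
	xchk_iscan_start(sc, 5000, 100, &iscan);
	while ((error = xchk_iscan_iter(&iscan, &ip)) == 1) {
		error = examine_inode(sc, ip);	/* hypothetical callback */
		xchk_iscan_mark_visited(&iscan, ip);
		xchk_irele(sc, ip);
		if (error)
			break;
	}
	xchk_iscan_teardown(&iscan);

	/* 0 means the whole filesystem was scanned. */
	return error;
}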
+ */ +void +xchk_iscan_start( + struct xfs_scrub *sc, + unsigned int iget_timeout, + unsigned int iget_retry_delay, + struct xchk_iscan *iscan) +{ + iscan->sc = sc; + clear_bit(XCHK_ISCAN_OPSTATE_ABORTED, &iscan->__opstate); + iscan->iget_timeout = iget_timeout; + iscan->iget_retry_delay = iget_retry_delay; + iscan->__visited_ino = 0; + iscan->cursor_ino = 0; + mutex_init(&iscan->lock); + + trace_xchk_iscan_start(iscan); +} + +/* + * Mark this inode as having been visited. Callers must hold a sufficiently + * exclusive lock on the inode to prevent concurrent modifications. + */ +void +xchk_iscan_mark_visited( + struct xchk_iscan *iscan, + struct xfs_inode *ip) +{ + mutex_lock(&iscan->lock); + iscan->__visited_ino = ip->i_ino; + trace_xchk_iscan_visit(iscan); + mutex_unlock(&iscan->lock); +} + +/* + * Do we need a live update for this inode? This is true if the scanner thread + * has visited this inode and the scan hasn't been aborted due to errors. + * Callers must hold a sufficiently exclusive lock on the inode to prevent + * scanners from reading any inode metadata. + */ +bool +xchk_iscan_want_live_update( + struct xchk_iscan *iscan, + xfs_ino_t ino) +{ + bool ret; + + if (xchk_iscan_aborted(iscan)) + return false; + + mutex_lock(&iscan->lock); + trace_xchk_iscan_want_live_update(iscan, ino); + ret = iscan->__visited_ino >= ino; + mutex_unlock(&iscan->lock); + + return ret; +} diff --git a/fs/xfs/scrub/iscan.h b/fs/xfs/scrub/iscan.h new file mode 100644 index 000000000000..c25f121859ce --- /dev/null +++ b/fs/xfs/scrub/iscan.h @@ -0,0 +1,63 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (c) 2021-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong + */ +#ifndef __XFS_SCRUB_ISCAN_H__ +#define __XFS_SCRUB_ISCAN_H__ + +struct xchk_iscan { + struct xfs_scrub *sc; + + /* Lock to protect the scan cursor. */ + struct mutex lock; + + /* This is the inode that will be examined next. */ + xfs_ino_t cursor_ino; + + /* + * This is the last inode that we've successfully scanned, either + * because the caller scanned it, or we moved the cursor past an empty + * part of the inode address space. Scan callers should only use the + * xchk_iscan_visit function to modify this. + */ + xfs_ino_t __visited_ino; + + /* Operational state of the livescan. */ + unsigned long __opstate; + + /* Give up on iterating @cursor_ino if we can't iget it by this time. */ + unsigned long __iget_deadline; + + /* Amount of time (in ms) that we will try to iget an inode. */ + unsigned int iget_timeout; + + /* Wait this many ms to retry an iget. */ + unsigned int iget_retry_delay; +}; + +/* Set if the scan has been aborted due to some event in the fs. 
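That flag feeds xchk_iscan_aborted() below; ignoring the locking and the wraparound handling that later patches add, the live-update decision reduces to this standalone model (plain C, not the kernel code):

#include <stdbool.h>
#include <stdint.h>

static bool
model_want_live_update(bool aborted, uint64_t visited_ino, uint64_t ino)
{
	if (aborted)
		return false;
	/*
	 * A finished scan parks visited_ino at the all-ones NULLFSINO
	 * sentinel, so every inode then compares as already visited.
	 */
	return visited_ino >= ino;
}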
*/ +#define XCHK_ISCAN_OPSTATE_ABORTED (1) + +static inline bool +xchk_iscan_aborted(const struct xchk_iscan *iscan) +{ + return test_bit(XCHK_ISCAN_OPSTATE_ABORTED, &iscan->__opstate); +} + +static inline void +xchk_iscan_abort(struct xchk_iscan *iscan) +{ + set_bit(XCHK_ISCAN_OPSTATE_ABORTED, &iscan->__opstate); +} + +void xchk_iscan_start(struct xfs_scrub *sc, unsigned int iget_timeout, + unsigned int iget_retry_delay, struct xchk_iscan *iscan); +void xchk_iscan_teardown(struct xchk_iscan *iscan); + +int xchk_iscan_iter(struct xchk_iscan *iscan, struct xfs_inode **ipp); + +void xchk_iscan_mark_visited(struct xchk_iscan *iscan, struct xfs_inode *ip); +bool xchk_iscan_want_live_update(struct xchk_iscan *iscan, xfs_ino_t ino); + +#endif /* __XFS_SCRUB_ISCAN_H__ */ diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c index d0e24ffaf754..4542eeebab6f 100644 --- a/fs/xfs/scrub/trace.c +++ b/fs/xfs/scrub/trace.c @@ -20,6 +20,7 @@ #include "scrub/xfile.h" #include "scrub/xfarray.h" #include "scrub/quota.h" +#include "scrub/iscan.h" /* Figure out which block the btree cursor was pointing to. */ static inline xfs_fsblock_t diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index ae6b2385a8cb..29026d1d9293 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -16,10 +16,12 @@ #include #include "xfs_bit.h" +struct xfs_scrub; struct xfile; struct xfarray; struct xfarray_sortinfo; struct xchk_dqiter; +struct xchk_iscan; /* * ftrace's __print_symbolic requires that all enum values be wrapped in the @@ -1146,6 +1148,110 @@ TRACE_EVENT(xchk_rtsum_record_free, ); #endif /* CONFIG_XFS_RT */ +DECLARE_EVENT_CLASS(xchk_iscan_class, + TP_PROTO(struct xchk_iscan *iscan), + TP_ARGS(iscan), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, cursor) + __field(xfs_ino_t, visited) + ), + TP_fast_assign( + __entry->dev = iscan->sc->mp->m_super->s_dev; + __entry->cursor = iscan->cursor_ino; + __entry->visited = iscan->__visited_ino; + ), + TP_printk("dev %d:%d iscan cursor 0x%llx visited 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->cursor, + __entry->visited) +) +#define DEFINE_ISCAN_EVENT(name) \ +DEFINE_EVENT(xchk_iscan_class, name, \ + TP_PROTO(struct xchk_iscan *iscan), \ + TP_ARGS(iscan)) +DEFINE_ISCAN_EVENT(xchk_iscan_move_cursor); +DEFINE_ISCAN_EVENT(xchk_iscan_visit); +DEFINE_ISCAN_EVENT(xchk_iscan_advance_ag); +DEFINE_ISCAN_EVENT(xchk_iscan_start); + +DECLARE_EVENT_CLASS(xchk_iscan_ino_class, + TP_PROTO(struct xchk_iscan *iscan, xfs_ino_t ino), + TP_ARGS(iscan, ino), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, cursor) + __field(xfs_ino_t, visited) + __field(xfs_ino_t, ino) + ), + TP_fast_assign( + __entry->dev = iscan->sc->mp->m_super->s_dev; + __entry->cursor = iscan->cursor_ino; + __entry->visited = iscan->__visited_ino; + __entry->ino = ino; + ), + TP_printk("dev %d:%d iscan cursor 0x%llx visited 0x%llx ino 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->cursor, + __entry->visited, + __entry->ino) +) +#define DEFINE_ISCAN_INO_EVENT(name) \ +DEFINE_EVENT(xchk_iscan_ino_class, name, \ + TP_PROTO(struct xchk_iscan *iscan, xfs_ino_t ino), \ + TP_ARGS(iscan, ino)) +DEFINE_ISCAN_INO_EVENT(xchk_iscan_want_live_update); + +TRACE_EVENT(xchk_iscan_iget, + TP_PROTO(struct xchk_iscan *iscan, int error), + TP_ARGS(iscan, error), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, cursor) + __field(xfs_ino_t, visited) + __field(int, error) + ), + TP_fast_assign( + __entry->dev = iscan->sc->mp->m_super->s_dev; + 
__entry->cursor = iscan->cursor_ino; + __entry->visited = iscan->__visited_ino; + __entry->error = error; + ), + TP_printk("dev %d:%d iscan cursor 0x%llx visited 0x%llx error %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->cursor, + __entry->visited, + __entry->error) +); + +TRACE_EVENT(xchk_iscan_iget_retry_wait, + TP_PROTO(struct xchk_iscan *iscan), + TP_ARGS(iscan), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, cursor) + __field(xfs_ino_t, visited) + __field(unsigned int, retry_delay) + __field(unsigned long, remaining) + __field(unsigned int, iget_timeout) + ), + TP_fast_assign( + __entry->dev = iscan->sc->mp->m_super->s_dev; + __entry->cursor = iscan->cursor_ino; + __entry->visited = iscan->__visited_ino; + __entry->retry_delay = iscan->iget_retry_delay; + __entry->remaining = jiffies_to_msecs(iscan->__iget_deadline - jiffies); + __entry->iget_timeout = iscan->iget_timeout; + ), + TP_printk("dev %d:%d iscan cursor 0x%llx visited 0x%llx remaining %lu timeout %u delay %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->cursor, + __entry->visited, + __entry->remaining, + __entry->iget_timeout, + __entry->retry_delay) +); + /* repair tracepoints */ #if IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) -- cgit v1.2.3 From 4e98cc905c0fec337416e9fd7ca4f75607a6de99 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 22 Feb 2024 12:30:45 -0800 Subject: xfs: allow scrub to hook metadata updates in other writers Certain types of filesystem metadata can only be checked by scanning every file in the entire filesystem. Specific examples of this include quota counts, file link counts, and reverse mappings of file extents. Directory and parent pointer reconstruction may also fall into this category. File scanning is much trickier than scanning AG metadata because we have to take inode locks in the same order as the rest of [VX]FS, we can't be holding buffer locks when we do that, and scanning the whole filesystem takes time. Earlier versions of the online repair patchset relied heavily on fsfreeze as a means to quiesce the filesystem so that we could take locks in the proper order without worrying about concurrent updates from other writers. Reviewers of those patches opined that freezing the entire fs to check and repair something was not sufficiently better than unmounting to run fsck offline. I don't agree with that 100%, but the message was clear: find a way to repair things that minimizes the quiet period where nobody can write to the filesystem. Generally, building btree indexes online can be split into two phases: a collection phase where we compute the records that will be put into the new btree; and a construction phase, where we construct the physical btree blocks and persist them. While it's simple to hold resource locks for the entirety of the two phases to ensure that the new index is consistent with the rest of the system, we don't need to hold resource locks during the collection phase if we have a means to receive live updates of other work going on elsewhere in the system. The goal of this patch, then, is to enable online fsck to learn about metadata updates going on in other threads while it constructs a shadow copy of the metadata records to verify or correct the real metadata. To minimize the overhead when online fsck isn't running, we use srcu notifiers because they prioritize fast access to the notifier call chain (particularly when the chain is empty) at a cost to configuring notifiers. 
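Concretely, the usage pattern is a hedged sketch like the one below, written against the xfs_hooks wrappers this patch adds; the xfs_example_* names are illustrative only, not part of the patch:

/* One chain per hook point; xfs_hooks_init() runs during setup. */
static struct xfs_hooks	xfs_example_hooks;

/* Writer side: announce a metadata update to whoever is listening. */
static void
xfs_example_update_notify(unsigned long action, void *data)
{
	xfs_hooks_call(&xfs_example_hooks, action, data);
}

/* Scrub side: fold each update into the in-memory scan data. */
static int
xfs_example_live_update(struct notifier_block *nb, unsigned long action,
		void *data)
{
	/* ...synchronize with the fsck scanner here... */
	return NOTIFY_DONE;
}

static int
xfs_example_scan(void)
{
	struct xfs_hook	hook;
	int		error;

	xfs_hook_setup(&hook, xfs_example_live_update);
	error = xfs_hooks_add(&xfs_example_hooks, &hook);
	if (error)
		return error;
	/* ...scan the filesystem, receiving live updates... */
	xfs_hooks_del(&xfs_example_hooks, &hook);
	return 0;
}

Hook registration and removal are the rarely executed, expensive operations in this scheme.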
Online fsck should be relatively infrequent, so this is acceptable.

The intended usage model is fairly simple.  Code that modifies a metadata structure of interest should declare an xfs_hooks structure in some well defined place, and call xfs_hooks_call whenever an update happens.  Online fsck code should define a struct notifier_block and use xfs_hooks_add to attach the block to the chain, along with a function to be called.  This function should synchronize with the fsck scanner to update whatever in-memory data the scanner is collecting.  When finished, xfs_hooks_del removes the notifier from the list and waits for any calls still in flight to complete.

Originally, I selected srcu notifiers over blocking notifiers to implement live hooks because they seemed to have less impact on scalability.  The per-call cost of srcu_notifier_call_chain is higher (19ns) than blocking_notifier_ (4ns) in the single threaded case, but blocking notifiers use an rwsem to stabilize the list.  Cacheline bouncing for that rwsem is costly to runtime code when there are a lot of CPUs running regular filesystem operations.  If there are no hooks installed, this is a total waste of CPU time.

Therefore, I stuck with srcu notifiers, despite trading off single threaded performance for multithreaded performance.

I also wasn't thrilled with the very high teardown time for srcu notifiers, since the caller has to wait for the next rcu grace period.  This can take a long time if there are a lot of CPUs.

Then I discovered the jump label implementation of static keys.  Jump labels use kernel code patching to replace a branch with a nop sled when the key is disabled.  IOWs, they can eliminate the overhead of _call_chain when there are no hooks enabled.  This makes blocking notifiers competitive again -- scrub runs faster because teardown of the chain is a lot cheaper, and runtime code only pays the rwsem locking overhead when scrub is actually running.

With jump labels enabled, calls to empty notifier chains are elided from the call sites when there are no hooks registered, which means that the overhead is 0.36ns when fsck is not running.  This is perfect for most of the architectures that XFS is expected to run on (e.g. x86, powerpc, arm64, s390x, riscv).

For architectures that don't support jump labels (e.g. m68k) the runtime overhead of checking the static key is an atomic counter read.  This isn't great, but it's still cheaper than taking a shared rwsem.

Signed-off-by: Darrick J.
Wong Reviewed-by: Christoph Hellwig --- fs/xfs/Kconfig | 5 +++++ fs/xfs/Makefile | 1 + fs/xfs/xfs_hooks.c | 52 +++++++++++++++++++++++++++++++++++++++++++ fs/xfs/xfs_hooks.h | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/xfs/xfs_linux.h | 1 + 5 files changed, 124 insertions(+) create mode 100644 fs/xfs/xfs_hooks.c create mode 100644 fs/xfs/xfs_hooks.h diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig index 567fb37274d3..fa7eb3e2a248 100644 --- a/fs/xfs/Kconfig +++ b/fs/xfs/Kconfig @@ -124,11 +124,16 @@ config XFS_DRAIN_INTENTS bool select JUMP_LABEL if HAVE_ARCH_JUMP_LABEL +config XFS_LIVE_HOOKS + bool + select JUMP_LABEL if HAVE_ARCH_JUMP_LABEL + config XFS_ONLINE_SCRUB bool "XFS online metadata check support" default n depends on XFS_FS depends on TMPFS && SHMEM + select XFS_LIVE_HOOKS select XFS_DRAIN_INTENTS help If you say Y here you will be able to check metadata on a diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 17af5bde5308..4597c0f19e8e 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -136,6 +136,7 @@ xfs-$(CONFIG_FS_DAX) += xfs_notify_failure.o endif xfs-$(CONFIG_XFS_DRAIN_INTENTS) += xfs_drain.o +xfs-$(CONFIG_XFS_LIVE_HOOKS) += xfs_hooks.o # online scrub/repair ifeq ($(CONFIG_XFS_ONLINE_SCRUB),y) diff --git a/fs/xfs/xfs_hooks.c b/fs/xfs/xfs_hooks.c new file mode 100644 index 000000000000..a58d1de2d37d --- /dev/null +++ b/fs/xfs/xfs_hooks.c @@ -0,0 +1,52 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2022-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_ag.h" +#include "xfs_trace.h" + +/* Initialize a notifier chain. */ +void +xfs_hooks_init( + struct xfs_hooks *chain) +{ + BLOCKING_INIT_NOTIFIER_HEAD(&chain->head); +} + +/* Make it so a function gets called whenever we hit a certain hook point. */ +int +xfs_hooks_add( + struct xfs_hooks *chain, + struct xfs_hook *hook) +{ + ASSERT(hook->nb.notifier_call != NULL); + BUILD_BUG_ON(offsetof(struct xfs_hook, nb) != 0); + + return blocking_notifier_chain_register(&chain->head, &hook->nb); +} + +/* Remove a previously installed hook. */ +void +xfs_hooks_del( + struct xfs_hooks *chain, + struct xfs_hook *hook) +{ + blocking_notifier_chain_unregister(&chain->head, &hook->nb); +} + +/* Call a hook. Returns the NOTIFY_* value returned by the last hook. */ +int +xfs_hooks_call( + struct xfs_hooks *chain, + unsigned long val, + void *priv) +{ + return blocking_notifier_call_chain(&chain->head, val, priv); +} diff --git a/fs/xfs/xfs_hooks.h b/fs/xfs/xfs_hooks.h new file mode 100644 index 000000000000..60b8a5831536 --- /dev/null +++ b/fs/xfs/xfs_hooks.h @@ -0,0 +1,65 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2022-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong + */ +#ifndef XFS_HOOKS_H_ +#define XFS_HOOKS_H_ + +#ifdef CONFIG_XFS_LIVE_HOOKS +struct xfs_hooks { + struct blocking_notifier_head head; +}; + +/* + * If jump labels are enabled in Kconfig, the static key uses nop sleds and + * code patching to eliminate the overhead of taking the rwsem in + * blocking_notifier_call_chain when there are no hooks configured. If not, + * the static key per-call overhead is an atomic read. Most arches that can + * handle XFS also support jump labels. + * + * Note: Patching the kernel code requires taking the cpu hotplug lock. 
Other + * parts of the kernel allocate memory with that lock held, which means that + * XFS callers cannot hold any locks that might be used by memory reclaim or + * writeback when calling the static_branch_{inc,dec} functions. + */ +# define DEFINE_STATIC_XFS_HOOK_SWITCH(name) \ + static DEFINE_STATIC_KEY_FALSE(name) +# define xfs_hooks_switch_on(name) static_branch_inc(name) +# define xfs_hooks_switch_off(name) static_branch_dec(name) +# define xfs_hooks_switched_on(name) static_branch_unlikely(name) + +struct xfs_hook { + /* This must come at the start of the structure. */ + struct notifier_block nb; +}; + +typedef int (*xfs_hook_fn_t)(struct xfs_hook *hook, unsigned long action, + void *data); + +void xfs_hooks_init(struct xfs_hooks *chain); +int xfs_hooks_add(struct xfs_hooks *chain, struct xfs_hook *hook); +void xfs_hooks_del(struct xfs_hooks *chain, struct xfs_hook *hook); +int xfs_hooks_call(struct xfs_hooks *chain, unsigned long action, + void *priv); + +static inline void xfs_hook_setup(struct xfs_hook *hook, notifier_fn_t fn) +{ + hook->nb.notifier_call = fn; + hook->nb.priority = 0; +} + +#else + +struct xfs_hooks { /* empty */ }; + +# define DEFINE_STATIC_XFS_HOOK_SWITCH(name) +# define xfs_hooks_switch_on(name) ((void)0) +# define xfs_hooks_switch_off(name) ((void)0) +# define xfs_hooks_switched_on(name) (false) + +# define xfs_hooks_init(chain) ((void)0) +# define xfs_hooks_call(chain, val, priv) (NOTIFY_DONE) +#endif + +#endif /* XFS_HOOKS_H_ */ diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index 75245932e0ad..8f07c9f6157f 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -81,6 +81,7 @@ typedef __u32 xfs_nlink_t; #include "xfs_buf.h" #include "xfs_message.h" #include "xfs_drain.h" +#include "xfs_hooks.h" #ifdef __BIG_ENDIAN #define XFS_NATIVE_HOST 1 -- cgit v1.2.3 From c473a3320be32b2273042bfdf0fe8db5da7ae5d0 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 22 Feb 2024 12:30:46 -0800 Subject: xfs: stagger the starting AG of scrub iscans to reduce contention Online directory and parent repairs on parent-pointer equipped filesystems have shown that starting a large number of parallel iscans causes a lot of AGI buffer contention. Try to reduce this by making it so that iscans scan wrap around the end of the filesystem, and using a rotor to stagger where each scanner begins. Surprisingly, this boosts CPU utilization (on the author's test machines) from effectively single-threaded to 160%. Not great, but see the next patch. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/iscan.c | 87 ++++++++++++++++++++++++++++++++++++++++++++++------ fs/xfs/scrub/iscan.h | 7 +++++ fs/xfs/scrub/trace.h | 7 +++-- 3 files changed, 89 insertions(+), 12 deletions(-) diff --git a/fs/xfs/scrub/iscan.c b/fs/xfs/scrub/iscan.c index d13fc3b60f2e..3179c299c77f 100644 --- a/fs/xfs/scrub/iscan.c +++ b/fs/xfs/scrub/iscan.c @@ -170,10 +170,24 @@ xchk_iscan_move_cursor( { struct xfs_scrub *sc = iscan->sc; struct xfs_mount *mp = sc->mp; + xfs_ino_t cursor, visited; + + BUILD_BUG_ON(XFS_MAXINUMBER == NULLFSINO); + + /* + * Special-case ino == 0 here so that we never set visited_ino to + * NULLFSINO when wrapping around EOFS, for that will let through all + * live updates. 
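The guard is small enough to model standalone (plain C; MODEL_MAXINUMBER stands in for XFS_MAXINUMBER, which by the BUILD_BUG_ON below is never the same value as NULLFSINO):

#include <stdint.h>

#define MODEL_NULLFSINO		(~0ULL)
#define MODEL_MAXINUMBER	(MODEL_NULLFSINO - 1)

/* Saturate instead of underflowing to the all-ones NULLFSINO sentinel. */
static uint64_t
model_visited_for_cursor(uint64_t cursor)
{
	return cursor == 0 ? MODEL_MAXINUMBER : cursor - 1;
}

Without the special case, wrapping to AG 0/agino 0 would compute visited = 0 - 1 = NULLFSINO, and the live-update predicate would wave every update through.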
+ */ + cursor = XFS_AGINO_TO_INO(mp, agno, agino); + if (cursor == 0) + visited = XFS_MAXINUMBER; + else + visited = cursor - 1; mutex_lock(&iscan->lock); - iscan->cursor_ino = XFS_AGINO_TO_INO(mp, agno, agino); - iscan->__visited_ino = iscan->cursor_ino - 1; + iscan->cursor_ino = cursor; + iscan->__visited_ino = visited; trace_xchk_iscan_move_cursor(iscan); mutex_unlock(&iscan->lock); } @@ -257,12 +271,13 @@ xchk_iscan_advance( * Did not find any more inodes in this AG, move on to the next * AG. */ - xchk_iscan_move_cursor(iscan, ++agno, 0); + agno = (agno + 1) % mp->m_sb.sb_agcount; + xchk_iscan_move_cursor(iscan, agno, 0); xfs_trans_brelse(sc->tp, agi_bp); xfs_perag_put(pag); trace_xchk_iscan_advance_ag(iscan); - } while (agno < mp->m_sb.sb_agcount); + } while (iscan->cursor_ino != iscan->scan_start_ino); xchk_iscan_finish(iscan); return 0; @@ -420,6 +435,23 @@ xchk_iscan_teardown( mutex_destroy(&iscan->lock); } +/* Pick an AG from which to start a scan. */ +static inline xfs_ino_t +xchk_iscan_rotor( + struct xfs_mount *mp) +{ + static atomic_t agi_rotor; + unsigned int r = atomic_inc_return(&agi_rotor) - 1; + + /* + * Rotoring *backwards* through the AGs, so we add one here before + * subtracting from the agcount to arrive at an AG number. + */ + r = (r % mp->m_sb.sb_agcount) + 1; + + return XFS_AGINO_TO_INO(mp, mp->m_sb.sb_agcount - r, 0); +} + /* * Set ourselves up to start an inode scan. If the @iget_timeout and * @iget_retry_delay parameters are set, the scan will try to iget each inode @@ -434,15 +466,20 @@ xchk_iscan_start( unsigned int iget_retry_delay, struct xchk_iscan *iscan) { + xfs_ino_t start_ino; + + start_ino = xchk_iscan_rotor(sc->mp); + iscan->sc = sc; clear_bit(XCHK_ISCAN_OPSTATE_ABORTED, &iscan->__opstate); iscan->iget_timeout = iget_timeout; iscan->iget_retry_delay = iget_retry_delay; - iscan->__visited_ino = 0; - iscan->cursor_ino = 0; + iscan->__visited_ino = start_ino; + iscan->cursor_ino = start_ino; + iscan->scan_start_ino = start_ino; mutex_init(&iscan->lock); - trace_xchk_iscan_start(iscan); + trace_xchk_iscan_start(iscan, start_ino); } /* @@ -471,15 +508,45 @@ xchk_iscan_want_live_update( struct xchk_iscan *iscan, xfs_ino_t ino) { - bool ret; + bool ret = false; if (xchk_iscan_aborted(iscan)) return false; mutex_lock(&iscan->lock); + trace_xchk_iscan_want_live_update(iscan, ino); - ret = iscan->__visited_ino >= ino; - mutex_unlock(&iscan->lock); + /* Scan is finished, caller should receive all updates. */ + if (iscan->__visited_ino == NULLFSINO) { + ret = true; + goto unlock; + } + + /* + * The visited cursor hasn't yet wrapped around the end of the FS. If + * @ino is inside the starred range, the caller should receive updates: + * + * 0 ------------ S ************ V ------------ EOFS + */ + if (iscan->scan_start_ino <= iscan->__visited_ino) { + if (ino >= iscan->scan_start_ino && + ino <= iscan->__visited_ino) + ret = true; + + goto unlock; + } + + /* + * The visited cursor wrapped around the end of the FS. If @ino is + * inside the starred range, the caller should receive updates: + * + * 0 ************ V ------------ S ************ EOFS + */ + if (ino >= iscan->scan_start_ino || ino <= iscan->__visited_ino) + ret = true; + +unlock: + mutex_unlock(&iscan->lock); return ret; } diff --git a/fs/xfs/scrub/iscan.h b/fs/xfs/scrub/iscan.h index c25f121859ce..0db97d98ee8d 100644 --- a/fs/xfs/scrub/iscan.h +++ b/fs/xfs/scrub/iscan.h @@ -12,6 +12,13 @@ struct xchk_iscan { /* Lock to protect the scan cursor. 
*/ struct mutex lock; + /* + * This is the first inode in the inumber address space that we + * examined. When the scan wraps around back to here, the scan is + * finished. + */ + xfs_ino_t scan_start_ino; + /* This is the inode that will be examined next. */ xfs_ino_t cursor_ino; diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index 29026d1d9293..5a70968bc3e2 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -1173,25 +1173,27 @@ DEFINE_EVENT(xchk_iscan_class, name, \ DEFINE_ISCAN_EVENT(xchk_iscan_move_cursor); DEFINE_ISCAN_EVENT(xchk_iscan_visit); DEFINE_ISCAN_EVENT(xchk_iscan_advance_ag); -DEFINE_ISCAN_EVENT(xchk_iscan_start); DECLARE_EVENT_CLASS(xchk_iscan_ino_class, TP_PROTO(struct xchk_iscan *iscan, xfs_ino_t ino), TP_ARGS(iscan, ino), TP_STRUCT__entry( __field(dev_t, dev) + __field(xfs_ino_t, startino) __field(xfs_ino_t, cursor) __field(xfs_ino_t, visited) __field(xfs_ino_t, ino) ), TP_fast_assign( __entry->dev = iscan->sc->mp->m_super->s_dev; + __entry->startino = iscan->scan_start_ino; __entry->cursor = iscan->cursor_ino; __entry->visited = iscan->__visited_ino; __entry->ino = ino; ), - TP_printk("dev %d:%d iscan cursor 0x%llx visited 0x%llx ino 0x%llx", + TP_printk("dev %d:%d iscan start 0x%llx cursor 0x%llx visited 0x%llx ino 0x%llx", MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->startino, __entry->cursor, __entry->visited, __entry->ino) @@ -1201,6 +1203,7 @@ DEFINE_EVENT(xchk_iscan_ino_class, name, \ TP_PROTO(struct xchk_iscan *iscan, xfs_ino_t ino), \ TP_ARGS(iscan, ino)) DEFINE_ISCAN_INO_EVENT(xchk_iscan_want_live_update); +DEFINE_ISCAN_INO_EVENT(xchk_iscan_start); TRACE_EVENT(xchk_iscan_iget, TP_PROTO(struct xchk_iscan *iscan, int error), -- cgit v1.2.3 From a7a686cb07203fc42a38e66324241b7f2fe4fae2 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 22 Feb 2024 12:30:47 -0800 Subject: xfs: cache a bunch of inodes for repair scans After observing xfs_scrub taking forever to rebuild parent pointers on a pptrs enabled filesystem, I decided to profile what the system was doing. It turns out that when there are a lot of threads trying to scan the filesystem, most of our time is spent contending on AGI buffer locks. Given that we're walking the inobt records anyway, we can often tell ahead of time when there's a bunch of (up to 64) consecutive inodes that we could grab all at once. Do this to amortize the cost of taking the AGI lock across as many inodes as we possibly can. On the author's system this seems to improve parallel throughput from barely one and a half cores to slightly sublinear scaling. The obvious antipattern here of course is where the freemask has every other bit set (e.g. all 0xA's) Signed-off-by: Darrick J. 
Wong
Reviewed-by: Christoph Hellwig
---
 fs/xfs/scrub/iscan.c | 159 +++++++++++++++++++++++++++++++++++++++++----------
 fs/xfs/scrub/iscan.h |   7 +++
 fs/xfs/scrub/trace.h |  23 ++++++++
 3 files changed, 159 insertions(+), 30 deletions(-)

diff --git a/fs/xfs/scrub/iscan.c b/fs/xfs/scrub/iscan.c
index 3179c299c77f..d1c33aba1f10 100644
--- a/fs/xfs/scrub/iscan.c
+++ b/fs/xfs/scrub/iscan.c
@@ -60,6 +60,7 @@ xchk_iscan_find_next(
 	struct xchk_iscan	*iscan,
 	struct xfs_buf		*agi_bp,
 	struct xfs_perag	*pag,
+	xfs_inofree_t		*allocmaskp,
 	xfs_agino_t		*cursor)
 {
 	struct xfs_scrub	*sc = iscan->sc;
@@ -145,6 +146,7 @@ xchk_iscan_find_next(
 
 			ASSERT(next >= 0);
 			*cursor = rec.ir_startino + next;
+			*allocmaskp = allocmask >> next;
 			break;
 		}
 	}
@@ -225,7 +227,8 @@ STATIC int
 xchk_iscan_advance(
 	struct xchk_iscan	*iscan,
 	struct xfs_perag	**pagp,
-	struct xfs_buf		**agi_bpp)
+	struct xfs_buf		**agi_bpp,
+	xfs_inofree_t		*allocmaskp)
 {
 	struct xfs_scrub	*sc = iscan->sc;
 	struct xfs_mount	*mp = sc->mp;
@@ -251,7 +254,8 @@ xchk_iscan_advance(
 			goto out_pag;
 
 		agino = XFS_INO_TO_AGINO(mp, iscan->cursor_ino);
-		ret = xchk_iscan_find_next(iscan, agi_bp, pag, &agino);
+		ret = xchk_iscan_find_next(iscan, agi_bp, pag, allocmaskp,
+				&agino);
 		if (ret)
 			goto out_buf;
 
@@ -331,29 +335,35 @@ xchk_iscan_iget_retry(
  * caller must ensure that no other threads can modify the inode until a call
  * to xchk_iscan_visit succeeds.
  *
- * Returns 0 and an incore inode; -EAGAIN if the caller should call
- * xchk_iscan_advance again; -EBUSY if we couldn't grab an inode; -ECANCELED if
- * there's a fatal signal pending; or some other negative errno.
+ * Returns the number of incore inodes grabbed; -EAGAIN if the caller should
+ * call xchk_iscan_advance again; -EBUSY if we couldn't grab an inode;
+ * -ECANCELED if there's a fatal signal pending; or some other negative errno.
  */
 STATIC int
 xchk_iscan_iget(
 	struct xchk_iscan	*iscan,
 	struct xfs_perag	*pag,
 	struct xfs_buf		*agi_bp,
-	struct xfs_inode	**ipp)
+	xfs_inofree_t		allocmask)
 {
 	struct xfs_scrub	*sc = iscan->sc;
 	struct xfs_mount	*mp = sc->mp;
+	xfs_ino_t		ino = iscan->cursor_ino;
+	unsigned int		idx = 0;
 	int			error;
 
-	error = xfs_iget(sc->mp, sc->tp, iscan->cursor_ino, XFS_IGET_NORETRY,
-			0, ipp);
-	xfs_trans_brelse(sc->tp, agi_bp);
-	xfs_perag_put(pag);
+	ASSERT(iscan->__inodes[0] == NULL);
+
+	/* Fill the first slot in the inode array. */
+	error = xfs_iget(sc->mp, sc->tp, ino, XFS_IGET_NORETRY, 0,
+			&iscan->__inodes[idx]);
 
 	trace_xchk_iscan_iget(iscan, error);
 
 	if (error == -ENOENT || error == -EAGAIN) {
+		xfs_trans_brelse(sc->tp, agi_bp);
+		xfs_perag_put(pag);
+
 		/*
 		 * It's possible that this inode has lost all of its links but
 		 * hasn't yet been inactivated.  If we don't have a transaction
@@ -364,6 +374,9 @@ xchk_iscan_iget(
 	}
 
 	if (error == -EINVAL) {
+		xfs_trans_brelse(sc->tp, agi_bp);
+		xfs_perag_put(pag);
+
 		/*
 		 * We thought the inode was allocated, but the inode btree
 		 * lookup failed, which means that it was freed since the last
@@ -374,25 +387,47 @@ xchk_iscan_iget(
 		return xchk_iscan_iget_retry(iscan, false);
 	}
 
-	return error;
+	if (error) {
+		xfs_trans_brelse(sc->tp, agi_bp);
+		xfs_perag_put(pag);
+		return error;
+	}
+	idx++;
+	ino++;
+	allocmask >>= 1;
+
+	/*
+	 * Now that we've filled the first slot in __inodes, try to fill the
+	 * rest of the batch with consecutively ordered inodes to reduce the
+	 * number of _iter calls.  If we can't get an inode, we stop and return
+	 * what we have.
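The effect of this batching on lock amortization is easy to demonstrate with a standalone model of the mask walk (plain C; it mirrors xfs_lowbit64 plus the shift loop that follows):

#include <stdint.h>
#include <stdio.h>

/* How many inodes can one AGI lock hold batch from an inuse mask? */
static unsigned int
model_batch_size(uint64_t inuse)
{
	unsigned int	nr = 0;

	if (!inuse)
		return 0;
	inuse >>= __builtin_ctzll(inuse);	/* skip to first inuse bit */
	while (inuse & 1) {
		nr++;
		inuse >>= 1;
	}
	return nr;
}

int main(void)
{
	/* Fully allocated chunk: all 64 igets under one AGI lock cycle. */
	printf("%u\n", model_batch_size(~0ULL));
	/* Antipattern from the commit message: every other inode free. */
	printf("%u\n", model_batch_size(0xAAAAAAAAAAAAAAAAULL));
	return 0;
}

This prints 64 and then 1: a fully allocated chunk amortizes the lock across the whole batch, while the alternating freemask degenerates to the old one-at-a-time behavior.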
+	 */
+	for (; allocmask & 1; allocmask >>= 1, ino++, idx++) {
+		ASSERT(iscan->__inodes[idx] == NULL);
+
+		error = xfs_iget(sc->mp, sc->tp, ino, XFS_IGET_NORETRY, 0,
+				&iscan->__inodes[idx]);
+		if (error)
+			break;
+
+		mutex_lock(&iscan->lock);
+		iscan->cursor_ino = ino;
+		mutex_unlock(&iscan->lock);
+	}
+
+	trace_xchk_iscan_iget_batch(sc->mp, iscan, idx);
+	xfs_trans_brelse(sc->tp, agi_bp);
+	xfs_perag_put(pag);
+	return idx;
 }
 
 /*
- * Advance the inode scan cursor to the next allocated inode and return the
- * incore inode structure associated with it.
- *
- * Returns 1 if there's a new inode to examine, 0 if we've run out of inodes,
- * -ECANCELED if the live scan aborted, -EBUSY if the incore inode could not be
- * grabbed, or the usual negative errno.
- *
- * If the function returns -EBUSY and the caller can handle skipping an inode,
- * it may call this function again to continue the scan with the next allocated
- * inode.
+ * Advance the inode scan cursor to the next allocated inode and return up to
+ * 64 consecutive allocated inodes starting with the cursor position.
  */
-int
-xchk_iscan_iter(
-	struct xchk_iscan	*iscan,
-	struct xfs_inode	**ipp)
+STATIC int
+xchk_iscan_iter_batch(
+	struct xchk_iscan	*iscan)
 {
 	struct xfs_scrub	*sc = iscan->sc;
 	int			ret;
@@ -404,8 +439,9 @@ xchk_iscan_iter(
 	do {
 		struct xfs_buf	*agi_bp = NULL;
 		struct xfs_perag	*pag = NULL;
+		xfs_inofree_t	allocmask = 0;
 
-		ret = xchk_iscan_advance(iscan, &pag, &agi_bp);
+		ret = xchk_iscan_advance(iscan, &pag, &agi_bp, &allocmask);
 		if (ret != 1)
 			return ret;
 
@@ -416,21 +452,74 @@ xchk_iscan_iter(
 		}
 
-		ret = xchk_iscan_iget(iscan, pag, agi_bp, ipp);
+		ret = xchk_iscan_iget(iscan, pag, agi_bp, allocmask);
 	} while (ret == -EAGAIN);
 
-	if (!ret)
-		return 1;
-
 	return ret;
 }
 
+/*
+ * Advance the inode scan cursor to the next allocated inode and return the
+ * incore inode structure associated with it.
+ *
+ * Returns 1 if there's a new inode to examine, 0 if we've run out of inodes,
+ * -ECANCELED if the live scan aborted, -EBUSY if the incore inode could not be
+ * grabbed, or the usual negative errno.
+ *
+ * If the function returns -EBUSY and the caller can handle skipping an inode,
+ * it may call this function again to continue the scan with the next allocated
+ * inode.
+ */
+int
+xchk_iscan_iter(
+	struct xchk_iscan	*iscan,
+	struct xfs_inode	**ipp)
+{
+	unsigned int		i;
+	int			error;
+
+	/* Find a cached inode, or go get another batch. */
+	for (i = 0; i < XFS_INODES_PER_CHUNK; i++) {
+		if (iscan->__inodes[i])
+			goto foundit;
+	}
+
+	error = xchk_iscan_iter_batch(iscan);
+	if (error <= 0)
+		return error;
+
+	ASSERT(iscan->__inodes[0] != NULL);
+	i = 0;
+
+foundit:
+	/* Give the caller our reference. */
+	*ipp = iscan->__inodes[i];
+	iscan->__inodes[i] = NULL;
+	return 1;
+}
+
+/* Clean up an xchk_iscan_iter call by dropping any inodes that we still hold. */
+void
+xchk_iscan_iter_finish(
+	struct xchk_iscan	*iscan)
+{
+	struct xfs_scrub	*sc = iscan->sc;
+	unsigned int		i;
+
+	for (i = 0; i < XFS_INODES_PER_CHUNK; i++) {
+		if (iscan->__inodes[i]) {
+			xchk_irele(sc, iscan->__inodes[i]);
+			iscan->__inodes[i] = NULL;
+		}
+	}
+}
 /* Mark this inode scan finished and release resources.
*/ void xchk_iscan_teardown( struct xchk_iscan *iscan) { + xchk_iscan_iter_finish(iscan); xchk_iscan_finish(iscan); mutex_destroy(&iscan->lock); } @@ -478,6 +567,7 @@ xchk_iscan_start( iscan->cursor_ino = start_ino; iscan->scan_start_ino = start_ino; mutex_init(&iscan->lock); + memset(iscan->__inodes, 0, sizeof(iscan->__inodes)); trace_xchk_iscan_start(iscan, start_ino); } @@ -523,6 +613,15 @@ xchk_iscan_want_live_update( goto unlock; } + /* + * No inodes have been visited yet, so the visited cursor points at the + * start of the scan range. The caller should not receive any updates. + */ + if (iscan->scan_start_ino == iscan->__visited_ino) { + ret = false; + goto unlock; + } + /* * The visited cursor hasn't yet wrapped around the end of the FS. If * @ino is inside the starred range, the caller should receive updates: diff --git a/fs/xfs/scrub/iscan.h b/fs/xfs/scrub/iscan.h index 0db97d98ee8d..f7317af807dd 100644 --- a/fs/xfs/scrub/iscan.h +++ b/fs/xfs/scrub/iscan.h @@ -41,6 +41,12 @@ struct xchk_iscan { /* Wait this many ms to retry an iget. */ unsigned int iget_retry_delay; + + /* + * The scan grabs batches of inodes and stashes them here before + * handing them out with _iter. + */ + struct xfs_inode *__inodes[XFS_INODES_PER_CHUNK]; }; /* Set if the scan has been aborted due to some event in the fs. */ @@ -63,6 +69,7 @@ void xchk_iscan_start(struct xfs_scrub *sc, unsigned int iget_timeout, void xchk_iscan_teardown(struct xchk_iscan *iscan); int xchk_iscan_iter(struct xchk_iscan *iscan, struct xfs_inode **ipp); +void xchk_iscan_iter_finish(struct xchk_iscan *iscan); void xchk_iscan_mark_visited(struct xchk_iscan *iscan, struct xfs_inode *ip); bool xchk_iscan_want_live_update(struct xchk_iscan *iscan, xfs_ino_t ino); diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index 5a70968bc3e2..38d3356466cd 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -1227,6 +1227,29 @@ TRACE_EVENT(xchk_iscan_iget, __entry->error) ); +TRACE_EVENT(xchk_iscan_iget_batch, + TP_PROTO(struct xfs_mount *mp, struct xchk_iscan *iscan, + unsigned int nr), + TP_ARGS(mp, iscan, nr), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, cursor) + __field(xfs_ino_t, visited) + __field(unsigned int, nr) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->cursor = iscan->cursor_ino; + __entry->visited = iscan->__visited_ino; + __entry->nr = nr; + ), + TP_printk("dev %d:%d iscan cursor 0x%llx visited 0x%llx nr %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->cursor, + __entry->visited, + __entry->nr) +); + TRACE_EVENT(xchk_iscan_iget_retry_wait, TP_PROTO(struct xchk_iscan *iscan), TP_ARGS(iscan), -- cgit v1.2.3 From 82334a79c6eb1c18b4b38285cf2693bbc5db3933 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 22 Feb 2024 12:30:48 -0800 Subject: xfs: iscan batching should handle unallocated inodes too The inode scanner tries to reduce contention on the AGI header buffer lock by grabbing references to consecutive allocated inodes. Batching stops as soon as we encounter an unallocated inode. This is unfortunate because in the worst case performance collapses to the old "one at a time" behavior if every other inode is free. This is correct behavior, but we could do better. Unallocated inodes by definition have nothing to scan, which means the iscan can ignore them as long as someone ensures that the scan data will reflect another thread allocating the inode and adding interesting metadata to that inode. That mechanism is, of course, the live update hooks. 
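For a concrete example of what goes wrong otherwise: suppose a batch loads an inobt record covering inodes 160-223 while inode 162 is free. The scanner has nothing to iget at 162, but if another thread allocates it the moment the AGI is released, the new inode's metadata must still reach the scan data -- and the visited cursor has already swept past it, so only the live update hooks can deliver it.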
Therefore, extend the batching mechanism to track unallocated inodes adjacent to the scan cursor.  The _want_live_update predicate can tell the caller's live update hook to incorporate all live updates to what the scanner thinks is an unallocated inode if (after dropping the AGI) some other thread allocates one of those inodes and begins using it.

Note that we cannot just copy the ir_free bitmap into the scan cursor because the batching stops if iget says the inode is in an intermediate state (e.g. on the inactivation list) and cannot be igrabbed.

Signed-off-by: Darrick J. Wong
Reviewed-by: Christoph Hellwig
---
 fs/xfs/scrub/iscan.c | 107 ++++++++++++++++++++++++++++++++++++++++++-----------
 fs/xfs/scrub/iscan.h |   6 ++-
 fs/xfs/scrub/trace.h |  21 ++++++++--
 3 files changed, 119 insertions(+), 15 deletions(-)

diff --git a/fs/xfs/scrub/iscan.c b/fs/xfs/scrub/iscan.c
index d1c33aba1f10..327f7fb83958 100644
--- a/fs/xfs/scrub/iscan.c
+++ b/fs/xfs/scrub/iscan.c
@@ -61,7 +61,8 @@ xchk_iscan_find_next(
 	struct xfs_buf		*agi_bp,
 	struct xfs_perag	*pag,
 	xfs_inofree_t		*allocmaskp,
-	xfs_agino_t		*cursor)
+	xfs_agino_t		*cursor,
+	uint8_t			*nr_inodesp)
 {
 	struct xfs_scrub	*sc = iscan->sc;
 	struct xfs_inobt_rec_incore	rec;
@@ -147,6 +148,7 @@ xchk_iscan_find_next(
 			ASSERT(next >= 0);
 			*cursor = rec.ir_startino + next;
 			*allocmaskp = allocmask >> next;
+			*nr_inodesp = XFS_INODES_PER_CHUNK - next;
 			break;
 		}
 	}
@@ -228,7 +230,8 @@ xchk_iscan_advance(
 	struct xchk_iscan	*iscan,
 	struct xfs_perag	**pagp,
 	struct xfs_buf		**agi_bpp,
-	xfs_inofree_t		*allocmaskp)
+	xfs_inofree_t		*allocmaskp,
+	uint8_t			*nr_inodesp)
 {
 	struct xfs_scrub	*sc = iscan->sc;
 	struct xfs_mount	*mp = sc->mp;
@@ -255,7 +258,7 @@ xchk_iscan_advance(
 		agino = XFS_INO_TO_AGINO(mp, iscan->cursor_ino);
 		ret = xchk_iscan_find_next(iscan, agi_bp, pag, allocmaskp,
-				&agino);
+				&agino, nr_inodesp);
 		if (ret)
 			goto out_buf;
 
@@ -344,12 +347,14 @@ xchk_iscan_iget(
 	struct xchk_iscan	*iscan,
 	struct xfs_perag	*pag,
 	struct xfs_buf		*agi_bp,
-	xfs_inofree_t		allocmask)
+	xfs_inofree_t		allocmask,
+	uint8_t			nr_inodes)
 {
 	struct xfs_scrub	*sc = iscan->sc;
 	struct xfs_mount	*mp = sc->mp;
 	xfs_ino_t		ino = iscan->cursor_ino;
 	unsigned int		idx = 0;
+	unsigned int		i;
 	int			error;
 
 	ASSERT(iscan->__inodes[0] == NULL);
@@ -399,10 +404,28 @@ xchk_iscan_iget(
 	/*
 	 * Now that we've filled the first slot in __inodes, try to fill the
 	 * rest of the batch with consecutively ordered inodes to reduce the
-	 * number of _iter calls.  If we can't get an inode, we stop and return
-	 * what we have.
+	 * number of _iter calls.  Make a bitmap of unallocated inodes from the
+	 * zeroes in the inuse bitmap; these inodes will not be scanned, but
+	 * the _want_live_update predicate will pass through all live updates.
+	 *
+	 * If we can't iget an allocated inode, stop and return what we have.
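A standalone model of the mask carving described above (plain C; slot 0 is the cursor inode and is always allocated here):

#include <stdint.h>

/* Record the zero bits of the batch's inuse mask as "skipped" slots. */
static uint64_t
model_skip_mask(uint64_t inuse, unsigned int nr_inodes)
{
	uint64_t	skipped = 0;
	unsigned int	i;

	for (i = 1; i < nr_inodes; i++) {
		inuse >>= 1;
		if (!(inuse & 1))
			skipped |= 1ULL << i;
	}
	return skipped;
}

A later allocation of batch_ino + i is then recognized simply by testing bit i of this mask.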
*/ - for (; allocmask & 1; allocmask >>= 1, ino++, idx++) { + mutex_lock(&iscan->lock); + iscan->__batch_ino = ino - 1; + iscan->__skipped_inomask = 0; + mutex_unlock(&iscan->lock); + + for (i = 1; i < nr_inodes; i++, ino++, allocmask >>= 1) { + if (!(allocmask & 1)) { + ASSERT(!(iscan->__skipped_inomask & (1ULL << i))); + + mutex_lock(&iscan->lock); + iscan->cursor_ino = ino; + iscan->__skipped_inomask |= (1ULL << i); + mutex_unlock(&iscan->lock); + continue; + } + ASSERT(iscan->__inodes[idx] == NULL); error = xfs_iget(sc->mp, sc->tp, ino, XFS_IGET_NORETRY, 0, @@ -413,14 +436,42 @@ xchk_iscan_iget( mutex_lock(&iscan->lock); iscan->cursor_ino = ino; mutex_unlock(&iscan->lock); + idx++; } - trace_xchk_iscan_iget_batch(sc->mp, iscan, idx); + trace_xchk_iscan_iget_batch(sc->mp, iscan, nr_inodes, idx); xfs_trans_brelse(sc->tp, agi_bp); xfs_perag_put(pag); return idx; } +/* + * Advance the visit cursor to reflect skipped inodes beyond whatever we + * scanned. + */ +STATIC void +xchk_iscan_finish_batch( + struct xchk_iscan *iscan) +{ + xfs_ino_t highest_skipped; + + mutex_lock(&iscan->lock); + + if (iscan->__batch_ino != NULLFSINO) { + highest_skipped = iscan->__batch_ino + + xfs_highbit64(iscan->__skipped_inomask); + iscan->__visited_ino = max(iscan->__visited_ino, + highest_skipped); + + trace_xchk_iscan_skip(iscan); + } + + iscan->__batch_ino = NULLFSINO; + iscan->__skipped_inomask = 0; + + mutex_unlock(&iscan->lock); +} + /* * Advance the inode scan cursor to the next allocated inode and return up to * 64 consecutive allocated inodes starting with the cursor position. @@ -432,6 +483,8 @@ xchk_iscan_iter_batch( struct xfs_scrub *sc = iscan->sc; int ret; + xchk_iscan_finish_batch(iscan); + if (iscan->iget_timeout) iscan->__iget_deadline = jiffies + msecs_to_jiffies(iscan->iget_timeout); @@ -440,8 +493,10 @@ xchk_iscan_iter_batch( struct xfs_buf *agi_bp = NULL; struct xfs_perag *pag = NULL; xfs_inofree_t allocmask = 0; + uint8_t nr_inodes = 0; - ret = xchk_iscan_advance(iscan, &pag, &agi_bp, &allocmask); + ret = xchk_iscan_advance(iscan, &pag, &agi_bp, &allocmask, + &nr_inodes); if (ret != 1) return ret; @@ -452,7 +507,7 @@ xchk_iscan_iter_batch( break; } - ret = xchk_iscan_iget(iscan, pag, agi_bp, allocmask); + ret = xchk_iscan_iget(iscan, pag, agi_bp, allocmask, nr_inodes); } while (ret == -EAGAIN); return ret; @@ -559,6 +614,9 @@ xchk_iscan_start( start_ino = xchk_iscan_rotor(sc->mp); + iscan->__batch_ino = NULLFSINO; + iscan->__skipped_inomask = 0; + iscan->sc = sc; clear_bit(XCHK_ISCAN_OPSTATE_ABORTED, &iscan->__opstate); iscan->iget_timeout = iget_timeout; @@ -587,6 +645,26 @@ xchk_iscan_mark_visited( mutex_unlock(&iscan->lock); } +/* + * Did we skip this inode because it wasn't allocated when we loaded the batch? + * If so, it is newly allocated and will not be scanned. All live updates to + * this inode must be passed to the caller to maintain scan correctness. + */ +static inline bool +xchk_iscan_skipped( + const struct xchk_iscan *iscan, + xfs_ino_t ino) +{ + if (iscan->__batch_ino == NULLFSINO) + return false; + if (ino < iscan->__batch_ino) + return false; + if (ino >= iscan->__batch_ino + XFS_INODES_PER_CHUNK) + return false; + + return iscan->__skipped_inomask & (1ULL << (ino - iscan->__batch_ino)); +} + /* * Do we need a live update for this inode? This is true if the scanner thread * has visited this inode and the scan hasn't been aborted due to errors. 
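Setting aside the sentinels and the skip mask, the coverage test is a classic wrapped-interval check; a standalone model (plain C, not the kernel code):

#include <stdbool.h>
#include <stdint.h>

/* Has the scan already swept past @ino, given a possibly wrapped range? */
static bool
model_scan_covers(uint64_t start, uint64_t visited, uint64_t ino)
{
	if (start <= visited)	/* 0 ---- S ***** V ---- EOFS */
		return ino >= start && ino <= visited;
	/* wrapped:		   0 **** V ----- S **** EOFS */
	return ino >= start || ino <= visited;
}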
@@ -622,6 +700,15 @@ xchk_iscan_want_live_update( goto unlock; } + /* + * This inode was not allocated at the time of the iscan batch. + * The caller should receive all updates. + */ + if (xchk_iscan_skipped(iscan, ino)) { + ret = true; + goto unlock; + } + /* * The visited cursor hasn't yet wrapped around the end of the FS. If * @ino is inside the starred range, the caller should receive updates: diff --git a/fs/xfs/scrub/iscan.h b/fs/xfs/scrub/iscan.h index f7317af807dd..365d54e35cd9 100644 --- a/fs/xfs/scrub/iscan.h +++ b/fs/xfs/scrub/iscan.h @@ -44,8 +44,12 @@ struct xchk_iscan { /* * The scan grabs batches of inodes and stashes them here before - * handing them out with _iter. + * handing them out with _iter. Unallocated inodes are set in the + * mask so that all updates to that inode are selected for live + * update propagation. */ + xfs_ino_t __batch_ino; + xfs_inofree_t __skipped_inomask; struct xfs_inode *__inodes[XFS_INODES_PER_CHUNK]; }; diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index 38d3356466cd..829c90da59c7 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -1172,6 +1172,7 @@ DEFINE_EVENT(xchk_iscan_class, name, \ TP_ARGS(iscan)) DEFINE_ISCAN_EVENT(xchk_iscan_move_cursor); DEFINE_ISCAN_EVENT(xchk_iscan_visit); +DEFINE_ISCAN_EVENT(xchk_iscan_skip); DEFINE_ISCAN_EVENT(xchk_iscan_advance_ag); DECLARE_EVENT_CLASS(xchk_iscan_ino_class, @@ -1229,25 +1230,37 @@ TRACE_EVENT(xchk_iscan_iget, TRACE_EVENT(xchk_iscan_iget_batch, TP_PROTO(struct xfs_mount *mp, struct xchk_iscan *iscan, - unsigned int nr), - TP_ARGS(mp, iscan, nr), + unsigned int nr, unsigned int avail), + TP_ARGS(mp, iscan, nr, avail), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, cursor) __field(xfs_ino_t, visited) __field(unsigned int, nr) + __field(unsigned int, avail) + __field(unsigned int, unavail) + __field(xfs_ino_t, batch_ino) + __field(unsigned long long, skipmask) ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; __entry->cursor = iscan->cursor_ino; __entry->visited = iscan->__visited_ino; __entry->nr = nr; + __entry->avail = avail; + __entry->unavail = hweight64(iscan->__skipped_inomask); + __entry->batch_ino = iscan->__batch_ino; + __entry->skipmask = iscan->__skipped_inomask; ), - TP_printk("dev %d:%d iscan cursor 0x%llx visited 0x%llx nr %d", + TP_printk("dev %d:%d iscan cursor 0x%llx visited 0x%llx batchino 0x%llx skipmask 0x%llx nr %u avail %u unavail %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->cursor, __entry->visited, - __entry->nr) + __entry->batch_ino, + __entry->skipmask, + __entry->nr, + __entry->avail, + __entry->unavail) ); TRACE_EVENT(xchk_iscan_iget_retry_wait, -- cgit v1.2.3 From e99bfc9e687e208d4ba7e85167b8753e80cf4169 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 22 Feb 2024 12:30:48 -0800 Subject: xfs: create a static name for the dot entry too Create an xfs_name_dot object so that upcoming scrub code can compare against that. Offline repair already has such an object, so we're really just hoisting it to the kernel. Signed-off-by: Darrick J. 
Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_dir2.c | 6 ++++++ fs/xfs/libxfs/xfs_dir2.h | 1 + 2 files changed, 7 insertions(+) diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index 8c9403b33191..9018abb38d7f 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -25,6 +25,12 @@ const struct xfs_name xfs_name_dotdot = { .type = XFS_DIR3_FT_DIR, }; +const struct xfs_name xfs_name_dot = { + .name = (const unsigned char *)".", + .len = 1, + .type = XFS_DIR3_FT_DIR, +}; + /* * Convert inode mode to directory entry filetype */ diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h index 19af22a16c41..7d7cd8d808e4 100644 --- a/fs/xfs/libxfs/xfs_dir2.h +++ b/fs/xfs/libxfs/xfs_dir2.h @@ -22,6 +22,7 @@ struct xfs_dir3_icfree_hdr; struct xfs_dir3_icleaf_hdr; extern const struct xfs_name xfs_name_dotdot; +extern const struct xfs_name xfs_name_dot; /* * Convert inode mode to directory entry filetype -- cgit v1.2.3 From d9c0775897147bab54410611ac2659a7477c770c Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 22 Feb 2024 12:30:49 -0800 Subject: xfs: create a predicate to determine if two xfs_names are the same Create a simple predicate to determine if two xfs_names are the same objects or have the exact same name. The comparison is always case sensitive. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_dir2.h | 12 ++++++++++++ fs/xfs/scrub/dir.c | 4 ++-- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h index 7d7cd8d808e4..8497d041f316 100644 --- a/fs/xfs/libxfs/xfs_dir2.h +++ b/fs/xfs/libxfs/xfs_dir2.h @@ -24,6 +24,18 @@ struct xfs_dir3_icleaf_hdr; extern const struct xfs_name xfs_name_dotdot; extern const struct xfs_name xfs_name_dot; +static inline bool +xfs_dir2_samename( + const struct xfs_name *n1, + const struct xfs_name *n2) +{ + if (n1 == n2) + return true; + if (n1->len != n2->len) + return false; + return !memcmp(n1->name, n2->name, n1->len); +} + /* * Convert inode mode to directory entry filetype */ diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c index d86ab51af928..076a310b8eb0 100644 --- a/fs/xfs/scrub/dir.c +++ b/fs/xfs/scrub/dir.c @@ -93,11 +93,11 @@ xchk_dir_actor( return -ECANCELED; } - if (!strncmp(".", name->name, name->len)) { + if (xfs_dir2_samename(name, &xfs_name_dot)) { /* If this is "." then check that the inum matches the dir. */ if (ino != dp->i_ino) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); - } else if (!strncmp("..", name->name, name->len)) { + } else if (xfs_dir2_samename(name, &xfs_name_dotdot)) { /* * If this is ".." in the root inode, check that the inum * matches this dir. -- cgit v1.2.3 From 3c79e6a87221e063064e3680946a8b4bcd9fe78d Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 22 Feb 2024 12:30:50 -0800 Subject: xfs: create a macro for decoding ftypes in tracepoints Create the XFS_DIR3_FTYPE_STR macro so that we can report ftype as strings instead of numbers in tracepoints. Signed-off-by: Darrick J. 
Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_da_format.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h index 24f9d1461f9a..060e5c96b70f 100644 --- a/fs/xfs/libxfs/xfs_da_format.h +++ b/fs/xfs/libxfs/xfs_da_format.h @@ -159,6 +159,17 @@ struct xfs_da3_intnode { #define XFS_DIR3_FT_MAX 9 +#define XFS_DIR3_FTYPE_STR \ + { XFS_DIR3_FT_UNKNOWN, "unknown" }, \ + { XFS_DIR3_FT_REG_FILE, "file" }, \ + { XFS_DIR3_FT_DIR, "directory" }, \ + { XFS_DIR3_FT_CHRDEV, "char" }, \ + { XFS_DIR3_FT_BLKDEV, "block" }, \ + { XFS_DIR3_FT_FIFO, "fifo" }, \ + { XFS_DIR3_FT_SOCK, "sock" }, \ + { XFS_DIR3_FT_SYMLINK, "symlink" }, \ + { XFS_DIR3_FT_WHT, "whiteout" } + /* * Byte offset in data block and shortform entry. */ -- cgit v1.2.3 From 3d8f1426977f1bf10f867bcd26df6518ae6c2b2c Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 22 Feb 2024 12:30:51 -0800 Subject: xfs: report the health of quota counts Report the health of quota counts. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_fs.h | 1 + fs/xfs/libxfs/xfs_health.h | 4 +++- fs/xfs/xfs_health.c | 1 + fs/xfs/xfs_qm.c | 7 ++++++- fs/xfs/xfs_trans_dquot.c | 2 ++ 5 files changed, 13 insertions(+), 2 deletions(-) diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index 6360073865db..711e0fc7efab 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -195,6 +195,7 @@ struct xfs_fsop_geom { #define XFS_FSOP_GEOM_SICK_PQUOTA (1 << 3) /* project quota */ #define XFS_FSOP_GEOM_SICK_RT_BITMAP (1 << 4) /* realtime bitmap */ #define XFS_FSOP_GEOM_SICK_RT_SUMMARY (1 << 5) /* realtime summary */ +#define XFS_FSOP_GEOM_SICK_QUOTACHECK (1 << 6) /* quota counts */ /* Output for XFS_FS_COUNTS */ typedef struct xfs_fsop_counts { diff --git a/fs/xfs/libxfs/xfs_health.h b/fs/xfs/libxfs/xfs_health.h index 6296993ff8f3..5626e53b3f0f 100644 --- a/fs/xfs/libxfs/xfs_health.h +++ b/fs/xfs/libxfs/xfs_health.h @@ -41,6 +41,7 @@ struct xfs_fsop_geom; #define XFS_SICK_FS_UQUOTA (1 << 1) /* user quota */ #define XFS_SICK_FS_GQUOTA (1 << 2) /* group quota */ #define XFS_SICK_FS_PQUOTA (1 << 3) /* project quota */ +#define XFS_SICK_FS_QUOTACHECK (1 << 4) /* quota counts */ /* Observable health issues for realtime volume metadata. */ #define XFS_SICK_RT_BITMAP (1 << 0) /* realtime bitmap */ @@ -77,7 +78,8 @@ struct xfs_fsop_geom; #define XFS_SICK_FS_PRIMARY (XFS_SICK_FS_COUNTERS | \ XFS_SICK_FS_UQUOTA | \ XFS_SICK_FS_GQUOTA | \ - XFS_SICK_FS_PQUOTA) + XFS_SICK_FS_PQUOTA | \ + XFS_SICK_FS_QUOTACHECK) #define XFS_SICK_RT_PRIMARY (XFS_SICK_RT_BITMAP | \ XFS_SICK_RT_SUMMARY) diff --git a/fs/xfs/xfs_health.c b/fs/xfs/xfs_health.c index 9a57afee9338..ef07af9f753d 100644 --- a/fs/xfs/xfs_health.c +++ b/fs/xfs/xfs_health.c @@ -280,6 +280,7 @@ static const struct ioctl_sick_map fs_map[] = { { XFS_SICK_FS_UQUOTA, XFS_FSOP_GEOM_SICK_UQUOTA }, { XFS_SICK_FS_GQUOTA, XFS_FSOP_GEOM_SICK_GQUOTA }, { XFS_SICK_FS_PQUOTA, XFS_FSOP_GEOM_SICK_PQUOTA }, + { XFS_SICK_FS_QUOTACHECK, XFS_FSOP_GEOM_SICK_QUOTACHECK }, { 0, 0 }, }; diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index b5b555698ae1..dca2f60b2e30 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -26,6 +26,7 @@ #include "xfs_ag.h" #include "xfs_ialloc.h" #include "xfs_log_priv.h" +#include "xfs_health.h" /* * The global quota manager. 
There is only one of these for the entire @@ -1408,8 +1409,12 @@ error_return: xfs_warn(mp, "Quotacheck: Failed to reset quota flags."); } - } else + xfs_fs_mark_sick(mp, XFS_SICK_FS_QUOTACHECK); + } else { xfs_notice(mp, "Quotacheck: Done."); + xfs_fs_mark_healthy(mp, XFS_SICK_FS_QUOTACHECK); + } + return error; error_purge: diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index 9c159d016ecf..30bde11aa1c2 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -17,6 +17,7 @@ #include "xfs_qm.h" #include "xfs_trace.h" #include "xfs_error.h" +#include "xfs_health.h" STATIC void xfs_trans_alloc_dqinfo(xfs_trans_t *); @@ -706,6 +707,7 @@ error_return: error_corrupt: xfs_dqunlock(dqp); xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + xfs_fs_mark_sick(mp, XFS_SICK_FS_QUOTACHECK); return -EFSCORRUPTED; } -- cgit v1.2.3 From 5385f1a60d4e5b73e8ecd2757865352b68f54fb9 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 22 Feb 2024 12:30:51 -0800 Subject: xfs: repair file modes by scanning for a dirent pointing to us Repair might encounter an inode with a totally garbage i_mode. To fix this problem, we have to figure out if the file was a regular file, a directory, or a special file. One way to figure this out is to check if there are any directories with entries pointing down to the busted file. This patch recovers the file mode by scanning every directory entry on the filesystem to see if there are any that point to the busted file. If the ftype of all such dirents are consistent, the mode is recovered from the ftype. If no dirents are found, the file becomes a regular file. In all cases, ACLs are canceled and the file is made accessible only by root. A previous patch attempted to guess the mode by reading the beginning of the file data. This was rejected by Christoph on the grounds that we cannot trust user-controlled data blocks. Users do not have direct control over the ondisk contents of directory entries, so this method should be much safer. If all the dirents have the same ftype, then we can translate that back into an S_IFMT flag and fix the file. If not, reset the mode to S_IFREG. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/inode_repair.c | 236 ++++++++++++++++++++++++++++++++++++++++++-- fs/xfs/scrub/iscan.c | 29 ++++++ fs/xfs/scrub/iscan.h | 3 + fs/xfs/scrub/trace.c | 1 + fs/xfs/scrub/trace.h | 49 +++++++++ 5 files changed, 312 insertions(+), 6 deletions(-) diff --git a/fs/xfs/scrub/inode_repair.c b/fs/xfs/scrub/inode_repair.c index 0ca62d59f84a..7e859c412a5b 100644 --- a/fs/xfs/scrub/inode_repair.c +++ b/fs/xfs/scrub/inode_repair.c @@ -43,6 +43,8 @@ #include "scrub/btree.h" #include "scrub/trace.h" #include "scrub/repair.h" +#include "scrub/iscan.h" +#include "scrub/readdir.h" /* * Inode Record Repair @@ -126,6 +128,10 @@ struct xrep_inode { /* Must we remove all access from this file? */ bool zap_acls; + + /* Inode scanner to see if we can find the ftype from dirents */ + struct xchk_iscan ftype_iscan; + uint8_t alleged_ftype; }; /* @@ -227,26 +233,233 @@ xrep_dinode_header( dip->di_gen = cpu_to_be32(sc->sm->sm_gen); } -/* Turn di_mode into /something/ recognizable. */ -STATIC void +/* + * If this directory entry points to the scrub target inode, then the directory + * we're scanning is the parent of the scrub target inode. 
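The translation from a recovered dirent ftype back to an S_IFMT value, performed at the end of this patch, is mechanical; a standalone model using the XFS_DIR3_FT_* ordering from the tracepoint patch above (plain C, illustrative values):

#include <sys/stat.h>

/* Mirrors XFS_DIR3_FT_*: unknown, file, dir, char, block, fifo,
 * sock, symlink, whiteout. */
enum { FT_UNKNOWN, FT_REG_FILE, FT_DIR, FT_CHRDEV, FT_BLKDEV,
       FT_FIFO, FT_SOCK, FT_SYMLINK, FT_WHT };

static unsigned short
model_ftype_to_mode(unsigned int ftype)
{
	switch (ftype) {
	case FT_DIR:		return S_IFDIR;
	case FT_WHT:
	case FT_CHRDEV:		return S_IFCHR;
	case FT_BLKDEV:		return S_IFBLK;
	case FT_FIFO:		return S_IFIFO;
	case FT_SOCK:		return S_IFSOCK;
	case FT_SYMLINK:	return S_IFLNK;
	default:		return S_IFREG;	/* unknown -> regular file */
	}
}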
+ */ +STATIC int +xrep_dinode_findmode_dirent( + struct xfs_scrub *sc, + struct xfs_inode *dp, + xfs_dir2_dataptr_t dapos, + const struct xfs_name *name, + xfs_ino_t ino, + void *priv) +{ + struct xrep_inode *ri = priv; + int error = 0; + + if (xchk_should_terminate(ri->sc, &error)) + return error; + + if (ino != sc->sm->sm_ino) + return 0; + + /* Ignore garbage directory entry names. */ + if (name->len == 0 || !xfs_dir2_namecheck(name->name, name->len)) + return -EFSCORRUPTED; + + /* Don't pick up dot or dotdot entries; we only want child dirents. */ + if (xfs_dir2_samename(name, &xfs_name_dotdot) || + xfs_dir2_samename(name, &xfs_name_dot)) + return 0; + + /* + * Uhoh, more than one parent for this inode and they don't agree on + * the file type? + */ + if (ri->alleged_ftype != XFS_DIR3_FT_UNKNOWN && + ri->alleged_ftype != name->type) { + trace_xrep_dinode_findmode_dirent_inval(ri->sc, dp, name->type, + ri->alleged_ftype); + return -EFSCORRUPTED; + } + + /* We found a potential parent; remember the ftype. */ + trace_xrep_dinode_findmode_dirent(ri->sc, dp, name->type); + ri->alleged_ftype = name->type; + return 0; +} + +/* + * If this is a directory, walk the dirents looking for any that point to the + * scrub target inode. + */ +STATIC int +xrep_dinode_findmode_walk_directory( + struct xrep_inode *ri, + struct xfs_inode *dp) +{ + struct xfs_scrub *sc = ri->sc; + unsigned int lock_mode; + int error = 0; + + /* + * Scan the directory to see if it contains an entry pointing to + * the inode that we are repairing. + */ + lock_mode = xfs_ilock_data_map_shared(dp); + + /* + * If this directory is known to be sick, we cannot scan it reliably + * and must abort. + */ + if (xfs_inode_has_sickness(dp, XFS_SICK_INO_CORE | + XFS_SICK_INO_BMBTD | + XFS_SICK_INO_DIR)) { + error = -EFSCORRUPTED; + goto out_unlock; + } + + /* + * We cannot complete our parent pointer scan if a directory looks as + * though it has been zapped by the inode record repair code. + */ + if (xchk_dir_looks_zapped(dp)) { + error = -EBUSY; + goto out_unlock; + } + + error = xchk_dir_walk(sc, dp, xrep_dinode_findmode_dirent, ri); + if (error) + goto out_unlock; + +out_unlock: + xfs_iunlock(dp, lock_mode); + return error; +} + +/* + * Try to find the mode of the inode being repaired by looking for directories + * that point down to this file. + */ +STATIC int +xrep_dinode_find_mode( + struct xrep_inode *ri, + uint16_t *mode) +{ + struct xfs_scrub *sc = ri->sc; + struct xfs_inode *dp; + int error; + + /* No ftype means we have no other metadata to consult. */ + if (!xfs_has_ftype(sc->mp)) { + *mode = S_IFREG; + return 0; + } + + /* + * Scan all directories for parents that might point down to this + * inode. Skip the inode being repaired during the scan since it + * cannot be its own parent. Note that we still hold the AGI locked + * so there's a real possibility that _iscan_iter can return EBUSY.
+ */ + xchk_iscan_start(sc, 5000, 100, &ri->ftype_iscan); + ri->ftype_iscan.skip_ino = sc->sm->sm_ino; + ri->alleged_ftype = XFS_DIR3_FT_UNKNOWN; + while ((error = xchk_iscan_iter(&ri->ftype_iscan, &dp)) == 1) { + if (S_ISDIR(VFS_I(dp)->i_mode)) + error = xrep_dinode_findmode_walk_directory(ri, dp); + xchk_iscan_mark_visited(&ri->ftype_iscan, dp); + xchk_irele(sc, dp); + if (error < 0) + break; + if (xchk_should_terminate(sc, &error)) + break; + } + xchk_iscan_iter_finish(&ri->ftype_iscan); + xchk_iscan_teardown(&ri->ftype_iscan); + + if (error == -EBUSY) { + if (ri->alleged_ftype != XFS_DIR3_FT_UNKNOWN) { + /* + * If we got an EBUSY after finding at least one + * dirent, that means the scan found an inode on the + * inactivation list and could not open it. Accept the + * alleged ftype and install a new mode below. + */ + error = 0; + } else if (!(sc->flags & XCHK_TRY_HARDER)) { + /* + * Otherwise, retry the operation one time to see if + * the reason for the delay is an inode from the same + * cluster buffer waiting on the inactivation list. + */ + error = -EDEADLOCK; + } + } + if (error) + return error; + + /* + * Convert the discovered ftype into the file mode. If all else fails, + * return S_IFREG. + */ + switch (ri->alleged_ftype) { + case XFS_DIR3_FT_DIR: + *mode = S_IFDIR; + break; + case XFS_DIR3_FT_WHT: + case XFS_DIR3_FT_CHRDEV: + *mode = S_IFCHR; + break; + case XFS_DIR3_FT_BLKDEV: + *mode = S_IFBLK; + break; + case XFS_DIR3_FT_FIFO: + *mode = S_IFIFO; + break; + case XFS_DIR3_FT_SOCK: + *mode = S_IFSOCK; + break; + case XFS_DIR3_FT_SYMLINK: + *mode = S_IFLNK; + break; + default: + *mode = S_IFREG; + break; + } + return 0; +} + +/* Turn di_mode into /something/ recognizable. Returns 0 if we succeed. */ +STATIC int xrep_dinode_mode( struct xrep_inode *ri, struct xfs_dinode *dip) { struct xfs_scrub *sc = ri->sc; uint16_t mode = be16_to_cpu(dip->di_mode); + int error; trace_xrep_dinode_mode(sc, dip); if (mode == 0 || xfs_mode_to_ftype(mode) != XFS_DIR3_FT_UNKNOWN) - return; + return 0; + + /* Try to fix the mode. If we cannot, then leave everything alone. */ + error = xrep_dinode_find_mode(ri, &mode); + switch (error) { + case -EINTR: + case -EBUSY: + case -EDEADLOCK: + /* temporary failure or fatal signal */ + return error; + case 0: + /* found mode */ + break; + default: + /* some other error, assume S_IFREG */ + mode = S_IFREG; + break; + } /* bad mode, so we set it to a file that only root can read */ - mode = S_IFREG; dip->di_mode = cpu_to_be16(mode); dip->di_uid = 0; dip->di_gid = 0; ri->zap_acls = true; + return 0; } /* Fix any conflicting flags that the verifiers complain about. */ @@ -1107,12 +1320,15 @@ xrep_dinode_core( /* Fix everything the verifier will complain about. */ dip = xfs_buf_offset(bp, ri->imap.im_boffset); xrep_dinode_header(sc, dip); - xrep_dinode_mode(ri, dip); + iget_error = xrep_dinode_mode(ri, dip); + if (iget_error) + goto write; xrep_dinode_flags(sc, dip, ri->rt_extents > 0); xrep_dinode_size(ri, dip); xrep_dinode_extsize_hints(sc, dip); xrep_dinode_zap_forks(ri, dip); +write: /* Write out the inode. */ trace_xrep_dinode_fixed(sc, dip); xfs_dinode_calc_crc(sc->mp, dip); @@ -1128,7 +1344,8 @@ xrep_dinode_core( * accessing the inode. If iget fails, we still need to commit the * changes.
*/ - iget_error = xchk_iget(sc, ino, &sc->ip); + if (!iget_error) + iget_error = xchk_iget(sc, ino, &sc->ip); if (!iget_error) xchk_ilock(sc, XFS_IOLOCK_EXCL); @@ -1496,6 +1713,13 @@ xrep_inode( ASSERT(ri != NULL); error = xrep_dinode_problems(ri); + if (error == -EBUSY) { + /* + * Directory scan to recover inode mode encountered a + * busy inode, so we did not continue repairing things. + */ + return 0; + } if (error) return error; diff --git a/fs/xfs/scrub/iscan.c b/fs/xfs/scrub/iscan.c index 327f7fb83958..17af89b519b3 100644 --- a/fs/xfs/scrub/iscan.c +++ b/fs/xfs/scrub/iscan.c @@ -51,6 +51,32 @@ * scanner's observations must be updated. */ +/* + * If the inobt record @rec covers @iscan->skip_ino, mark the inode free so + * that the scan ignores that inode. + */ +STATIC void +xchk_iscan_mask_skipino( + struct xchk_iscan *iscan, + struct xfs_perag *pag, + struct xfs_inobt_rec_incore *rec, + xfs_agino_t lastrecino) +{ + struct xfs_scrub *sc = iscan->sc; + struct xfs_mount *mp = sc->mp; + xfs_agnumber_t skip_agno = XFS_INO_TO_AGNO(mp, iscan->skip_ino); + xfs_agino_t skip_agino = XFS_INO_TO_AGINO(mp, iscan->skip_ino); + + if (pag->pag_agno != skip_agno) + return; + if (skip_agino < rec->ir_startino) + return; + if (skip_agino > lastrecino) + return; + + rec->ir_free |= xfs_inobt_maskn(skip_agino - rec->ir_startino, 1); +} + /* * Set *cursor to the next allocated inode after whatever it's set to now. * If there are no more inodes in this AG, cursor is set to NULLAGINO. @@ -127,6 +153,9 @@ xchk_iscan_find_next( if (rec.ir_startino + XFS_INODES_PER_CHUNK <= agino) continue; + if (iscan->skip_ino) + xchk_iscan_mask_skipino(iscan, pag, &rec, lastino); + /* * If the incoming lookup put us in the middle of an inobt * record, mark it and the previous inodes "free" so that the diff --git a/fs/xfs/scrub/iscan.h b/fs/xfs/scrub/iscan.h index 365d54e35cd9..71f657552dfa 100644 --- a/fs/xfs/scrub/iscan.h +++ b/fs/xfs/scrub/iscan.h @@ -22,6 +22,9 @@ struct xchk_iscan { /* This is the inode that will be examined next. */ xfs_ino_t cursor_ino; + /* If nonzero, skip this inode when scanning.
*/ + xfs_ino_t skip_ino; + /* * This is the last inode that we've successfully scanned, either * because the caller scanned it, or we moved the cursor past an empty diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c index 4542eeebab6f..5ed75cc33b92 100644 --- a/fs/xfs/scrub/trace.c +++ b/fs/xfs/scrub/trace.c @@ -16,6 +16,7 @@ #include "xfs_rtbitmap.h" #include "xfs_quota.h" #include "xfs_quota_defs.h" +#include "xfs_da_format.h" #include "scrub/scrub.h" #include "scrub/xfile.h" #include "scrub/xfarray.h" diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index 829c90da59c7..9aba60c61880 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -1817,6 +1817,55 @@ TRACE_EVENT(xrep_dinode_count_rmaps, __entry->attr_extents) ); +TRACE_EVENT(xrep_dinode_findmode_dirent, + TP_PROTO(struct xfs_scrub *sc, struct xfs_inode *dp, + unsigned int ftype), + TP_ARGS(sc, dp, ftype), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_ino_t, parent_ino) + __field(unsigned int, ftype) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->ino = sc->sm->sm_ino; + __entry->parent_ino = dp->i_ino; + __entry->ftype = ftype; + ), + TP_printk("dev %d:%d ino 0x%llx parent_ino 0x%llx ftype '%s'", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->parent_ino, + __print_symbolic(__entry->ftype, XFS_DIR3_FTYPE_STR)) +); + +TRACE_EVENT(xrep_dinode_findmode_dirent_inval, + TP_PROTO(struct xfs_scrub *sc, struct xfs_inode *dp, + unsigned int ftype, unsigned int found_ftype), + TP_ARGS(sc, dp, ftype, found_ftype), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_ino_t, parent_ino) + __field(unsigned int, ftype) + __field(unsigned int, found_ftype) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->ino = sc->sm->sm_ino; + __entry->parent_ino = dp->i_ino; + __entry->ftype = ftype; + __entry->found_ftype = found_ftype; + ), + TP_printk("dev %d:%d ino 0x%llx parent_ino 0x%llx ftype '%s' found_ftype '%s'", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->parent_ino, + __print_symbolic(__entry->ftype, XFS_DIR3_FTYPE_STR), + __print_symbolic(__entry->found_ftype, XFS_DIR3_FTYPE_STR)) +); + TRACE_EVENT(xrep_cow_mark_file_range, TP_PROTO(struct xfs_inode *ip, xfs_fsblock_t startblock, xfs_fileoff_t startoff, xfs_filblks_t blockcount), -- cgit v1.2.3 From 564fee6d20537eec2be2e74c8d829c654d6d8855 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 22 Feb 2024 12:30:52 -0800 Subject: xfs: create a xchk_trans_alloc_empty helper for scrub Create a helper to initialize empty transactions on behalf of a scrub operation. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/common.c | 9 ++++++++- fs/xfs/scrub/common.h | 1 + fs/xfs/scrub/fscounters.c | 2 +- 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index 81f2b96bb5a7..47557f2a7995 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -653,6 +653,13 @@ xchk_trans_cancel( sc->tp = NULL; } +int +xchk_trans_alloc_empty( + struct xfs_scrub *sc) +{ + return xfs_trans_alloc_empty(sc->mp, &sc->tp); +} + /* * Grab an empty transaction so that we can re-grab locked buffers if * one of our btrees turns out to be cyclic. 
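To make the new helper's calling convention concrete, here is a minimal sketch of a scrub setup step; the function name is hypothetical and not part of this series. Scrubbers that only walk metadata can run under an empty transaction, which holds no log grant space and does not take sb_internal, so a long scan cannot stall log reservation:

	/*
	 * Hypothetical setup fragment illustrating the helper: run a
	 * read-only metadata walk under an empty transaction.  Empty
	 * transactions take no block or log reservations.
	 */
	STATIC int
	xchk_setup_example_walk(
		struct xfs_scrub	*sc)
	{
		return xchk_trans_alloc_empty(sc);
	}
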
@@ -672,7 +679,7 @@ xchk_trans_alloc( return xfs_trans_alloc(sc->mp, &M_RES(sc->mp)->tr_itruncate, resblks, 0, 0, &sc->tp); - return xfs_trans_alloc_empty(sc->mp, &sc->tp); + return xchk_trans_alloc_empty(sc); } /* Set us up with a transaction and an empty context. */ diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h index da09580b454a..d2ca423b02c7 100644 --- a/fs/xfs/scrub/common.h +++ b/fs/xfs/scrub/common.h @@ -32,6 +32,7 @@ xchk_should_terminate( } int xchk_trans_alloc(struct xfs_scrub *sc, uint resblks); +int xchk_trans_alloc_empty(struct xfs_scrub *sc); void xchk_trans_cancel(struct xfs_scrub *sc); bool xchk_process_error(struct xfs_scrub *sc, xfs_agnumber_t agno, diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c index 5799e9a94f1f..893c5a6e3ddb 100644 --- a/fs/xfs/scrub/fscounters.c +++ b/fs/xfs/scrub/fscounters.c @@ -242,7 +242,7 @@ xchk_setup_fscounters( return error; } - return xfs_trans_alloc_empty(sc->mp, &sc->tp); + return xchk_trans_alloc_empty(sc); } /* -- cgit v1.2.3 From ebd610fe82c1d2a94c6fe27cd04d5cf911b5e171 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 22 Feb 2024 12:30:53 -0800 Subject: xfs: create a helper to count per-device inode block usage Create a helper to compute the number of blocks that a file has allocated from the data and realtime volumes. This patch was split out to reduce the size of the upcoming quotacheck patch. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_inode.c | 16 ++++++++++++++++ fs/xfs/xfs_inode.h | 2 ++ 2 files changed, 18 insertions(+) diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 110077ca3d2a..d6635d219527 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -3755,3 +3755,19 @@ xfs_ifork_zapped( return false; } } + +/* Compute the number of data and realtime blocks used by a file. */ +void +xfs_inode_count_blocks( + struct xfs_trans *tp, + struct xfs_inode *ip, + xfs_filblks_t *dblocks, + xfs_filblks_t *rblocks) +{ + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); + + *rblocks = 0; + if (XFS_IS_REALTIME_INODE(ip)) + xfs_bmap_count_leaves(ifp, rblocks); + *dblocks = ip->i_nblocks - *rblocks; +} diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 796d11065fe2..7bbdc7009e7d 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -623,5 +623,7 @@ int xfs_inode_reload_unlinked_bucket(struct xfs_trans *tp, struct xfs_inode *ip) int xfs_inode_reload_unlinked(struct xfs_inode *ip); bool xfs_ifork_zapped(const struct xfs_inode *ip, int whichfork); +void xfs_inode_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip, + xfs_filblks_t *dblocks, xfs_filblks_t *rblocks); #endif /* __XFS_INODE_H__ */ -- cgit v1.2.3 From 5a3ab5849583217090e29299c7bee88b827c12d8 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 22 Feb 2024 12:30:54 -0800 Subject: xfs: create a sparse load xfarray function Create a new method to load an xfarray element from the xfile, but with a twist. If we've never stored to the array index, zero the caller's buffer. This will facilitate RMW updates of records in a sparse array without fuss, since the sparse xfarray convention is that uninitialized array elements default to zeroes. This is a separate patch to reduce the size of the upcoming quotacheck patch. Signed-off-by: Darrick J.
Wong Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/xfarray.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/fs/xfs/scrub/xfarray.h b/fs/xfs/scrub/xfarray.h index ec643cc9fc14..acb2f94c56c1 100644 --- a/fs/xfs/scrub/xfarray.h +++ b/fs/xfs/scrub/xfarray.h @@ -45,6 +45,25 @@ int xfarray_store(struct xfarray *array, xfarray_idx_t idx, const void *ptr); int xfarray_store_anywhere(struct xfarray *array, const void *ptr); bool xfarray_element_is_null(struct xfarray *array, const void *ptr); +/* + * Load an array element, but zero the buffer if there's no data because we + * haven't stored to that array element yet. + */ +static inline int +xfarray_load_sparse( + struct xfarray *array, + uint64_t idx, + void *rec) +{ + int error = xfarray_load(array, idx, rec); + + if (error == -ENODATA) { + memset(rec, 0, array->obj_size); + return 0; + } + return error; +} + /* Append an element to the array. */ static inline int xfarray_append(struct xfarray *array, const void *ptr) { -- cgit v1.2.3 From 48dd9117a34fe9a34a6be0b1dba5694e0f19cbd4 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 22 Feb 2024 12:30:54 -0800 Subject: xfs: implement live quotacheck inode scan Create a new trio of scrub functions to check quota counters. While the dquots themselves are filesystem metadata and should be checked early, the dquot counter values are computed from other metadata and are therefore summary counters. We don't plug these into the scrub dispatch just yet, because we still need to be able to watch quota updates while doing our scan. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/Makefile | 1 + fs/xfs/libxfs/xfs_fs.h | 3 +- fs/xfs/scrub/common.c | 36 +++- fs/xfs/scrub/common.h | 10 + fs/xfs/scrub/health.c | 1 + fs/xfs/scrub/quotacheck.c | 517 ++++++++++++++++++++++++++++++++++++++++++++++ fs/xfs/scrub/quotacheck.h | 67 ++++++ fs/xfs/scrub/scrub.c | 6 + fs/xfs/scrub/scrub.h | 6 + fs/xfs/scrub/stats.c | 1 + fs/xfs/scrub/trace.h | 28 ++- 11 files changed, 672 insertions(+), 4 deletions(-) create mode 100644 fs/xfs/scrub/quotacheck.c create mode 100644 fs/xfs/scrub/quotacheck.h diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 4597c0f19e8e..951404c3e05c 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -180,6 +180,7 @@ xfs-$(CONFIG_XFS_RT) += $(addprefix scrub/, \ xfs-$(CONFIG_XFS_QUOTA) += $(addprefix scrub/, \ dqiterate.o \ quota.o \ + quotacheck.o \ ) # online repair diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index 711e0fc7efab..07acbed9235c 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -710,9 +710,10 @@ struct xfs_scrub_metadata { #define XFS_SCRUB_TYPE_GQUOTA 22 /* group quotas */ #define XFS_SCRUB_TYPE_PQUOTA 23 /* project quotas */ #define XFS_SCRUB_TYPE_FSCOUNTERS 24 /* fs summary counters */ +#define XFS_SCRUB_TYPE_QUOTACHECK 25 /* quota counters */ /* Number of scrub subcommands. */ -#define XFS_SCRUB_TYPE_NR 25 +#define XFS_SCRUB_TYPE_NR 26 /* i: Repair this metadata. */ #define XFS_SCRUB_IFLAG_REPAIR (1u << 0) diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index 47557f2a7995..57c308b76651 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -29,6 +29,7 @@ #include "xfs_attr.h" #include "xfs_reflink.h" #include "xfs_ag.h" +#include "xfs_error.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" @@ -82,6 +83,15 @@ __xchk_process_error( sc->ip ? 
sc->ip : XFS_I(file_inode(sc->file)), sc->sm, *error); break; + case -ECANCELED: + /* + * ECANCELED here means that the caller set one of the scrub + * outcome flags (corrupt, xfail, xcorrupt) and wants to exit + * quickly. Set error to zero and do not continue. + */ + trace_xchk_op_error(sc, agno, bno, *error, ret_ip); + *error = 0; + break; case -EFSBADCRC: case -EFSCORRUPTED: /* Note the badness but don't abort. */ @@ -89,8 +99,7 @@ __xchk_process_error( *error = 0; fallthrough; default: - trace_xchk_op_error(sc, agno, bno, *error, - ret_ip); + trace_xchk_op_error(sc, agno, bno, *error, ret_ip); break; } return false; @@ -136,6 +145,16 @@ __xchk_fblock_process_error( /* Used to restart an op with deadlock avoidance. */ trace_xchk_deadlock_retry(sc->ip, sc->sm, *error); break; + case -ECANCELED: + /* + * ECANCELED here means that the caller set one of the scrub + * outcome flags (corrupt, xfail, xcorrupt) and wants to exit + * quickly. Set error to zero and do not continue. + */ + trace_xchk_file_op_error(sc, whichfork, offset, *error, + ret_ip); + *error = 0; + break; case -EFSBADCRC: case -EFSCORRUPTED: /* Note the badness but don't abort. */ @@ -227,6 +246,19 @@ xchk_block_set_corrupt( trace_xchk_block_error(sc, xfs_buf_daddr(bp), __return_address); } +#ifdef CONFIG_XFS_QUOTA +/* Record a corrupt quota counter. */ +void +xchk_qcheck_set_corrupt( + struct xfs_scrub *sc, + unsigned int dqtype, + xfs_dqid_t id) +{ + sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT; + trace_xchk_qcheck_error(sc, dqtype, id, __return_address); +} +#endif + /* Record a corruption while cross-referencing. */ void xchk_block_xref_set_corrupt( diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h index d2ca423b02c7..eb51037cd0d2 100644 --- a/fs/xfs/scrub/common.h +++ b/fs/xfs/scrub/common.h @@ -55,6 +55,10 @@ void xchk_block_set_corrupt(struct xfs_scrub *sc, void xchk_ino_set_corrupt(struct xfs_scrub *sc, xfs_ino_t ino); void xchk_fblock_set_corrupt(struct xfs_scrub *sc, int whichfork, xfs_fileoff_t offset); +#ifdef CONFIG_XFS_QUOTA +void xchk_qcheck_set_corrupt(struct xfs_scrub *sc, unsigned int dqtype, + xfs_dqid_t id); +#endif void xchk_block_xref_set_corrupt(struct xfs_scrub *sc, struct xfs_buf *bp); @@ -106,6 +110,7 @@ xchk_setup_rtsummary(struct xfs_scrub *sc) #ifdef CONFIG_XFS_QUOTA int xchk_ino_dqattach(struct xfs_scrub *sc); int xchk_setup_quota(struct xfs_scrub *sc); +int xchk_setup_quotacheck(struct xfs_scrub *sc); #else static inline int xchk_ino_dqattach(struct xfs_scrub *sc) @@ -117,6 +122,11 @@ xchk_setup_quota(struct xfs_scrub *sc) { return -ENOENT; } +static inline int +xchk_setup_quotacheck(struct xfs_scrub *sc) +{ + return -ENOENT; +} #endif int xchk_setup_fscounters(struct xfs_scrub *sc); diff --git a/fs/xfs/scrub/health.c b/fs/xfs/scrub/health.c index 531006910ca9..3c9eac070796 100644 --- a/fs/xfs/scrub/health.c +++ b/fs/xfs/scrub/health.c @@ -105,6 +105,7 @@ static const struct xchk_health_map type_to_health_flag[XFS_SCRUB_TYPE_NR] = { [XFS_SCRUB_TYPE_GQUOTA] = { XHG_FS, XFS_SICK_FS_GQUOTA }, [XFS_SCRUB_TYPE_PQUOTA] = { XHG_FS, XFS_SICK_FS_PQUOTA }, [XFS_SCRUB_TYPE_FSCOUNTERS] = { XHG_FS, XFS_SICK_FS_COUNTERS }, + [XFS_SCRUB_TYPE_QUOTACHECK] = { XHG_FS, XFS_SICK_FS_QUOTACHECK }, }; /* Return the health status mask for this scrub type. 
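The ECANCELED convention added above pairs with the new corruption helper as follows; the fragment below is illustrative only and does not appear in the series. A checker records the corrupt quota counter and returns -ECANCELED, which __xchk_process_error() absorbs so that only the CORRUPT flag reaches userspace:

	/*
	 * Hypothetical comparison step: note the corrupt quota counter,
	 * then bail out early.  The -ECANCELED return is absorbed by
	 * __xchk_process_error() and never escapes to userspace.
	 */
	STATIC int
	xchk_example_compare_counter(
		struct xfs_scrub	*sc,
		xfs_dqtype_t		dqtype,
		xfs_dqid_t		id,
		int64_t			shadow_count,
		int64_t			ondisk_count)
	{
		if (shadow_count != ondisk_count) {
			xchk_qcheck_set_corrupt(sc, dqtype, id);
			return -ECANCELED;
		}
		return 0;
	}
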
*/ diff --git a/fs/xfs/scrub/quotacheck.c b/fs/xfs/scrub/quotacheck.c new file mode 100644 index 000000000000..193255c69dd5 --- /dev/null +++ b/fs/xfs/scrub/quotacheck.c @@ -0,0 +1,517 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2020-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_inode.h" +#include "xfs_quota.h" +#include "xfs_qm.h" +#include "xfs_icache.h" +#include "xfs_bmap_util.h" +#include "xfs_ialloc.h" +#include "xfs_ag.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/repair.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" +#include "scrub/iscan.h" +#include "scrub/quota.h" +#include "scrub/quotacheck.h" +#include "scrub/trace.h" + +/* + * Live Quotacheck + * =============== + * + * Quota counters are "summary" metadata, in the sense that they are computed + * as the summation of the block usage counts for every file on the filesystem. + * Therefore, we compute the correct icount, bcount, and rtbcount values by + * creating a shadow quota counter structure and walking every inode. + */ + +/* Set us up to scrub quota counters. */ +int +xchk_setup_quotacheck( + struct xfs_scrub *sc) +{ + /* Not ready for general consumption yet. */ + return -EOPNOTSUPP; + + if (!XFS_IS_QUOTA_ON(sc->mp)) + return -ENOENT; + + sc->buf = kzalloc(sizeof(struct xqcheck), XCHK_GFP_FLAGS); + if (!sc->buf) + return -ENOMEM; + + return xchk_setup_fs(sc); +} + +/* + * Part 1: Collecting dquot resource usage counts. For each xfs_dquot attached + * to each inode, we create a shadow dquot, and compute the inode count and add + * the data/rt block usage from what we see. + * + * To avoid false corruption reports in part 2, any failure in this part must + * set the INCOMPLETE flag even when a negative errno is returned. This care + * must be taken with certain errno values (i.e. EFSBADCRC, EFSCORRUPTED, + * ECANCELED) that are absorbed into a scrub state flag update by + * xchk_*_process_error. + */ + +/* Update incore dquot counter information from a live update. */ +static int +xqcheck_update_incore_counts( + struct xqcheck *xqc, + struct xfarray *counts, + xfs_dqid_t id, + int64_t inodes, + int64_t nblks, + int64_t rtblks) +{ + struct xqcheck_dquot xcdq; + int error; + + error = xfarray_load_sparse(counts, id, &xcdq); + if (error) + return error; + + xcdq.flags |= XQCHECK_DQUOT_WRITTEN; + xcdq.icount += inodes; + xcdq.bcount += nblks; + xcdq.rtbcount += rtblks; + + error = xfarray_store(counts, id, &xcdq); + if (error == -EFBIG) { + /* + * EFBIG means we tried to store data at too high a byte offset + * in the sparse array. IOWs, we cannot complete the check and + * must notify userspace that the check was incomplete. + */ + error = -ECANCELED; + } + return error; +} + +/* Record this inode's quota usage in our shadow quota counter data. */ +STATIC int +xqcheck_collect_inode( + struct xqcheck *xqc, + struct xfs_inode *ip) +{ + struct xfs_trans *tp = xqc->sc->tp; + xfs_filblks_t nblks, rtblks; + uint ilock_flags = 0; + xfs_dqid_t id; + bool isreg = S_ISREG(VFS_I(ip)->i_mode); + int error = 0; + + if (xfs_is_quota_inode(&tp->t_mountp->m_sb, ip->i_ino)) { + /* + * Quota files are never counted towards quota, so we do not + * need to take the lock.
+ */ + xchk_iscan_mark_visited(&xqc->iscan, ip); + return 0; + } + + /* Figure out the data / rt device block counts. */ + xfs_ilock(ip, XFS_IOLOCK_SHARED); + if (isreg) + xfs_ilock(ip, XFS_MMAPLOCK_SHARED); + if (XFS_IS_REALTIME_INODE(ip)) { + /* + * Read in the data fork for rt files so that _count_blocks + * can count the number of blocks allocated from the rt volume. + * Inodes do not track that separately. + */ + ilock_flags = xfs_ilock_data_map_shared(ip); + error = xfs_iread_extents(tp, ip, XFS_DATA_FORK); + if (error) + goto out_incomplete; + } else { + ilock_flags = XFS_ILOCK_SHARED; + xfs_ilock(ip, XFS_ILOCK_SHARED); + } + xfs_inode_count_blocks(tp, ip, &nblks, &rtblks); + + /* Update the shadow dquot counters. */ + mutex_lock(&xqc->lock); + if (xqc->ucounts) { + id = xfs_qm_id_for_quotatype(ip, XFS_DQTYPE_USER); + error = xqcheck_update_incore_counts(xqc, xqc->ucounts, id, 1, + nblks, rtblks); + if (error) + goto out_mutex; + } + + if (xqc->gcounts) { + id = xfs_qm_id_for_quotatype(ip, XFS_DQTYPE_GROUP); + error = xqcheck_update_incore_counts(xqc, xqc->gcounts, id, 1, + nblks, rtblks); + if (error) + goto out_mutex; + } + + if (xqc->pcounts) { + id = xfs_qm_id_for_quotatype(ip, XFS_DQTYPE_PROJ); + error = xqcheck_update_incore_counts(xqc, xqc->pcounts, id, 1, + nblks, rtblks); + if (error) + goto out_mutex; + } + mutex_unlock(&xqc->lock); + + xchk_iscan_mark_visited(&xqc->iscan, ip); + goto out_ilock; + +out_mutex: + mutex_unlock(&xqc->lock); +out_incomplete: + xchk_set_incomplete(xqc->sc); +out_ilock: + xfs_iunlock(ip, ilock_flags); + if (isreg) + xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); + xfs_iunlock(ip, XFS_IOLOCK_SHARED); + return error; +} + +/* Walk all the allocated inodes and run a quota scan on them. */ +STATIC int +xqcheck_collect_counts( + struct xqcheck *xqc) +{ + struct xfs_scrub *sc = xqc->sc; + struct xfs_inode *ip; + int error; + + /* + * Set up for a potentially lengthy filesystem scan by reducing our + * transaction resource usage for the duration. Specifically: + * + * Cancel the transaction to release the log grant space while we scan + * the filesystem. + * + * Create a new empty transaction to eliminate the possibility of the + * inode scan deadlocking on cyclical metadata. + * + * We pass the empty transaction to the file scanning function to avoid + * repeatedly cycling empty transactions. This can be done without + * risk of deadlock between sb_internal and the IOLOCK (we take the + * IOLOCK to quiesce the file before scanning) because empty + * transactions do not take sb_internal. + */ + xchk_trans_cancel(sc); + error = xchk_trans_alloc_empty(sc); + if (error) + return error; + + while ((error = xchk_iscan_iter(&xqc->iscan, &ip)) == 1) { + error = xqcheck_collect_inode(xqc, ip); + xchk_irele(sc, ip); + if (error) + break; + + if (xchk_should_terminate(sc, &error)) + break; + } + xchk_iscan_iter_finish(&xqc->iscan); + if (error) { + xchk_set_incomplete(sc); + /* + * If we couldn't grab an inode that was busy with a state + * change, change the error code so that we exit to userspace + * as quickly as possible. + */ + if (error == -EBUSY) + return -ECANCELED; + return error; + } + + /* + * Switch out for a real transaction in preparation for building a new + * tree. + */ + xchk_trans_cancel(sc); + return xchk_setup_fs(sc); +} + +/* + * Part 2: Comparing dquot resource counters. 
Walk each xfs_dquot, comparing + * the resource usage counters against our shadow dquots; and then walk each + * shadow dquot (that wasn't covered in the first part), comparing it against + * the xfs_dquot. + */ + +/* + * Check the dquot data against what we observed. Caller must hold the dquot + * lock. + */ +STATIC int +xqcheck_compare_dquot( + struct xqcheck *xqc, + xfs_dqtype_t dqtype, + struct xfs_dquot *dq) +{ + struct xqcheck_dquot xcdq; + struct xfarray *counts = xqcheck_counters_for(xqc, dqtype); + int error; + + mutex_lock(&xqc->lock); + error = xfarray_load_sparse(counts, dq->q_id, &xcdq); + if (error) + goto out_unlock; + + if (xcdq.icount != dq->q_ino.count) + xchk_qcheck_set_corrupt(xqc->sc, dqtype, dq->q_id); + + if (xcdq.bcount != dq->q_blk.count) + xchk_qcheck_set_corrupt(xqc->sc, dqtype, dq->q_id); + + if (xcdq.rtbcount != dq->q_rtb.count) + xchk_qcheck_set_corrupt(xqc->sc, dqtype, dq->q_id); + + xcdq.flags |= (XQCHECK_DQUOT_COMPARE_SCANNED | XQCHECK_DQUOT_WRITTEN); + error = xfarray_store(counts, dq->q_id, &xcdq); + if (error == -EFBIG) { + /* + * EFBIG means we tried to store data at too high a byte offset + * in the sparse array. IOWs, we cannot complete the check and + * must notify userspace that the check was incomplete. This + * should never happen, since we just read the record. + */ + xchk_set_incomplete(xqc->sc); + error = -ECANCELED; + } + mutex_unlock(&xqc->lock); + if (error) + return error; + + if (xqc->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return -ECANCELED; + + return 0; + +out_unlock: + mutex_unlock(&xqc->lock); + return error; +} + +/* + * Walk all the observed dquots, and make sure there's a matching incore + * dquot and that its counts match ours. + */ +STATIC int +xqcheck_walk_observations( + struct xqcheck *xqc, + xfs_dqtype_t dqtype) +{ + struct xqcheck_dquot xcdq; + struct xfs_dquot *dq; + struct xfarray *counts = xqcheck_counters_for(xqc, dqtype); + xfarray_idx_t cur = XFARRAY_CURSOR_INIT; + int error; + + mutex_lock(&xqc->lock); + while ((error = xfarray_iter(counts, &cur, &xcdq)) == 1) { + xfs_dqid_t id = cur - 1; + + if (xcdq.flags & XQCHECK_DQUOT_COMPARE_SCANNED) + continue; + + mutex_unlock(&xqc->lock); + + error = xfs_qm_dqget(xqc->sc->mp, id, dqtype, false, &dq); + if (error == -ENOENT) { + xchk_qcheck_set_corrupt(xqc->sc, dqtype, id); + return 0; + } + if (error) + return error; + + error = xqcheck_compare_dquot(xqc, dqtype, dq); + xfs_qm_dqput(dq); + if (error) + return error; + + if (xchk_should_terminate(xqc->sc, &error)) + return error; + + mutex_lock(&xqc->lock); + } + mutex_unlock(&xqc->lock); + + return error; +} + +/* Compare the quota counters we observed against the live dquots. */ +STATIC int +xqcheck_compare_dqtype( + struct xqcheck *xqc, + xfs_dqtype_t dqtype) +{ + struct xchk_dqiter cursor = { }; + struct xfs_scrub *sc = xqc->sc; + struct xfs_dquot *dq; + int error; + + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return 0; + + /* If the quota CHKD flag is cleared, we need to repair this quota. */ + if (!(xfs_quota_chkd_flag(dqtype) & sc->mp->m_qflags)) { + xchk_qcheck_set_corrupt(xqc->sc, dqtype, 0); + return 0; + } + + /* Compare what we observed against the actual dquots. */ + xchk_dqiter_init(&cursor, sc, dqtype); + while ((error = xchk_dquot_iter(&cursor, &dq)) == 1) { + error = xqcheck_compare_dquot(xqc, dqtype, dq); + xfs_qm_dqput(dq); + if (error) + break; + } + if (error) + return error; + + /* Walk all the observed dquots and compare to the incore ones. 
*/ + return xqcheck_walk_observations(xqc, dqtype); +} + +/* Tear down everything associated with a quotacheck. */ +static void +xqcheck_teardown_scan( + void *priv) +{ + struct xqcheck *xqc = priv; + + if (xqc->pcounts) { + xfarray_destroy(xqc->pcounts); + xqc->pcounts = NULL; + } + + if (xqc->gcounts) { + xfarray_destroy(xqc->gcounts); + xqc->gcounts = NULL; + } + + if (xqc->ucounts) { + xfarray_destroy(xqc->ucounts); + xqc->ucounts = NULL; + } + + xchk_iscan_teardown(&xqc->iscan); + mutex_destroy(&xqc->lock); + xqc->sc = NULL; +} + +/* + * Scan all inodes in the entire filesystem to generate quota counter data. + * If the scan is successful, the quota data will be left alive for a repair. + * If any error occurs, we'll tear everything down. + */ +STATIC int +xqcheck_setup_scan( + struct xfs_scrub *sc, + struct xqcheck *xqc) +{ + char *descr; + unsigned long long max_dquots = XFS_DQ_ID_MAX + 1ULL; + int error; + + ASSERT(xqc->sc == NULL); + xqc->sc = sc; + + mutex_init(&xqc->lock); + + /* Retry iget every tenth of a second for up to 30 seconds. */ + xchk_iscan_start(sc, 30000, 100, &xqc->iscan); + + error = -ENOMEM; + if (xfs_this_quota_on(sc->mp, XFS_DQTYPE_USER)) { + descr = xchk_xfile_descr(sc, "user dquot records"); + error = xfarray_create(descr, max_dquots, + sizeof(struct xqcheck_dquot), &xqc->ucounts); + kfree(descr); + if (error) + goto out_teardown; + } + + if (xfs_this_quota_on(sc->mp, XFS_DQTYPE_GROUP)) { + descr = xchk_xfile_descr(sc, "group dquot records"); + error = xfarray_create(descr, max_dquots, + sizeof(struct xqcheck_dquot), &xqc->gcounts); + kfree(descr); + if (error) + goto out_teardown; + } + + if (xfs_this_quota_on(sc->mp, XFS_DQTYPE_PROJ)) { + descr = xchk_xfile_descr(sc, "project dquot records"); + error = xfarray_create(descr, max_dquots, + sizeof(struct xqcheck_dquot), &xqc->pcounts); + kfree(descr); + if (error) + goto out_teardown; + } + + /* Use deferred cleanup to pass the quota count data to repair. */ + sc->buf_cleanup = xqcheck_teardown_scan; + return 0; + +out_teardown: + xqcheck_teardown_scan(xqc); + return error; +} + +/* Scrub all counters for a given quota type. */ +int +xchk_quotacheck( + struct xfs_scrub *sc) +{ + struct xqcheck *xqc = sc->buf; + int error = 0; + + /* Check quota counters on the live filesystem. */ + error = xqcheck_setup_scan(sc, xqc); + if (error) + return error; + + /* Walk all inodes, picking up quota information. */ + error = xqcheck_collect_counts(xqc); + if (!xchk_xref_process_error(sc, 0, 0, &error)) + return error; + + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE) + return 0; + + /* Compare quota counters. */ + if (xqc->ucounts) { + error = xqcheck_compare_dqtype(xqc, XFS_DQTYPE_USER); + if (!xchk_xref_process_error(sc, 0, 0, &error)) + return error; + } + if (xqc->gcounts) { + error = xqcheck_compare_dqtype(xqc, XFS_DQTYPE_GROUP); + if (!xchk_xref_process_error(sc, 0, 0, &error)) + return error; + } + if (xqc->pcounts) { + error = xqcheck_compare_dqtype(xqc, XFS_DQTYPE_PROJ); + if (!xchk_xref_process_error(sc, 0, 0, &error)) + return error; + } + + return 0; +} diff --git a/fs/xfs/scrub/quotacheck.h b/fs/xfs/scrub/quotacheck.h new file mode 100644 index 000000000000..99eae596dd41 --- /dev/null +++ b/fs/xfs/scrub/quotacheck.h @@ -0,0 +1,67 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (c) 2020-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong + */ +#ifndef __XFS_SCRUB_QUOTACHECK_H__ +#define __XFS_SCRUB_QUOTACHECK_H__ + +/* Quota counters for live quotacheck. 
*/ +struct xqcheck_dquot { + /* block usage count */ + int64_t bcount; + + /* inode usage count */ + int64_t icount; + + /* realtime block usage count */ + int64_t rtbcount; + + /* Record state */ + unsigned int flags; +}; + +/* + * This incore dquot record has been written at least once. We never want to + * store an xqcheck_dquot that looks uninitialized. + */ +#define XQCHECK_DQUOT_WRITTEN (1U << 0) + +/* Already checked this dquot. */ +#define XQCHECK_DQUOT_COMPARE_SCANNED (1U << 1) + +/* Live quotacheck control structure. */ +struct xqcheck { + struct xfs_scrub *sc; + + /* Shadow dquot counter data. */ + struct xfarray *ucounts; + struct xfarray *gcounts; + struct xfarray *pcounts; + + /* Lock protecting quotacheck count observations */ + struct mutex lock; + + struct xchk_iscan iscan; +}; + +/* Return the incore counter array for a given quota type. */ +static inline struct xfarray * +xqcheck_counters_for( + struct xqcheck *xqc, + xfs_dqtype_t dqtype) +{ + switch (dqtype) { + case XFS_DQTYPE_USER: + return xqc->ucounts; + case XFS_DQTYPE_GROUP: + return xqc->gcounts; + case XFS_DQTYPE_PROJ: + return xqc->pcounts; + } + + ASSERT(0); + return NULL; +} + +#endif /* __XFS_SCRUB_QUOTACHECK_H__ */ diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index caf324c2b991..d9fcf992d589 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -360,6 +360,12 @@ static const struct xchk_meta_ops meta_scrub_ops[] = { .scrub = xchk_fscounters, .repair = xrep_notsupported, }, + [XFS_SCRUB_TYPE_QUOTACHECK] = { /* quota counters */ + .type = ST_FS, + .setup = xchk_setup_quotacheck, + .scrub = xchk_quotacheck, + .repair = xrep_notsupported, + }, }; static int diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h index 7fc50654c4fe..779f37b1cb1a 100644 --- a/fs/xfs/scrub/scrub.h +++ b/fs/xfs/scrub/scrub.h @@ -167,12 +167,18 @@ xchk_rtsummary(struct xfs_scrub *sc) #endif #ifdef CONFIG_XFS_QUOTA int xchk_quota(struct xfs_scrub *sc); +int xchk_quotacheck(struct xfs_scrub *sc); #else static inline int xchk_quota(struct xfs_scrub *sc) { return -ENOENT; } +static inline int +xchk_quotacheck(struct xfs_scrub *sc) +{ + return -ENOENT; +} #endif int xchk_fscounters(struct xfs_scrub *sc); diff --git a/fs/xfs/scrub/stats.c b/fs/xfs/scrub/stats.c index cd91db4a5548..d716a432227b 100644 --- a/fs/xfs/scrub/stats.c +++ b/fs/xfs/scrub/stats.c @@ -77,6 +77,7 @@ static const char *name_map[XFS_SCRUB_TYPE_NR] = { [XFS_SCRUB_TYPE_GQUOTA] = "grpquota", [XFS_SCRUB_TYPE_PQUOTA] = "prjquota", [XFS_SCRUB_TYPE_FSCOUNTERS] = "fscounters", + [XFS_SCRUB_TYPE_QUOTACHECK] = "quotacheck", }; /* Format the scrub stats into a text buffer, similar to pcp style. 
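For context, once the scrub type is wired up, userspace can request this check through the existing XFS_IOC_SCRUB_METADATA ioctl; the snippet below is a rough sketch only (headers and error handling elided, not part of this series), and note that xchk_setup_quotacheck above still fences the type off with -EOPNOTSUPP:

	/*
	 * Hypothetical userspace caller; fd is any open fd on the
	 * filesystem.  Assumes the xfs_fs.h definitions shown above.
	 */
	struct xfs_scrub_metadata	sm = {
		.sm_type = XFS_SCRUB_TYPE_QUOTACHECK,
	};

	if (ioctl(fd, XFS_IOC_SCRUB_METADATA, &sm) == 0 &&
	    (sm.sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
		fprintf(stderr, "quota counters need repair\n");
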
*/ diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index 9aba60c61880..95e399bc46be 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -15,6 +15,7 @@ #include #include "xfs_bit.h" +#include "xfs_quota_defs.h" struct xfs_scrub; struct xfile; @@ -65,6 +66,7 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_UQUOTA); TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_GQUOTA); TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_PQUOTA); TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_FSCOUNTERS); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_QUOTACHECK); #define XFS_SCRUB_TYPE_STRINGS \ { XFS_SCRUB_TYPE_PROBE, "probe" }, \ @@ -91,7 +93,8 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_FSCOUNTERS); { XFS_SCRUB_TYPE_UQUOTA, "usrquota" }, \ { XFS_SCRUB_TYPE_GQUOTA, "grpquota" }, \ { XFS_SCRUB_TYPE_PQUOTA, "prjquota" }, \ - { XFS_SCRUB_TYPE_FSCOUNTERS, "fscounters" } + { XFS_SCRUB_TYPE_FSCOUNTERS, "fscounters" }, \ + { XFS_SCRUB_TYPE_QUOTACHECK, "quotacheck" } #define XFS_SCRUB_FLAG_STRINGS \ { XFS_SCRUB_IFLAG_REPAIR, "repair" }, \ @@ -397,6 +400,29 @@ DEFINE_SCRUB_DQITER_EVENT(xchk_dquot_iter_revalidate_bmap); DEFINE_SCRUB_DQITER_EVENT(xchk_dquot_iter_advance_bmap); DEFINE_SCRUB_DQITER_EVENT(xchk_dquot_iter_advance_incore); DEFINE_SCRUB_DQITER_EVENT(xchk_dquot_iter); + +TRACE_EVENT(xchk_qcheck_error, + TP_PROTO(struct xfs_scrub *sc, xfs_dqtype_t dqtype, xfs_dqid_t id, + void *ret_ip), + TP_ARGS(sc, dqtype, id, ret_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_dqtype_t, dqtype) + __field(xfs_dqid_t, id) + __field(void *, ret_ip) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->dqtype = dqtype; + __entry->id = id; + __entry->ret_ip = ret_ip; + ), + TP_printk("dev %d:%d dquot type %s id 0x%x ret_ip %pS", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->dqtype, XFS_DQTYPE_STRINGS), + __entry->id, + __entry->ret_ip) +); #endif /* CONFIG_XFS_QUOTA */ TRACE_EVENT(xchk_incomplete, -- cgit v1.2.3 From 200491875ce144fdc49827387fff2f604b73c0a9 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 22 Feb 2024 12:30:55 -0800 Subject: xfs: track quota updates during live quotacheck Create a shadow dqtrx system in the quotacheck code that hooks the regular dquot counter update code. This will be the means to keep our copy of the dquot counters up to date while the scan runs in real time. Signed-off-by: Darrick J. 
Wong Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/common.c | 4 + fs/xfs/scrub/quotacheck.c | 358 +++++++++++++++++++++++++++++++++++++++++++++- fs/xfs/scrub/quotacheck.h | 6 + fs/xfs/scrub/scrub.c | 3 + fs/xfs/scrub/scrub.h | 4 +- fs/xfs/scrub/trace.h | 1 + fs/xfs/xfs_qm.c | 16 ++- fs/xfs/xfs_qm.h | 16 +++ fs/xfs/xfs_qm_bhv.c | 1 + fs/xfs/xfs_quota.h | 46 ++++++ fs/xfs/xfs_trans_dquot.c | 167 ++++++++++++++++++++- 11 files changed, 606 insertions(+), 16 deletions(-) diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index 57c308b76651..c5a6c47d3df2 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -30,6 +30,7 @@ #include "xfs_reflink.h" #include "xfs_ag.h" #include "xfs_error.h" +#include "xfs_quota.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" @@ -1298,6 +1299,9 @@ xchk_fsgates_enable( if (scrub_fsgates & XCHK_FSGATES_DRAIN) xfs_drain_wait_enable(); + if (scrub_fsgates & XCHK_FSGATES_QUOTA) + xfs_dqtrx_hook_enable(); + sc->flags |= scrub_fsgates; } diff --git a/fs/xfs/scrub/quotacheck.c b/fs/xfs/scrub/quotacheck.c index 193255c69dd5..2cd55f2a0c05 100644 --- a/fs/xfs/scrub/quotacheck.c +++ b/fs/xfs/scrub/quotacheck.c @@ -38,17 +38,54 @@ * creating a shadow quota counter structure and walking every inode. */ +/* Track the quota deltas for a dquot in a transaction. */ +struct xqcheck_dqtrx { + xfs_dqtype_t q_type; + xfs_dqid_t q_id; + + int64_t icount_delta; + + int64_t bcount_delta; + int64_t delbcnt_delta; + + int64_t rtbcount_delta; + int64_t delrtb_delta; +}; + +#define XQCHECK_MAX_NR_DQTRXS (XFS_QM_TRANS_DQTYPES * XFS_QM_TRANS_MAXDQS) + +/* + * Track the quota deltas for all dquots attached to a transaction if the + * quota deltas are being applied to an inode that we already scanned. + */ +struct xqcheck_dqacct { + struct rhash_head hash; + uintptr_t tx_id; + struct xqcheck_dqtrx dqtrx[XQCHECK_MAX_NR_DQTRXS]; + unsigned int refcount; +}; + +/* Free a shadow dquot accounting structure. */ +static void +xqcheck_dqacct_free( + void *ptr, + void *arg) +{ + struct xqcheck_dqacct *dqa = ptr; + + kfree(dqa); +} + /* Set us up to scrub quota counters. */ int xchk_setup_quotacheck( struct xfs_scrub *sc) { - /* Not ready for general consumption yet. */ - return -EOPNOTSUPP; - if (!XFS_IS_QUOTA_ON(sc->mp)) return -ENOENT; + xchk_fsgates_enable(sc, XCHK_FSGATES_QUOTA); + sc->buf = kzalloc(sizeof(struct xqcheck), XCHK_GFP_FLAGS); if (!sc->buf) return -ENOMEM; @@ -66,6 +103,22 @@ xchk_setup_quotacheck( * must be taken with certain errno values (i.e. EFSBADCRC, EFSCORRUPTED, * ECANCELED) that are absorbed into a scrub state flag update by * xchk_*_process_error. + * + * Because we are scanning a live filesystem, it's possible that another thread + * will try to update the quota counters for an inode that we've already + * scanned. This will cause our counts to be incorrect. Therefore, we hook + * the live transaction code in two places: (1) when the callers update the + * per-transaction dqtrx structure to log quota counter updates; and (2) when + * transaction commit actually logs those updates to the incore dquot. By + * shadowing transaction updates in this manner, live quotacheck can ensure + * by locking the dquot and the shadow structure that its own copies are not + * out of date. 
Because the hook code runs in a different process context from + * the scrub code and the scrub state flags are not accessed atomically, + * failures in the hook code must abort the iscan and the scrubber must notice + * the aborted scan and set the incomplete flag. + * + * Note that we use srcu notifier hooks to minimize the overhead when live + * quotacheck is /not/ running. */ /* Update an incore dquot counter information from a live update. */ @@ -102,6 +155,234 @@ xqcheck_update_incore_counts( return error; } +/* Decide if this is the shadow dquot accounting structure for a transaction. */ +static int +xqcheck_dqacct_obj_cmpfn( + struct rhashtable_compare_arg *arg, + const void *obj) +{ + const uintptr_t *tx_idp = arg->key; + const struct xqcheck_dqacct *dqa = obj; + + if (dqa->tx_id != *tx_idp) + return 1; + return 0; +} + +static const struct rhashtable_params xqcheck_dqacct_hash_params = { + .min_size = 32, + .key_len = sizeof(uintptr_t), + .key_offset = offsetof(struct xqcheck_dqacct, tx_id), + .head_offset = offsetof(struct xqcheck_dqacct, hash), + .automatic_shrinking = true, + .obj_cmpfn = xqcheck_dqacct_obj_cmpfn, +}; + +/* Find a shadow dqtrx slot for the given dquot. */ +STATIC struct xqcheck_dqtrx * +xqcheck_get_dqtrx( + struct xqcheck_dqacct *dqa, + xfs_dqtype_t q_type, + xfs_dqid_t q_id) +{ + int i; + + for (i = 0; i < XQCHECK_MAX_NR_DQTRXS; i++) { + if (dqa->dqtrx[i].q_type == 0 || + (dqa->dqtrx[i].q_type == q_type && + dqa->dqtrx[i].q_id == q_id)) + return &dqa->dqtrx[i]; + } + + return NULL; +} + +/* + * Create and fill out a quota delta tracking structure to shadow the updates + * going on in the regular quota code. + */ +static int +xqcheck_mod_live_ino_dqtrx( + struct notifier_block *nb, + unsigned long action, + void *data) +{ + struct xfs_mod_ino_dqtrx_params *p = data; + struct xqcheck *xqc; + struct xqcheck_dqacct *dqa; + struct xqcheck_dqtrx *dqtrx; + int error; + + xqc = container_of(nb, struct xqcheck, qhook.mod_hook.nb); + + /* Skip quota reservation fields. */ + switch (action) { + case XFS_TRANS_DQ_BCOUNT: + case XFS_TRANS_DQ_DELBCOUNT: + case XFS_TRANS_DQ_ICOUNT: + case XFS_TRANS_DQ_RTBCOUNT: + case XFS_TRANS_DQ_DELRTBCOUNT: + break; + default: + return NOTIFY_DONE; + } + + /* Ignore dqtrx updates for quota types we don't care about. */ + switch (p->q_type) { + case XFS_DQTYPE_USER: + if (!xqc->ucounts) + return NOTIFY_DONE; + break; + case XFS_DQTYPE_GROUP: + if (!xqc->gcounts) + return NOTIFY_DONE; + break; + case XFS_DQTYPE_PROJ: + if (!xqc->pcounts) + return NOTIFY_DONE; + break; + default: + return NOTIFY_DONE; + } + + /* Skip inodes that haven't been scanned yet. */ + if (!xchk_iscan_want_live_update(&xqc->iscan, p->ino)) + return NOTIFY_DONE; + + /* Make a shadow quota accounting tracker for this transaction. */ + mutex_lock(&xqc->lock); + dqa = rhashtable_lookup_fast(&xqc->shadow_dquot_acct, &p->tx_id, + xqcheck_dqacct_hash_params); + if (!dqa) { + dqa = kzalloc(sizeof(struct xqcheck_dqacct), XCHK_GFP_FLAGS); + if (!dqa) + goto out_abort; + + dqa->tx_id = p->tx_id; + error = rhashtable_insert_fast(&xqc->shadow_dquot_acct, + &dqa->hash, xqcheck_dqacct_hash_params); + if (error) + goto out_abort; + } + + /* Find the shadow dqtrx (or an empty slot) here. 
*/ + dqtrx = xqcheck_get_dqtrx(dqa, p->q_type, p->q_id); + if (!dqtrx) + goto out_abort; + if (dqtrx->q_type == 0) { + dqtrx->q_type = p->q_type; + dqtrx->q_id = p->q_id; + dqa->refcount++; + } + + /* Update counter */ + switch (action) { + case XFS_TRANS_DQ_BCOUNT: + dqtrx->bcount_delta += p->delta; + break; + case XFS_TRANS_DQ_DELBCOUNT: + dqtrx->delbcnt_delta += p->delta; + break; + case XFS_TRANS_DQ_ICOUNT: + dqtrx->icount_delta += p->delta; + break; + case XFS_TRANS_DQ_RTBCOUNT: + dqtrx->rtbcount_delta += p->delta; + break; + case XFS_TRANS_DQ_DELRTBCOUNT: + dqtrx->delrtb_delta += p->delta; + break; + } + + mutex_unlock(&xqc->lock); + return NOTIFY_DONE; + +out_abort: + xchk_iscan_abort(&xqc->iscan); + mutex_unlock(&xqc->lock); + return NOTIFY_DONE; +} + +/* + * Apply the transaction quota deltas to our shadow quota accounting info when + * the regular quota code is doing the same. + */ +static int +xqcheck_apply_live_dqtrx( + struct notifier_block *nb, + unsigned long action, + void *data) +{ + struct xfs_apply_dqtrx_params *p = data; + struct xqcheck *xqc; + struct xqcheck_dqacct *dqa; + struct xqcheck_dqtrx *dqtrx; + struct xfarray *counts; + int error; + + xqc = container_of(nb, struct xqcheck, qhook.apply_hook.nb); + + /* Map the dquot type to an incore counter object. */ + switch (p->q_type) { + case XFS_DQTYPE_USER: + counts = xqc->ucounts; + break; + case XFS_DQTYPE_GROUP: + counts = xqc->gcounts; + break; + case XFS_DQTYPE_PROJ: + counts = xqc->pcounts; + break; + default: + return NOTIFY_DONE; + } + + if (xchk_iscan_aborted(&xqc->iscan) || counts == NULL) + return NOTIFY_DONE; + + /* + * Find the shadow dqtrx for this transaction and dquot, if any deltas + * need to be applied here. If not, we're finished early. + */ + mutex_lock(&xqc->lock); + dqa = rhashtable_lookup_fast(&xqc->shadow_dquot_acct, &p->tx_id, + xqcheck_dqacct_hash_params); + if (!dqa) + goto out_unlock; + dqtrx = xqcheck_get_dqtrx(dqa, p->q_type, p->q_id); + if (!dqtrx || dqtrx->q_type == 0) + goto out_unlock; + + /* Update our shadow dquot if we're committing. */ + if (action == XFS_APPLY_DQTRX_COMMIT) { + error = xqcheck_update_incore_counts(xqc, counts, p->q_id, + dqtrx->icount_delta, + dqtrx->bcount_delta + dqtrx->delbcnt_delta, + dqtrx->rtbcount_delta + dqtrx->delrtb_delta); + if (error) + goto out_abort; + } + + /* Free the shadow accounting structure if that was the last user. */ + dqa->refcount--; + if (dqa->refcount == 0) { + error = rhashtable_remove_fast(&xqc->shadow_dquot_acct, + &dqa->hash, xqcheck_dqacct_hash_params); + if (error) + goto out_abort; + xqcheck_dqacct_free(dqa, NULL); + } + + mutex_unlock(&xqc->lock); + return NOTIFY_DONE; + +out_abort: + xchk_iscan_abort(&xqc->iscan); +out_unlock: + mutex_unlock(&xqc->lock); + return NOTIFY_DONE; +} + /* Record this inode's quota usage in our shadow quota counter data. */ STATIC int xqcheck_collect_inode( @@ -137,13 +418,18 @@ xqcheck_collect_inode( ilock_flags = xfs_ilock_data_map_shared(ip); error = xfs_iread_extents(tp, ip, XFS_DATA_FORK); if (error) - goto out_incomplete; + goto out_abort; } else { ilock_flags = XFS_ILOCK_SHARED; xfs_ilock(ip, XFS_ILOCK_SHARED); } xfs_inode_count_blocks(tp, ip, &nblks, &rtblks); + if (xchk_iscan_aborted(&xqc->iscan)) { + error = -ECANCELED; + goto out_incomplete; + } + /* Update the shadow dquot counters.
*/ mutex_lock(&xqc->lock); if (xqc->ucounts) { @@ -176,6 +462,8 @@ xqcheck_collect_inode( out_mutex: mutex_unlock(&xqc->lock); +out_abort: + xchk_iscan_abort(&xqc->iscan); out_incomplete: xchk_set_incomplete(xqc->sc); out_ilock: @@ -267,6 +555,11 @@ xqcheck_compare_dquot( struct xfarray *counts = xqcheck_counters_for(xqc, dqtype); int error; + if (xchk_iscan_aborted(&xqc->iscan)) { + xchk_set_incomplete(xqc->sc); + return -ECANCELED; + } + mutex_lock(&xqc->lock); error = xfarray_load_sparse(counts, dq->q_id, &xcdq); if (error) @@ -288,7 +581,7 @@ xqcheck_compare_dquot( * EFBIG means we tried to store data at too high a byte offset * in the sparse array. IOWs, we cannot complete the check and * must notify userspace that the check was incomplete. This - * should never happen, since we just read the record. + * should never happen outside of the collection phase. */ xchk_set_incomplete(xqc->sc); error = -ECANCELED; @@ -395,6 +688,26 @@ xqcheck_teardown_scan( void *priv) { struct xqcheck *xqc = priv; + struct xfs_quotainfo *qi = xqc->sc->mp->m_quotainfo; + + /* Discourage any hook functions that might be running. */ + xchk_iscan_abort(&xqc->iscan); + + /* + * As noted above, the apply hook is responsible for cleaning up the + * shadow dquot accounting data when a transaction completes. The mod + * hook must be removed before the apply hook so that we don't + * mistakenly leave an active shadow account for the mod hook to get + * its hands on. No hooks should be running after these functions + * return. + */ + xfs_dqtrx_hook_del(qi, &xqc->qhook); + + if (xqc->shadow_dquot_acct.key_len) { + rhashtable_free_and_destroy(&xqc->shadow_dquot_acct, + xqcheck_dqacct_free, NULL); + xqc->shadow_dquot_acct.key_len = 0; + } if (xqc->pcounts) { xfarray_destroy(xqc->pcounts); @@ -427,6 +740,7 @@ xqcheck_setup_scan( struct xqcheck *xqc) { char *descr; + struct xfs_quotainfo *qi = sc->mp->m_quotainfo; unsigned long long max_dquots = XFS_DQ_ID_MAX + 1ULL; int error; @@ -466,6 +780,33 @@ xqcheck_setup_scan( goto out_teardown; } + /* + * Set up hash table to map transactions to our internal shadow dqtrx + * structures. + */ + error = rhashtable_init(&xqc->shadow_dquot_acct, + &xqcheck_dqacct_hash_params); + if (error) + goto out_teardown; + + /* + * Hook into the quota code. The hook only triggers for inodes that + * were already scanned, and the scanner thread takes each inode's + * ILOCK, which means that any in-progress inode updates will finish + * before we can scan the inode. + * + * The apply hook (which removes the shadow dquot accounting struct) + * must be installed before the mod hook so that we never fail to catch + * the end of a quota update sequence and leave stale shadow data. + */ + ASSERT(sc->flags & XCHK_FSGATES_QUOTA); + xfs_dqtrx_hook_setup(&xqc->qhook, xqcheck_mod_live_ino_dqtrx, + xqcheck_apply_live_dqtrx); + + error = xfs_dqtrx_hook_add(qi, &xqc->qhook); + if (error) + goto out_teardown; + /* Use deferred cleanup to pass the quota count data to repair. */ sc->buf_cleanup = xqcheck_teardown_scan; return 0; @@ -493,6 +834,9 @@ xchk_quotacheck( if (!xchk_xref_process_error(sc, 0, 0, &error)) return error; + /* Fail fast if we're not playing with a full dataset. */ + if (xchk_iscan_aborted(&xqc->iscan)) + xchk_set_incomplete(sc); if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE) return 0; @@ -513,5 +857,9 @@ xchk_quotacheck( return error; } + /* Check one last time for an incomplete dataset. 
*/ + if (xchk_iscan_aborted(&xqc->iscan)) + xchk_set_incomplete(sc); + return 0; } diff --git a/fs/xfs/scrub/quotacheck.h b/fs/xfs/scrub/quotacheck.h index 99eae596dd41..244e8cc0f8e9 100644 --- a/fs/xfs/scrub/quotacheck.h +++ b/fs/xfs/scrub/quotacheck.h @@ -43,6 +43,12 @@ struct xqcheck { struct mutex lock; struct xchk_iscan iscan; + + /* Hooks into the quota code. */ + struct xfs_dqtrx_hook qhook; + + /* Shadow quota delta tracking structure. */ + struct rhashtable shadow_dquot_acct; }; /* Return the incore counter array for a given quota type. */ diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index d9fcf992d589..71a9eb48e1de 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -157,6 +157,9 @@ xchk_fsgates_disable( if (sc->flags & XCHK_FSGATES_DRAIN) xfs_drain_wait_disable(); + if (sc->flags & XCHK_FSGATES_QUOTA) + xfs_dqtrx_hook_disable(); + sc->flags &= ~XCHK_FSGATES_ALL; } diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h index 779f37b1cb1a..5cd4550155f2 100644 --- a/fs/xfs/scrub/scrub.h +++ b/fs/xfs/scrub/scrub.h @@ -121,6 +121,7 @@ struct xfs_scrub { #define XCHK_HAVE_FREEZE_PROT (1U << 1) /* do we have freeze protection? */ #define XCHK_FSGATES_DRAIN (1U << 2) /* defer ops draining enabled */ #define XCHK_NEED_DRAIN (1U << 3) /* scrub needs to drain defer ops */ +#define XCHK_FSGATES_QUOTA (1U << 4) /* quota live update enabled */ #define XREP_RESET_PERAG_RESV (1U << 30) /* must reset AG space reservation */ #define XREP_ALREADY_FIXED (1U << 31) /* checking our repair work */ @@ -130,7 +131,8 @@ struct xfs_scrub { * features are gated off via dynamic code patching, which is why the state * must be enabled during scrub setup and can only be torn down afterwards. */ -#define XCHK_FSGATES_ALL (XCHK_FSGATES_DRAIN) +#define XCHK_FSGATES_ALL (XCHK_FSGATES_DRAIN | \ + XCHK_FSGATES_QUOTA) /* Metadata scrubbers */ int xchk_tester(struct xfs_scrub *sc); diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index 95e399bc46be..6c90bc7a316b 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -112,6 +112,7 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_QUOTACHECK); { XCHK_HAVE_FREEZE_PROT, "nofreeze" }, \ { XCHK_FSGATES_DRAIN, "fsgates_drain" }, \ { XCHK_NEED_DRAIN, "need_drain" }, \ + { XCHK_FSGATES_QUOTA, "fsgates_quota" }, \ { XREP_RESET_PERAG_RESV, "reset_perag_resv" }, \ { XREP_ALREADY_FIXED, "already_fixed" } diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index dca2f60b2e30..991be1eacb6c 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -693,6 +693,9 @@ xfs_qm_init_quotainfo( shrinker_register(qinf->qi_shrinker); + xfs_hooks_init(&qinf->qi_mod_ino_dqtrx_hooks); + xfs_hooks_init(&qinf->qi_apply_dqtrx_hooks); + return 0; out_free_inos: @@ -1824,12 +1827,12 @@ xfs_qm_vop_chown( ASSERT(prevdq); ASSERT(prevdq != newdq); - xfs_trans_mod_dquot(tp, prevdq, bfield, -(ip->i_nblocks)); - xfs_trans_mod_dquot(tp, prevdq, XFS_TRANS_DQ_ICOUNT, -1); + xfs_trans_mod_ino_dquot(tp, ip, prevdq, bfield, -(ip->i_nblocks)); + xfs_trans_mod_ino_dquot(tp, ip, prevdq, XFS_TRANS_DQ_ICOUNT, -1); /* the sparkling new dquot */ - xfs_trans_mod_dquot(tp, newdq, bfield, ip->i_nblocks); - xfs_trans_mod_dquot(tp, newdq, XFS_TRANS_DQ_ICOUNT, 1); + xfs_trans_mod_ino_dquot(tp, ip, newdq, bfield, ip->i_nblocks); + xfs_trans_mod_ino_dquot(tp, ip, newdq, XFS_TRANS_DQ_ICOUNT, 1); /* * Back when we made quota reservations for the chown, we reserved the @@ -1911,22 +1914,21 @@ xfs_qm_vop_create_dqattach( ASSERT(i_uid_read(VFS_I(ip)) == udqp->q_id); ip->i_udquot = xfs_qm_dqhold(udqp); - 
xfs_trans_mod_dquot(tp, udqp, XFS_TRANS_DQ_ICOUNT, 1); } if (gdqp && XFS_IS_GQUOTA_ON(mp)) { ASSERT(ip->i_gdquot == NULL); ASSERT(i_gid_read(VFS_I(ip)) == gdqp->q_id); ip->i_gdquot = xfs_qm_dqhold(gdqp); - xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1); } if (pdqp && XFS_IS_PQUOTA_ON(mp)) { ASSERT(ip->i_pdquot == NULL); ASSERT(ip->i_projid == pdqp->q_id); ip->i_pdquot = xfs_qm_dqhold(pdqp); - xfs_trans_mod_dquot(tp, pdqp, XFS_TRANS_DQ_ICOUNT, 1); } + + xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_ICOUNT, 1); } /* Decide if this inode's dquot is near an enforcement boundary. */ diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h index d5c9fc4ba591..f5993012bf98 100644 --- a/fs/xfs/xfs_qm.h +++ b/fs/xfs/xfs_qm.h @@ -68,6 +68,10 @@ struct xfs_quotainfo { /* Minimum and maximum quota expiration timestamp values. */ time64_t qi_expiry_min; time64_t qi_expiry_max; + + /* Hook to feed quota counter updates to an active online repair. */ + struct xfs_hooks qi_mod_ino_dqtrx_hooks; + struct xfs_hooks qi_apply_dqtrx_hooks; }; static inline struct radix_tree_root * @@ -104,6 +108,18 @@ xfs_quota_inode(struct xfs_mount *mp, xfs_dqtype_t type) return NULL; } +/* + * Parameters for tracking dqtrx changes on behalf of an inode. The hook + * function arg parameter is the field being updated. + */ +struct xfs_mod_ino_dqtrx_params { + uintptr_t tx_id; + xfs_ino_t ino; + xfs_dqtype_t q_type; + xfs_dqid_t q_id; + int64_t delta; +}; + extern void xfs_trans_mod_dquot(struct xfs_trans *tp, struct xfs_dquot *dqp, uint field, int64_t delta); extern void xfs_trans_dqjoin(struct xfs_trans *, struct xfs_dquot *); diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c index b77673dd0558..271c1021c733 100644 --- a/fs/xfs/xfs_qm_bhv.c +++ b/fs/xfs/xfs_qm_bhv.c @@ -9,6 +9,7 @@ #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" +#include "xfs_mount.h" #include "xfs_quota.h" #include "xfs_mount.h" #include "xfs_inode.h" diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index e0d56489f3b2..85a4ae1a17f6 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h @@ -74,6 +74,22 @@ struct xfs_dqtrx { int64_t qt_icount_delta; /* dquot inode count changes */ }; +enum xfs_apply_dqtrx_type { + XFS_APPLY_DQTRX_COMMIT = 0, + XFS_APPLY_DQTRX_UNRESERVE, +}; + +/* + * Parameters for applying dqtrx changes to a dquot. The hook function arg + * parameter is enum xfs_apply_dqtrx_type. 
+ */ +struct xfs_apply_dqtrx_params { + uintptr_t tx_id; + xfs_ino_t ino; + xfs_dqtype_t q_type; + xfs_dqid_t q_id; +}; + #ifdef CONFIG_XFS_QUOTA extern void xfs_trans_dup_dqinfo(struct xfs_trans *, struct xfs_trans *); extern void xfs_trans_free_dqinfo(struct xfs_trans *); @@ -114,6 +130,30 @@ xfs_quota_reserve_blkres(struct xfs_inode *ip, int64_t blocks) return xfs_trans_reserve_quota_nblks(NULL, ip, blocks, 0, false); } bool xfs_inode_near_dquot_enforcement(struct xfs_inode *ip, xfs_dqtype_t type); + +# ifdef CONFIG_XFS_LIVE_HOOKS +void xfs_trans_mod_ino_dquot(struct xfs_trans *tp, struct xfs_inode *ip, + struct xfs_dquot *dqp, unsigned int field, int64_t delta); + +struct xfs_quotainfo; + +struct xfs_dqtrx_hook { + struct xfs_hook mod_hook; + struct xfs_hook apply_hook; +}; + +void xfs_dqtrx_hook_disable(void); +void xfs_dqtrx_hook_enable(void); + +int xfs_dqtrx_hook_add(struct xfs_quotainfo *qi, struct xfs_dqtrx_hook *hook); +void xfs_dqtrx_hook_del(struct xfs_quotainfo *qi, struct xfs_dqtrx_hook *hook); +void xfs_dqtrx_hook_setup(struct xfs_dqtrx_hook *hook, notifier_fn_t mod_fn, + notifier_fn_t apply_fn); +# else +# define xfs_trans_mod_ino_dquot(tp, ip, dqp, field, delta) \ + xfs_trans_mod_dquot((tp), (dqp), (field), (delta)) +# endif /* CONFIG_XFS_LIVE_HOOKS */ + #else static inline int xfs_qm_vop_dqalloc(struct xfs_inode *ip, kuid_t kuid, kgid_t kgid, @@ -173,6 +213,12 @@ xfs_trans_reserve_quota_icreate(struct xfs_trans *tp, struct xfs_dquot *udqp, #define xfs_qm_unmount(mp) #define xfs_qm_unmount_quotas(mp) #define xfs_inode_near_dquot_enforcement(ip, type) (false) + +# ifdef CONFIG_XFS_LIVE_HOOKS +# define xfs_dqtrx_hook_enable() ((void)0) +# define xfs_dqtrx_hook_disable() ((void)0) +# endif /* CONFIG_XFS_LIVE_HOOKS */ + #endif /* CONFIG_XFS_QUOTA */ static inline int diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index 30bde11aa1c2..577b535a595c 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -121,6 +121,116 @@ xfs_trans_dup_dqinfo( } } +#ifdef CONFIG_XFS_LIVE_HOOKS +/* + * Use a static key here to reduce the overhead of quota live updates. If the + * compiler supports jump labels, the static branch will be replaced by a nop + * sled when there are no hook users. Online fsck is currently the only + * caller, so this is a reasonable tradeoff. + * + * Note: Patching the kernel code requires taking the cpu hotplug lock. Other + * parts of the kernel allocate memory with that lock held, which means that + * XFS callers cannot hold any locks that might be used by memory reclaim or + * writeback when calling the static_branch_{inc,dec} functions. + */ +DEFINE_STATIC_XFS_HOOK_SWITCH(xfs_dqtrx_hooks_switch); + +void +xfs_dqtrx_hook_disable(void) +{ + xfs_hooks_switch_off(&xfs_dqtrx_hooks_switch); +} + +void +xfs_dqtrx_hook_enable(void) +{ + xfs_hooks_switch_on(&xfs_dqtrx_hooks_switch); +} + +/* Schedule a transactional dquot update on behalf of an inode. 
*/
+void
+xfs_trans_mod_ino_dquot(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip,
+	struct xfs_dquot	*dqp,
+	unsigned int		field,
+	int64_t			delta)
+{
+	xfs_trans_mod_dquot(tp, dqp, field, delta);
+
+	if (xfs_hooks_switched_on(&xfs_dqtrx_hooks_switch)) {
+		struct xfs_mod_ino_dqtrx_params	p = {
+			.tx_id		= (uintptr_t)tp,
+			.ino		= ip->i_ino,
+			.q_type		= xfs_dquot_type(dqp),
+			.q_id		= dqp->q_id,
+			.delta		= delta
+		};
+		struct xfs_quotainfo	*qi = tp->t_mountp->m_quotainfo;
+
+		xfs_hooks_call(&qi->qi_mod_ino_dqtrx_hooks, field, &p);
+	}
+}
+
+/* Call the specified functions during a dquot counter update. */
+int
+xfs_dqtrx_hook_add(
+	struct xfs_quotainfo	*qi,
+	struct xfs_dqtrx_hook	*hook)
+{
+	int			error;
+
+	/*
+	 * Transactional dquot updates first call the mod hook when changes
+	 * are attached to the transaction and then call the apply hook when
+	 * those changes are committed (or canceled).
+	 *
+	 * The apply hook must be installed before the mod hook so that we
+	 * never fail to catch the end of a quota update sequence.
+	 */
+	error = xfs_hooks_add(&qi->qi_apply_dqtrx_hooks, &hook->apply_hook);
+	if (error)
+		goto out;
+
+	error = xfs_hooks_add(&qi->qi_mod_ino_dqtrx_hooks, &hook->mod_hook);
+	if (error)
+		goto out_apply;
+
+	return 0;
+
+out_apply:
+	xfs_hooks_del(&qi->qi_apply_dqtrx_hooks, &hook->apply_hook);
+out:
+	return error;
+}
+
+/* Stop calling the specified functions during a dquot counter update. */
+void
+xfs_dqtrx_hook_del(
+	struct xfs_quotainfo	*qi,
+	struct xfs_dqtrx_hook	*hook)
+{
+	/*
+	 * The mod hook must be removed before the apply hook to avoid
+	 * presenting the hook consumer with an incomplete update.  No hooks
+	 * should be running after these functions return.
+	 */
+	xfs_hooks_del(&qi->qi_mod_ino_dqtrx_hooks, &hook->mod_hook);
+	xfs_hooks_del(&qi->qi_apply_dqtrx_hooks, &hook->apply_hook);
+}
+
+/* Configure dquot update hook functions. */
+void
+xfs_dqtrx_hook_setup(
+	struct xfs_dqtrx_hook	*hook,
+	notifier_fn_t		mod_fn,
+	notifier_fn_t		apply_fn)
+{
+	xfs_hook_setup(&hook->mod_hook, mod_fn);
+	xfs_hook_setup(&hook->apply_hook, apply_fn);
+}
+#endif /* CONFIG_XFS_LIVE_HOOKS */
+
 /*
  * Wrap around mod_dquot to account for both user and group quotas.
  */
@@ -138,11 +248,11 @@ xfs_trans_mod_dquot_byino(
 		return;
 
 	if (XFS_IS_UQUOTA_ON(mp) && ip->i_udquot)
-		(void) xfs_trans_mod_dquot(tp, ip->i_udquot, field, delta);
+		xfs_trans_mod_ino_dquot(tp, ip, ip->i_udquot, field, delta);
 	if (XFS_IS_GQUOTA_ON(mp) && ip->i_gdquot)
-		(void) xfs_trans_mod_dquot(tp, ip->i_gdquot, field, delta);
+		xfs_trans_mod_ino_dquot(tp, ip, ip->i_gdquot, field, delta);
 	if (XFS_IS_PQUOTA_ON(mp) && ip->i_pdquot)
-		(void) xfs_trans_mod_dquot(tp, ip->i_pdquot, field, delta);
+		xfs_trans_mod_ino_dquot(tp, ip, ip->i_pdquot, field, delta);
 }
 
 STATIC struct xfs_dqtrx *
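Before moving on to the apply-time hooks in the next hunk, the dispatch shape used by xfs_trans_mod_ino_dquot() above is worth seeing in isolation: a cheap on/off switch guards the hook call, so the common case with no scrubber registered pays almost nothing, and a consumer only sees updates after it has added its hooks and flipped the switch on. The standalone C sketch below illustrates that shape only; my_hook, my_chain, and my_switch_on are invented stand-ins for this example, not the kernel's xfs_hooks API, which builds on jump labels and notifier callbacks (note the notifier_fn_t arguments above).

/* Minimal sketch of a switch-gated hook, using invented stand-in types. */
#include <stdbool.h>
#include <stdio.h>

struct my_hook {
	void (*fn)(unsigned int field, long delta);
};

static bool my_switch_on;		/* stand-in for the static key */
static struct my_hook *my_chain;	/* stand-in for the hook chain */

static void my_hooks_call(unsigned int field, long delta)
{
	if (my_chain)
		my_chain->fn(field, delta);
}

/* Analogue of xfs_trans_mod_ino_dquot(): do the real work, then notify. */
static void mod_dquot(unsigned int field, long delta)
{
	/* ... attach the delta to the transaction as usual ... */

	/* Skipped entirely while no consumer has enabled the switch. */
	if (my_switch_on)
		my_hooks_call(field, delta);
}

static void observe(unsigned int field, long delta)
{
	printf("hook saw field %u delta %ld\n", field, delta);
}

int main(void)
{
	struct my_hook hook = { .fn = observe };

	mod_dquot(1, 10);	/* no consumer: nothing reported */

	my_chain = &hook;	/* analogue of xfs_dqtrx_hook_add() */
	my_switch_on = true;	/* analogue of xfs_dqtrx_hook_enable() */
	mod_dquot(1, 10);	/* now the observer runs */
	return 0;
}

Note that the sketch installs the consumer before turning the switch on, mirroring the ordering rule that xfs_dqtrx_hook_add() enforces between the apply and mod hooks.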
@@ -322,6 +432,29 @@ xfs_apply_quota_reservation_deltas(
 	}
 }
 
+#ifdef CONFIG_XFS_LIVE_HOOKS
+/* Call downstream hooks now that it's time to apply dquot deltas. */
+static inline void
+xfs_trans_apply_dquot_deltas_hook(
+	struct xfs_trans		*tp,
+	struct xfs_dquot		*dqp)
+{
+	if (xfs_hooks_switched_on(&xfs_dqtrx_hooks_switch)) {
+		struct xfs_apply_dqtrx_params	p = {
+			.tx_id		= (uintptr_t)tp,
+			.q_type		= xfs_dquot_type(dqp),
+			.q_id		= dqp->q_id,
+		};
+		struct xfs_quotainfo	*qi = tp->t_mountp->m_quotainfo;
+
+		xfs_hooks_call(&qi->qi_apply_dqtrx_hooks,
+				XFS_APPLY_DQTRX_COMMIT, &p);
+	}
+}
+#else
+# define xfs_trans_apply_dquot_deltas_hook(tp, dqp)	((void)0)
+#endif /* CONFIG_XFS_LIVE_HOOKS */
+
 /*
  * Called by xfs_trans_commit() and similar in spirit to
  * xfs_trans_apply_sb_deltas().
@@ -367,6 +500,8 @@ xfs_trans_apply_dquot_deltas(
 
 			ASSERT(XFS_DQ_IS_LOCKED(dqp));
 
+			xfs_trans_apply_dquot_deltas_hook(tp, dqp);
+
 			/*
 			 * adjust the actual number of blocks used
 			 */
@@ -466,6 +601,29 @@ xfs_trans_apply_dquot_deltas(
 	}
 }
 
+#ifdef CONFIG_XFS_LIVE_HOOKS
+/* Call downstream hooks now that it's time to cancel dquot deltas. */
+static inline void
+xfs_trans_unreserve_and_mod_dquots_hook(
+	struct xfs_trans		*tp,
+	struct xfs_dquot		*dqp)
+{
+	if (xfs_hooks_switched_on(&xfs_dqtrx_hooks_switch)) {
+		struct xfs_apply_dqtrx_params	p = {
+			.tx_id		= (uintptr_t)tp,
+			.q_type		= xfs_dquot_type(dqp),
+			.q_id		= dqp->q_id,
+		};
+		struct xfs_quotainfo	*qi = tp->t_mountp->m_quotainfo;
+
+		xfs_hooks_call(&qi->qi_apply_dqtrx_hooks,
+				XFS_APPLY_DQTRX_UNRESERVE, &p);
+	}
+}
+#else
+# define xfs_trans_unreserve_and_mod_dquots_hook(tp, dqp)	((void)0)
+#endif /* CONFIG_XFS_LIVE_HOOKS */
+
 /*
  * Release the reservations, and adjust the dquots accordingly.
  * This is called only when the transaction is being aborted. If by
@@ -496,6 +654,9 @@ xfs_trans_unreserve_and_mod_dquots(
 			 */
 			if ((dqp = qtrx->qt_dquot) == NULL)
 				break;
+
+			xfs_trans_unreserve_and_mod_dquots_hook(tp, dqp);
+
 			/*
 			 * Unreserve the original reservation. We don't care
 			 * about the number of blocks used field, or deltas.
-- 
cgit v1.2.3

From 7038c6e5261eef2442bff086f80a493dbd8bfdc9 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" 
Date: Thu, 22 Feb 2024 12:30:56 -0800
Subject: xfs: repair cannot update the summary counters when logging quota flags

While running xfs/804 (quota repairs racing with fsstress), I observed a filesystem shutdown in the primary sb write verifier:

run fstests xfs/804 at 2022-05-23 18:43:48
XFS (sda4): Mounting V5 Filesystem
XFS (sda4): Ending clean mount
XFS (sda4): Quotacheck needed: Please wait.
XFS (sda4): Quotacheck: Done.
XFS (sda4): EXPERIMENTAL online scrub feature in use. Use at your own risk!
XFS (sda4): SB ifree sanity check failed 0xb5 > 0x80
XFS (sda4): Metadata corruption detected at xfs_sb_write_verify+0x5e/0x100 [xfs], xfs_sb block 0x0
XFS (sda4): Unmount and run xfs_repair

The "SB ifree sanity check failed" message was a debugging printk that I added to the kernel; observe that 0xb5 - 0x80 = 53, which is less than one inode chunk.

I traced this to the xfs_log_sb calls from the online quota repair code, which tries to clear the CHKD flags from the superblock to force a mount-time quotacheck if the repair fails. On a V5 filesystem, xfs_log_sb updates the ondisk sb summary counters with the current contents of the percpu counters. This is done without quiescing other writer threads, which means it could be racing with a thread that has updated icount and is about to update ifree. If the other write thread had incremented ifree before updating icount, the repair thread will write ifree > icount into the logged update.
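To make that interleaving concrete, here is a compressed illustration of the torn sample as a standalone C program. This is not kernel code: the summary struct is invented for the example, and the starting values are chosen so that the bad sample reproduces the ifree 0xb5 / icount 0x80 failure from the log quoted above.

/* Torn-sample illustration; invented types, not the XFS percpu counters. */
#include <stdio.h>
#include <stdint.h>

struct summary {
	uint64_t icount;	/* allocated inodes */
	uint64_t ifree;		/* free inodes; must never exceed icount */
};

int main(void)
{
	struct summary live = { .icount = 0x80, .ifree = 0x75 };
	struct summary logged;

	/* Writer thread begins an inode chunk allocation: ifree first. */
	live.ifree += 64;

	/* The xfs_log_sb-style sampler runs between the two updates. */
	logged = live;

	/* Writer thread finishes by bumping icount. */
	live.icount += 64;

	if (logged.ifree > logged.icount)
		printf("torn sample: ifree 0x%llx > icount 0x%llx\n",
		       (unsigned long long)logged.ifree,
		       (unsigned long long)logged.icount);
	return 0;
}

The sampled pair is stale by less than one inode chunk (64 inodes), which matches the 53-inode discrepancy observed in the shutdown report.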
If the AIL writes the logged superblock back to disk before anyone else fixes this situation, this will lead to a write verifier failure, which causes a filesystem shutdown.

Resolve this problem by updating the quota flags and calling xfs_sb_to_disk directly, which does not touch the percpu counters. While we're at it, we can elide the entire update if the selected qflags aren't set.

Signed-off-by: Darrick J. Wong 
Reviewed-by: Christoph Hellwig 
---
 fs/xfs/scrub/repair.c | 41 ++++++++++++++++++++++++++++++++++-------
 1 file changed, 34 insertions(+), 7 deletions(-)

diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index 745d5b8f405a..3d2c4dbb6909 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -687,6 +687,39 @@ xrep_find_ag_btree_roots(
 }
 
 #ifdef CONFIG_XFS_QUOTA
+/* Update some quota flags in the superblock. */
+static void
+xrep_update_qflags(
+	struct xfs_scrub	*sc,
+	unsigned int		clear_flags)
+{
+	struct xfs_mount	*mp = sc->mp;
+	struct xfs_buf		*bp;
+
+	mutex_lock(&mp->m_quotainfo->qi_quotaofflock);
+	if ((mp->m_qflags & clear_flags) == 0)
+		goto no_update;
+
+	mp->m_qflags &= ~clear_flags;
+	spin_lock(&mp->m_sb_lock);
+	mp->m_sb.sb_qflags &= ~clear_flags;
+	spin_unlock(&mp->m_sb_lock);
+
+	/*
+	 * Update the quota flags in the ondisk superblock without touching
+	 * the summary counters.  We have not quiesced inode chunk allocation,
+	 * so we cannot coordinate with updates to the icount and ifree percpu
+	 * counters.
+	 */
+	bp = xfs_trans_getsb(sc->tp);
+	xfs_sb_to_disk(bp->b_addr, &mp->m_sb);
+	xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_SB_BUF);
+	xfs_trans_log_buf(sc->tp, bp, 0, sizeof(struct xfs_dsb) - 1);
+
+no_update:
+	mutex_unlock(&sc->mp->m_quotainfo->qi_quotaofflock);
+}
+
 /* Force a quotacheck the next time we mount. */
 void
 xrep_force_quotacheck(
@@ -699,13 +732,7 @@ xrep_force_quotacheck(
 	if (!(flag & sc->mp->m_qflags))
 		return;
 
-	mutex_lock(&sc->mp->m_quotainfo->qi_quotaofflock);
-	sc->mp->m_qflags &= ~flag;
-	spin_lock(&sc->mp->m_sb_lock);
-	sc->mp->m_sb.sb_qflags &= ~flag;
-	spin_unlock(&sc->mp->m_sb_lock);
-	xfs_log_sb(sc->tp);
-	mutex_unlock(&sc->mp->m_quotainfo->qi_quotaofflock);
+	xrep_update_qflags(sc, flag);
 }
 
 /*
-- 
cgit v1.2.3

From 96ed2ae4a9b06b417e1c20c086c77755a43284bf Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" 
Date: Thu, 22 Feb 2024 12:30:57 -0800
Subject: xfs: repair dquots based on live quotacheck results

Use the shadow quota counters that live quotacheck creates to reset the incore dquot counters.

Signed-off-by: Darrick J. Wong 
Reviewed-by: Christoph Hellwig 
---
 fs/xfs/Makefile | 1 +
 fs/xfs/scrub/quotacheck.c | 4 +-
 fs/xfs/scrub/quotacheck.h | 3 +
 fs/xfs/scrub/quotacheck_repair.c | 261 +++++++++++++++++++++++++++++++++++++++
 fs/xfs/scrub/repair.c | 13 +-
 fs/xfs/scrub/repair.h | 5 +
 fs/xfs/scrub/scrub.c | 2 +-
 fs/xfs/scrub/trace.h | 1 +
 8 files changed, 284 insertions(+), 6 deletions(-)
 create mode 100644 fs/xfs/scrub/quotacheck_repair.c

diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 951404c3e05c..68891e6ee08e 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -204,6 +204,7 @@ xfs-$(CONFIG_XFS_RT) += $(addprefix scrub/, \
 
 xfs-$(CONFIG_XFS_QUOTA) += $(addprefix scrub/, \
 				   quota_repair.o \
+				   quotacheck_repair.o \
 				   )
 endif
 endif
diff --git a/fs/xfs/scrub/quotacheck.c b/fs/xfs/scrub/quotacheck.c
index 2cd55f2a0c05..c77eb2de8df7 100644
--- a/fs/xfs/scrub/quotacheck.c
+++ b/fs/xfs/scrub/quotacheck.c
@@ -102,7 +102,9 @@ xchk_setup_quotacheck(
 * set the INCOMPLETE flag even when a negative errno is returned. 
This care * must be taken with certain errno values (i.e. EFSBADCRC, EFSCORRUPTED, * ECANCELED) that are absorbed into a scrub state flag update by - * xchk_*_process_error. + * xchk_*_process_error. Scrub and repair share the same incore data + * structures, so the INCOMPLETE flag is critical to prevent a repair based on + * insufficient information. * * Because we are scanning a live filesystem, it's possible that another thread * will try to update the quota counters for an inode that we've already diff --git a/fs/xfs/scrub/quotacheck.h b/fs/xfs/scrub/quotacheck.h index 244e8cc0f8e9..4ea5f249c978 100644 --- a/fs/xfs/scrub/quotacheck.h +++ b/fs/xfs/scrub/quotacheck.h @@ -30,6 +30,9 @@ struct xqcheck_dquot { /* Already checked this dquot. */ #define XQCHECK_DQUOT_COMPARE_SCANNED (1U << 1) +/* Already repaired this dquot. */ +#define XQCHECK_DQUOT_REPAIR_SCANNED (1U << 2) + /* Live quotacheck control structure. */ struct xqcheck { struct xfs_scrub *sc; diff --git a/fs/xfs/scrub/quotacheck_repair.c b/fs/xfs/scrub/quotacheck_repair.c new file mode 100644 index 000000000000..dd8554c755b5 --- /dev/null +++ b/fs/xfs/scrub/quotacheck_repair.c @@ -0,0 +1,261 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2020-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_inode.h" +#include "xfs_quota.h" +#include "xfs_qm.h" +#include "xfs_icache.h" +#include "xfs_bmap_util.h" +#include "xfs_iwalk.h" +#include "xfs_ialloc.h" +#include "xfs_sb.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/repair.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" +#include "scrub/iscan.h" +#include "scrub/quota.h" +#include "scrub/quotacheck.h" +#include "scrub/trace.h" + +/* + * Live Quotacheck Repair + * ====================== + * + * Use the live quota counter information that we collected to replace the + * counter values in the incore dquots. A scrub->repair cycle should have left + * the live data and hooks active, so this is safe so long as we make sure the + * dquot is locked. + */ + +/* Commit new counters to a dquot. */ +static int +xqcheck_commit_dquot( + struct xqcheck *xqc, + xfs_dqtype_t dqtype, + struct xfs_dquot *dq) +{ + struct xqcheck_dquot xcdq; + struct xfarray *counts = xqcheck_counters_for(xqc, dqtype); + int64_t delta; + bool dirty = false; + int error = 0; + + /* Unlock the dquot just long enough to allocate a transaction. */ + xfs_dqunlock(dq); + error = xchk_trans_alloc(xqc->sc, 0); + xfs_dqlock(dq); + if (error) + return error; + + xfs_trans_dqjoin(xqc->sc->tp, dq); + + if (xchk_iscan_aborted(&xqc->iscan)) { + error = -ECANCELED; + goto out_cancel; + } + + mutex_lock(&xqc->lock); + error = xfarray_load_sparse(counts, dq->q_id, &xcdq); + if (error) + goto out_unlock; + + /* Adjust counters as needed. 
*/ + delta = (int64_t)xcdq.icount - dq->q_ino.count; + if (delta) { + dq->q_ino.reserved += delta; + dq->q_ino.count += delta; + dirty = true; + } + + delta = (int64_t)xcdq.bcount - dq->q_blk.count; + if (delta) { + dq->q_blk.reserved += delta; + dq->q_blk.count += delta; + dirty = true; + } + + delta = (int64_t)xcdq.rtbcount - dq->q_rtb.count; + if (delta) { + dq->q_rtb.reserved += delta; + dq->q_rtb.count += delta; + dirty = true; + } + + xcdq.flags |= (XQCHECK_DQUOT_REPAIR_SCANNED | XQCHECK_DQUOT_WRITTEN); + error = xfarray_store(counts, dq->q_id, &xcdq); + if (error == -EFBIG) { + /* + * EFBIG means we tried to store data at too high a byte offset + * in the sparse array. IOWs, we cannot complete the repair + * and must cancel the whole operation. This should never + * happen, but we need to catch it anyway. + */ + error = -ECANCELED; + } + mutex_unlock(&xqc->lock); + if (error || !dirty) + goto out_cancel; + + trace_xrep_quotacheck_dquot(xqc->sc->mp, dq->q_type, dq->q_id); + + /* Commit the dirty dquot to disk. */ + dq->q_flags |= XFS_DQFLAG_DIRTY; + if (dq->q_id) + xfs_qm_adjust_dqtimers(dq); + xfs_trans_log_dquot(xqc->sc->tp, dq); + + /* + * Transaction commit unlocks the dquot, so we must re-lock it so that + * the caller can put the reference (which apparently requires a locked + * dquot). + */ + error = xrep_trans_commit(xqc->sc); + xfs_dqlock(dq); + return error; + +out_unlock: + mutex_unlock(&xqc->lock); +out_cancel: + xchk_trans_cancel(xqc->sc); + + /* Re-lock the dquot so the caller can put the reference. */ + xfs_dqlock(dq); + return error; +} + +/* Commit new quota counters for a particular quota type. */ +STATIC int +xqcheck_commit_dqtype( + struct xqcheck *xqc, + unsigned int dqtype) +{ + struct xchk_dqiter cursor = { }; + struct xqcheck_dquot xcdq; + struct xfs_scrub *sc = xqc->sc; + struct xfs_mount *mp = sc->mp; + struct xfarray *counts = xqcheck_counters_for(xqc, dqtype); + struct xfs_dquot *dq; + xfarray_idx_t cur = XFARRAY_CURSOR_INIT; + int error; + + /* + * Update the counters of every dquot that the quota file knows about. + */ + xchk_dqiter_init(&cursor, sc, dqtype); + while ((error = xchk_dquot_iter(&cursor, &dq)) == 1) { + error = xqcheck_commit_dquot(xqc, dqtype, dq); + xfs_qm_dqput(dq); + if (error) + break; + } + if (error) + return error; + + /* + * Make a second pass to deal with the dquots that we know about but + * the quota file previously did not know about. + */ + mutex_lock(&xqc->lock); + while ((error = xfarray_iter(counts, &cur, &xcdq)) == 1) { + xfs_dqid_t id = cur - 1; + + if (xcdq.flags & XQCHECK_DQUOT_REPAIR_SCANNED) + continue; + + mutex_unlock(&xqc->lock); + + /* + * Grab the dquot, allowing for dquot block allocation in a + * separate transaction. We committed the scrub transaction + * in a previous step, so we will not be creating nested + * transactions here. + */ + error = xfs_qm_dqget(mp, id, dqtype, true, &dq); + if (error) + return error; + + error = xqcheck_commit_dquot(xqc, dqtype, dq); + xfs_qm_dqput(dq); + if (error) + return error; + + mutex_lock(&xqc->lock); + } + mutex_unlock(&xqc->lock); + + return error; +} + +/* Figure out quota CHKD flags for the running quota types. */ +static inline unsigned int +xqcheck_chkd_flags( + struct xfs_mount *mp) +{ + unsigned int ret = 0; + + if (XFS_IS_UQUOTA_ON(mp)) + ret |= XFS_UQUOTA_CHKD; + if (XFS_IS_GQUOTA_ON(mp)) + ret |= XFS_GQUOTA_CHKD; + if (XFS_IS_PQUOTA_ON(mp)) + ret |= XFS_PQUOTA_CHKD; + return ret; +} + +/* Commit the new dquot counters. 
*/ +int +xrep_quotacheck( + struct xfs_scrub *sc) +{ + struct xqcheck *xqc = sc->buf; + unsigned int qflags = xqcheck_chkd_flags(sc->mp); + int error; + + /* + * Clear the CHKD flag for the running quota types and commit the scrub + * transaction so that we can allocate new quota block mappings if we + * have to. If we crash after this point, the sb still has the CHKD + * flags cleared, so mount quotacheck will fix all of this up. + */ + xrep_update_qflags(sc, qflags, 0); + error = xrep_trans_commit(sc); + if (error) + return error; + + /* Commit the new counters to the dquots. */ + if (xqc->ucounts) { + error = xqcheck_commit_dqtype(xqc, XFS_DQTYPE_USER); + if (error) + return error; + } + if (xqc->gcounts) { + error = xqcheck_commit_dqtype(xqc, XFS_DQTYPE_GROUP); + if (error) + return error; + } + if (xqc->pcounts) { + error = xqcheck_commit_dqtype(xqc, XFS_DQTYPE_PROJ); + if (error) + return error; + } + + /* Set the CHKD flags now that we've fixed quota counts. */ + error = xchk_trans_alloc(sc, 0); + if (error) + return error; + + xrep_update_qflags(sc, 0, qflags); + return xrep_trans_commit(sc); +} diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index 3d2c4dbb6909..7141b1778902 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -688,21 +688,26 @@ xrep_find_ag_btree_roots( #ifdef CONFIG_XFS_QUOTA /* Update some quota flags in the superblock. */ -static void +void xrep_update_qflags( struct xfs_scrub *sc, - unsigned int clear_flags) + unsigned int clear_flags, + unsigned int set_flags) { struct xfs_mount *mp = sc->mp; struct xfs_buf *bp; mutex_lock(&mp->m_quotainfo->qi_quotaofflock); - if ((mp->m_qflags & clear_flags) == 0) + if ((mp->m_qflags & clear_flags) == 0 && + (mp->m_qflags & set_flags) == set_flags) goto no_update; mp->m_qflags &= ~clear_flags; + mp->m_qflags |= set_flags; + spin_lock(&mp->m_sb_lock); mp->m_sb.sb_qflags &= ~clear_flags; + mp->m_sb.sb_qflags |= set_flags; spin_unlock(&mp->m_sb_lock); /* @@ -732,7 +737,7 @@ xrep_force_quotacheck( if (!(flag & sc->mp->m_qflags)) return; - xrep_update_qflags(sc, flag); + xrep_update_qflags(sc, flag, 0); } /* diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h index 17114327e6fa..fdfa06699921 100644 --- a/fs/xfs/scrub/repair.h +++ b/fs/xfs/scrub/repair.h @@ -72,6 +72,8 @@ int xrep_find_ag_btree_roots(struct xfs_scrub *sc, struct xfs_buf *agf_bp, struct xrep_find_ag_btree *btree_info, struct xfs_buf *agfl_bp); #ifdef CONFIG_XFS_QUOTA +void xrep_update_qflags(struct xfs_scrub *sc, unsigned int clear_flags, + unsigned int set_flags); void xrep_force_quotacheck(struct xfs_scrub *sc, xfs_dqtype_t type); int xrep_ino_dqattach(struct xfs_scrub *sc); #else @@ -123,8 +125,10 @@ int xrep_rtbitmap(struct xfs_scrub *sc); #ifdef CONFIG_XFS_QUOTA int xrep_quota(struct xfs_scrub *sc); +int xrep_quotacheck(struct xfs_scrub *sc); #else # define xrep_quota xrep_notsupported +# define xrep_quotacheck xrep_notsupported #endif /* CONFIG_XFS_QUOTA */ int xrep_reinit_pagf(struct xfs_scrub *sc); @@ -191,6 +195,7 @@ xrep_setup_nothing( #define xrep_bmap_cow xrep_notsupported #define xrep_rtbitmap xrep_notsupported #define xrep_quota xrep_notsupported +#define xrep_quotacheck xrep_notsupported #endif /* CONFIG_XFS_ONLINE_REPAIR */ diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index 71a9eb48e1de..9112c0985c62 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -367,7 +367,7 @@ static const struct xchk_meta_ops meta_scrub_ops[] = { .type = ST_FS, .setup = xchk_setup_quotacheck, .scrub = xchk_quotacheck, - 
.repair = xrep_notsupported,
+		.repair	= xrep_quotacheck,
 	},
 };
 
diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
index 6c90bc7a316b..fedcebf90a42 100644
--- a/fs/xfs/scrub/trace.h
+++ b/fs/xfs/scrub/trace.h
@@ -2004,6 +2004,7 @@ DEFINE_EVENT(xrep_dquot_class, name, \
 DEFINE_XREP_DQUOT_EVENT(xrep_dquot_item);
 DEFINE_XREP_DQUOT_EVENT(xrep_disk_dquot);
 DEFINE_XREP_DQUOT_EVENT(xrep_dquot_item_fill_bmap_hole);
+DEFINE_XREP_DQUOT_EVENT(xrep_quotacheck_dquot);
 #endif /* CONFIG_XFS_QUOTA */
 
 #endif /* IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) */
-- 
cgit v1.2.3

From 93687ee2e3748a4a6b541ff0d83d1480815b00a9 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" 
Date: Thu, 22 Feb 2024 12:30:58 -0800
Subject: xfs: report health of inode link counts

Report on the health of the inode link counts.

Signed-off-by: Darrick J. Wong 
Reviewed-by: Christoph Hellwig 
---
 fs/xfs/libxfs/xfs_fs.h | 1 +
 fs/xfs/libxfs/xfs_health.h | 4 +++-
 fs/xfs/xfs_health.c | 1 +
 3 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 07acbed9235c..f10d0aa0e337 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -196,6 +196,7 @@ struct xfs_fsop_geom {
 #define XFS_FSOP_GEOM_SICK_RT_BITMAP	(1 << 4)  /* realtime bitmap */
 #define XFS_FSOP_GEOM_SICK_RT_SUMMARY	(1 << 5)  /* realtime summary */
 #define XFS_FSOP_GEOM_SICK_QUOTACHECK	(1 << 6)  /* quota counts */
+#define XFS_FSOP_GEOM_SICK_NLINKS	(1 << 7)  /* inode link counts */
 
 /* Output for XFS_FS_COUNTS */
 typedef struct xfs_fsop_counts {
diff --git a/fs/xfs/libxfs/xfs_health.h b/fs/xfs/libxfs/xfs_health.h
index 5626e53b3f0f..2bfe2dc404a1 100644
--- a/fs/xfs/libxfs/xfs_health.h
+++ b/fs/xfs/libxfs/xfs_health.h
@@ -42,6 +42,7 @@ struct xfs_fsop_geom;
 #define XFS_SICK_FS_GQUOTA	(1 << 2)  /* group quota */
 #define XFS_SICK_FS_PQUOTA	(1 << 3)  /* project quota */
 #define XFS_SICK_FS_QUOTACHECK	(1 << 4)  /* quota counts */
+#define XFS_SICK_FS_NLINKS	(1 << 5)  /* inode link counts */
 
 /* Observable health issues for realtime volume metadata. */
 #define XFS_SICK_RT_BITMAP	(1 << 0)  /* realtime bitmap */
@@ -79,7 +80,8 @@ struct xfs_fsop_geom;
 				 XFS_SICK_FS_UQUOTA | \
 				 XFS_SICK_FS_GQUOTA | \
 				 XFS_SICK_FS_PQUOTA | \
-				 XFS_SICK_FS_QUOTACHECK)
+				 XFS_SICK_FS_QUOTACHECK | \
+				 XFS_SICK_FS_NLINKS)
 
 #define XFS_SICK_RT_PRIMARY	(XFS_SICK_RT_BITMAP | \
 				 XFS_SICK_RT_SUMMARY)
diff --git a/fs/xfs/xfs_health.c b/fs/xfs/xfs_health.c
index ef07af9f753d..111c27a6b107 100644
--- a/fs/xfs/xfs_health.c
+++ b/fs/xfs/xfs_health.c
@@ -281,6 +281,7 @@ static const struct ioctl_sick_map fs_map[] = {
 	{ XFS_SICK_FS_GQUOTA,	XFS_FSOP_GEOM_SICK_GQUOTA },
 	{ XFS_SICK_FS_PQUOTA,	XFS_FSOP_GEOM_SICK_PQUOTA },
 	{ XFS_SICK_FS_QUOTACHECK, XFS_FSOP_GEOM_SICK_QUOTACHECK },
+	{ XFS_SICK_FS_NLINKS,	XFS_FSOP_GEOM_SICK_NLINKS },
 	{ 0, 0 },
 };
-- 
cgit v1.2.3

From f1184081ac97625d30c59851944f4c59ae7ddc2b Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" 
Date: Thu, 22 Feb 2024 12:30:58 -0800
Subject: xfs: teach scrub to check file nlinks

Create the necessary scrub code to walk the filesystem's directory tree so that we can compute file link counts. Similar to quotacheck, we create an incore shadow array of link count information and then we walk the filesystem a second time to compare the link counts. We need live updates to keep the information up to date during the lengthy scan, so this scrubber remains disabled until the next patch.
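The collect-then-compare structure described above is easier to see in miniature. The following toy program is for illustration only: a plain array stands in for the sparse xfarray, the inode numbers and counts are invented, and the parent/backref/child split and live-update hooks of the real scanner are omitted.

/* Toy two-pass link count check; invented data, not the kernel API. */
#include <stdio.h>
#include <stdint.h>

#define NFILES 4

/* Link counts as recorded in the (possibly corrupt) inodes. */
static uint32_t live_nlink[NFILES] = { 3, 2, 3, 1 };

/* Shadow observations accumulated by the directory walk. */
static uint32_t shadow_nlink[NFILES];

/* Pass 1: bump the shadow count for every directory entry we see. */
static void observe_dirent(unsigned int ino)
{
	shadow_nlink[ino]++;
}

int main(void)
{
	/* Inode numbers referenced by the walked directory entries. */
	static const unsigned int dirents[] = { 0, 0, 0, 1, 1, 2, 2, 3 };
	unsigned int i, ino;

	for (i = 0; i < sizeof(dirents) / sizeof(dirents[0]); i++)
		observe_dirent(dirents[i]);

	/* Pass 2: a mismatch marks the inode's link count corrupt. */
	for (ino = 0; ino < NFILES; ino++) {
		if (shadow_nlink[ino] != live_nlink[ino])
			printf("ino %u: on-disk nlink %u, observed %u\n",
			       ino, live_nlink[ino], shadow_nlink[ino]);
	}
	return 0;
}

Here inode 2 claims three links but only two directory entries reference it, so the comparison pass flags it; the real scrubber reports the same condition via xchk_ino_set_corrupt().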
Signed-off-by: Darrick J. Wong 
Reviewed-by: Christoph Hellwig 
---
 fs/xfs/Makefile | 1 +
 fs/xfs/libxfs/xfs_fs.h | 3 +-
 fs/xfs/scrub/common.h | 1 +
 fs/xfs/scrub/health.c | 1 +
 fs/xfs/scrub/nlinks.c | 839 +++++++++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/scrub/nlinks.h | 93 ++++++
 fs/xfs/scrub/scrub.c | 6 +
 fs/xfs/scrub/scrub.h | 1 +
 fs/xfs/scrub/stats.c | 1 +
 fs/xfs/scrub/trace.c | 2 +
 fs/xfs/scrub/trace.h | 147 ++++++++-
 11 files changed, 1093 insertions(+), 2 deletions(-)
 create mode 100644 fs/xfs/scrub/nlinks.c
 create mode 100644 fs/xfs/scrub/nlinks.h

diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 68891e6ee08e..e53a26b58046 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -160,6 +160,7 @@ xfs-y				+= $(addprefix scrub/, \
 				   ialloc.o \
 				   inode.o \
 				   iscan.o \
+				   nlinks.o \
 				   parent.o \
 				   readdir.o \
 				   refcount.o \
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index f10d0aa0e337..515cd27d3b3a 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -712,9 +712,10 @@ struct xfs_scrub_metadata {
 #define XFS_SCRUB_TYPE_PQUOTA	23	/* project quotas */
 #define XFS_SCRUB_TYPE_FSCOUNTERS 24	/* fs summary counters */
 #define XFS_SCRUB_TYPE_QUOTACHECK 25	/* quota counters */
+#define XFS_SCRUB_TYPE_NLINKS	26	/* inode link counts */
 
 /* Number of scrub subcommands. */
-#define XFS_SCRUB_TYPE_NR	26
+#define XFS_SCRUB_TYPE_NR	27
 
 /* i: Repair this metadata. */
 #define XFS_SCRUB_IFLAG_REPAIR		(1u << 0)
diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h
index eb51037cd0d2..529a510dc76f 100644
--- a/fs/xfs/scrub/common.h
+++ b/fs/xfs/scrub/common.h
@@ -129,6 +129,7 @@ xchk_setup_quotacheck(struct xfs_scrub *sc)
 }
 #endif
 int xchk_setup_fscounters(struct xfs_scrub *sc);
+int xchk_setup_nlinks(struct xfs_scrub *sc);
 
 void xchk_ag_free(struct xfs_scrub *sc, struct xchk_ag *sa);
 int xchk_ag_init(struct xfs_scrub *sc, xfs_agnumber_t agno,
diff --git a/fs/xfs/scrub/health.c b/fs/xfs/scrub/health.c
index 3c9eac070796..34519fbc2d40 100644
--- a/fs/xfs/scrub/health.c
+++ b/fs/xfs/scrub/health.c
@@ -106,6 +106,7 @@ static const struct xchk_health_map type_to_health_flag[XFS_SCRUB_TYPE_NR] = {
 	[XFS_SCRUB_TYPE_PQUOTA]		= { XHG_FS,  XFS_SICK_FS_PQUOTA },
 	[XFS_SCRUB_TYPE_FSCOUNTERS]	= { XHG_FS,  XFS_SICK_FS_COUNTERS },
 	[XFS_SCRUB_TYPE_QUOTACHECK]	= { XHG_FS,  XFS_SICK_FS_QUOTACHECK },
+	[XFS_SCRUB_TYPE_NLINKS]		= { XHG_FS,  XFS_SICK_FS_NLINKS },
 };
 
 /* Return the health status mask for this scrub type. */
diff --git a/fs/xfs/scrub/nlinks.c b/fs/xfs/scrub/nlinks.c
new file mode 100644
index 000000000000..c899a50a83da
--- /dev/null
+++ b/fs/xfs/scrub/nlinks.c
@@ -0,0 +1,839 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2021-2024 Oracle.  All Rights Reserved.
+ * Author: Darrick J. 
Wong + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_inode.h" +#include "xfs_icache.h" +#include "xfs_iwalk.h" +#include "xfs_ialloc.h" +#include "xfs_dir2.h" +#include "xfs_dir2_priv.h" +#include "xfs_ag.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/repair.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" +#include "scrub/iscan.h" +#include "scrub/nlinks.h" +#include "scrub/trace.h" +#include "scrub/readdir.h" + +/* + * Live Inode Link Count Checking + * ============================== + * + * Inode link counts are "summary" metadata, in the sense that they are + * computed as the number of directory entries referencing each file on the + * filesystem. Therefore, we compute the correct link counts by creating a + * shadow link count structure and walking every inode. + */ + +/* Set us up to scrub inode link counts. */ +int +xchk_setup_nlinks( + struct xfs_scrub *sc) +{ + /* Not ready for general consumption yet. */ + return -EOPNOTSUPP; + + sc->buf = kzalloc(sizeof(struct xchk_nlink_ctrs), XCHK_GFP_FLAGS); + if (!sc->buf) + return -ENOMEM; + + return xchk_setup_fs(sc); +} + +/* + * Part 1: Collecting file link counts. For each file, we create a shadow link + * counting structure, then walk the entire directory tree, incrementing parent + * and child link counts for each directory entry seen. + * + * To avoid false corruption reports in part 2, any failure in this part must + * set the INCOMPLETE flag even when a negative errno is returned. This care + * must be taken with certain errno values (i.e. EFSBADCRC, EFSCORRUPTED, + * ECANCELED) that are absorbed into a scrub state flag update by + * xchk_*_process_error. + */ + +/* + * Add a delta to an nlink counter, clamping the value to U32_MAX. Because + * XFS_MAXLINK < U32_MAX, the checking code will produce the correct results + * even if we lose some precision. + */ +static inline void +careful_add( + xfs_nlink_t *nlinkp, + int delta) +{ + uint64_t new_value = (uint64_t)(*nlinkp) + delta; + + BUILD_BUG_ON(XFS_MAXLINK > U32_MAX); + *nlinkp = min_t(uint64_t, new_value, U32_MAX); +} + +/* Update incore link count information. Caller must hold the nlinks lock. */ +STATIC int +xchk_nlinks_update_incore( + struct xchk_nlink_ctrs *xnc, + xfs_ino_t ino, + int parents_delta, + int backrefs_delta, + int children_delta) +{ + struct xchk_nlink nl; + int error; + + if (!xnc->nlinks) + return 0; + + error = xfarray_load_sparse(xnc->nlinks, ino, &nl); + if (error) + return error; + + trace_xchk_nlinks_update_incore(xnc->sc->mp, ino, &nl, parents_delta, + backrefs_delta, children_delta); + + careful_add(&nl.parents, parents_delta); + careful_add(&nl.backrefs, backrefs_delta); + careful_add(&nl.children, children_delta); + + nl.flags |= XCHK_NLINK_WRITTEN; + error = xfarray_store(xnc->nlinks, ino, &nl); + if (error == -EFBIG) { + /* + * EFBIG means we tried to store data at too high a byte offset + * in the sparse array. IOWs, we cannot complete the check and + * must notify userspace that the check was incomplete. + */ + error = -ECANCELED; + } + return error; +} + +/* Bump the observed link count for the inode referenced by this entry. 
*/ +STATIC int +xchk_nlinks_collect_dirent( + struct xfs_scrub *sc, + struct xfs_inode *dp, + xfs_dir2_dataptr_t dapos, + const struct xfs_name *name, + xfs_ino_t ino, + void *priv) +{ + struct xchk_nlink_ctrs *xnc = priv; + bool dot = false, dotdot = false; + int error; + + /* Does this name make sense? */ + if (name->len == 0 || !xfs_dir2_namecheck(name->name, name->len)) { + error = -ECANCELED; + goto out_abort; + } + + if (name->len == 1 && name->name[0] == '.') + dot = true; + else if (name->len == 2 && name->name[0] == '.' && + name->name[1] == '.') + dotdot = true; + + /* Don't accept a '.' entry that points somewhere else. */ + if (dot && ino != dp->i_ino) { + error = -ECANCELED; + goto out_abort; + } + + /* Don't accept an invalid inode number. */ + if (!xfs_verify_dir_ino(sc->mp, ino)) { + error = -ECANCELED; + goto out_abort; + } + + /* Update the shadow link counts if we haven't already failed. */ + + if (xchk_iscan_aborted(&xnc->collect_iscan)) { + error = -ECANCELED; + goto out_incomplete; + } + + trace_xchk_nlinks_collect_dirent(sc->mp, dp, ino, name); + + mutex_lock(&xnc->lock); + + /* + * If this is a dotdot entry, it is a back link from dp to ino. How + * we handle this depends on whether or not dp is the root directory. + * + * The root directory is its own parent, so we pretend the dotdot entry + * establishes the "parent" of the root directory. Increment the + * number of parents of the root directory. + * + * Otherwise, increment the number of backrefs pointing back to ino. + */ + if (dotdot) { + if (dp == sc->mp->m_rootip) + error = xchk_nlinks_update_incore(xnc, ino, 1, 0, 0); + else + error = xchk_nlinks_update_incore(xnc, ino, 0, 1, 0); + if (error) + goto out_unlock; + } + + /* + * If this dirent is a forward link from dp to ino, increment the + * number of parents linking into ino. + */ + if (!dot && !dotdot) { + error = xchk_nlinks_update_incore(xnc, ino, 1, 0, 0); + if (error) + goto out_unlock; + } + + /* + * If this dirent is a forward link to a subdirectory, increment the + * number of child links of dp. + */ + if (!dot && !dotdot && name->type == XFS_DIR3_FT_DIR) { + error = xchk_nlinks_update_incore(xnc, dp->i_ino, 0, 0, 1); + if (error) + goto out_unlock; + } + + mutex_unlock(&xnc->lock); + return 0; + +out_unlock: + mutex_unlock(&xnc->lock); +out_abort: + xchk_iscan_abort(&xnc->collect_iscan); +out_incomplete: + xchk_set_incomplete(sc); + return error; +} + +/* Walk a directory to bump the observed link counts of the children. */ +STATIC int +xchk_nlinks_collect_dir( + struct xchk_nlink_ctrs *xnc, + struct xfs_inode *dp) +{ + struct xfs_scrub *sc = xnc->sc; + unsigned int lock_mode; + int error = 0; + + /* Prevent anyone from changing this directory while we walk it. */ + xfs_ilock(dp, XFS_IOLOCK_SHARED); + lock_mode = xfs_ilock_data_map_shared(dp); + + /* + * The dotdot entry of an unlinked directory still points to the last + * parent, but the parent no longer links to this directory. Skip the + * directory to avoid overcounting. + */ + if (VFS_I(dp)->i_nlink == 0) + goto out_unlock; + + /* + * We cannot count file links if the directory looks as though it has + * been zapped by the inode record repair code. 
+ */ + if (xchk_dir_looks_zapped(dp)) { + error = -EBUSY; + goto out_abort; + } + + error = xchk_dir_walk(sc, dp, xchk_nlinks_collect_dirent, xnc); + if (error == -ECANCELED) { + error = 0; + goto out_unlock; + } + if (error) + goto out_abort; + + xchk_iscan_mark_visited(&xnc->collect_iscan, dp); + goto out_unlock; + +out_abort: + xchk_set_incomplete(sc); + xchk_iscan_abort(&xnc->collect_iscan); +out_unlock: + xfs_iunlock(dp, lock_mode); + xfs_iunlock(dp, XFS_IOLOCK_SHARED); + return error; +} + +/* If this looks like a valid pointer, count it. */ +static inline int +xchk_nlinks_collect_metafile( + struct xchk_nlink_ctrs *xnc, + xfs_ino_t ino) +{ + if (!xfs_verify_ino(xnc->sc->mp, ino)) + return 0; + + trace_xchk_nlinks_collect_metafile(xnc->sc->mp, ino); + return xchk_nlinks_update_incore(xnc, ino, 1, 0, 0); +} + +/* Bump the link counts of metadata files rooted in the superblock. */ +STATIC int +xchk_nlinks_collect_metafiles( + struct xchk_nlink_ctrs *xnc) +{ + struct xfs_mount *mp = xnc->sc->mp; + int error = -ECANCELED; + + + if (xchk_iscan_aborted(&xnc->collect_iscan)) + goto out_incomplete; + + mutex_lock(&xnc->lock); + error = xchk_nlinks_collect_metafile(xnc, mp->m_sb.sb_rbmino); + if (error) + goto out_abort; + + error = xchk_nlinks_collect_metafile(xnc, mp->m_sb.sb_rsumino); + if (error) + goto out_abort; + + error = xchk_nlinks_collect_metafile(xnc, mp->m_sb.sb_uquotino); + if (error) + goto out_abort; + + error = xchk_nlinks_collect_metafile(xnc, mp->m_sb.sb_gquotino); + if (error) + goto out_abort; + + error = xchk_nlinks_collect_metafile(xnc, mp->m_sb.sb_pquotino); + if (error) + goto out_abort; + mutex_unlock(&xnc->lock); + + return 0; + +out_abort: + mutex_unlock(&xnc->lock); + xchk_iscan_abort(&xnc->collect_iscan); +out_incomplete: + xchk_set_incomplete(xnc->sc); + return error; +} + +/* Advance the collection scan cursor for this non-directory file. */ +static inline int +xchk_nlinks_collect_file( + struct xchk_nlink_ctrs *xnc, + struct xfs_inode *ip) +{ + xfs_ilock(ip, XFS_IOLOCK_SHARED); + xchk_iscan_mark_visited(&xnc->collect_iscan, ip); + xfs_iunlock(ip, XFS_IOLOCK_SHARED); + return 0; +} + +/* Walk all directories and count inode links. */ +STATIC int +xchk_nlinks_collect( + struct xchk_nlink_ctrs *xnc) +{ + struct xfs_scrub *sc = xnc->sc; + struct xfs_inode *ip; + int error; + + /* Count the rt and quota files that are rooted in the superblock. */ + error = xchk_nlinks_collect_metafiles(xnc); + if (error) + return error; + + /* + * Set up for a potentially lengthy filesystem scan by reducing our + * transaction resource usage for the duration. Specifically: + * + * Cancel the transaction to release the log grant space while we scan + * the filesystem. + * + * Create a new empty transaction to eliminate the possibility of the + * inode scan deadlocking on cyclical metadata. + * + * We pass the empty transaction to the file scanning function to avoid + * repeatedly cycling empty transactions. This can be done even though + * we take the IOLOCK to quiesce the file because empty transactions + * do not take sb_internal. 
+ */ + xchk_trans_cancel(sc); + error = xchk_trans_alloc_empty(sc); + if (error) + return error; + + while ((error = xchk_iscan_iter(&xnc->collect_iscan, &ip)) == 1) { + if (S_ISDIR(VFS_I(ip)->i_mode)) + error = xchk_nlinks_collect_dir(xnc, ip); + else + error = xchk_nlinks_collect_file(xnc, ip); + xchk_irele(sc, ip); + if (error) + break; + + if (xchk_should_terminate(sc, &error)) + break; + } + xchk_iscan_iter_finish(&xnc->collect_iscan); + if (error) { + xchk_set_incomplete(sc); + /* + * If we couldn't grab an inode that was busy with a state + * change, change the error code so that we exit to userspace + * as quickly as possible. + */ + if (error == -EBUSY) + return -ECANCELED; + return error; + } + + /* + * Switch out for a real transaction in preparation for building a new + * tree. + */ + xchk_trans_cancel(sc); + return xchk_setup_fs(sc); +} + +/* + * Part 2: Comparing file link counters. Walk each inode and compare the link + * counts against our shadow information; and then walk each shadow link count + * structure (that wasn't covered in the first part), comparing it against the + * file. + */ + +/* Read the observed link count for comparison with the actual inode. */ +STATIC int +xchk_nlinks_comparison_read( + struct xchk_nlink_ctrs *xnc, + xfs_ino_t ino, + struct xchk_nlink *obs) +{ + struct xchk_nlink nl; + int error; + + error = xfarray_load_sparse(xnc->nlinks, ino, &nl); + if (error) + return error; + + nl.flags |= (XCHK_NLINK_COMPARE_SCANNED | XCHK_NLINK_WRITTEN); + + error = xfarray_store(xnc->nlinks, ino, &nl); + if (error == -EFBIG) { + /* + * EFBIG means we tried to store data at too high a byte offset + * in the sparse array. IOWs, we cannot complete the check and + * must notify userspace that the check was incomplete. This + * shouldn't really happen outside of the collection phase. + */ + xchk_set_incomplete(xnc->sc); + return -ECANCELED; + } + if (error) + return error; + + /* Copy the counters, but do not expose the internal state. */ + obs->parents = nl.parents; + obs->backrefs = nl.backrefs; + obs->children = nl.children; + obs->flags = 0; + return 0; +} + +/* Check our link count against an inode. */ +STATIC int +xchk_nlinks_compare_inode( + struct xchk_nlink_ctrs *xnc, + struct xfs_inode *ip) +{ + struct xchk_nlink obs; + struct xfs_scrub *sc = xnc->sc; + uint64_t total_links; + unsigned int actual_nlink; + int error; + + xfs_ilock(ip, XFS_ILOCK_SHARED); + mutex_lock(&xnc->lock); + + if (xchk_iscan_aborted(&xnc->collect_iscan)) { + xchk_set_incomplete(xnc->sc); + error = -ECANCELED; + goto out_scanlock; + } + + error = xchk_nlinks_comparison_read(xnc, ip->i_ino, &obs); + if (error) + goto out_scanlock; + + /* + * If we don't have ftype to get an accurate count of the subdirectory + * entries in this directory, take advantage of the fact that on a + * consistent ftype=0 filesystem, the number of subdirectory + * backreferences (dotdot entries) pointing towards this directory + * should be equal to the number of subdirectory entries in the + * directory. + */ + if (!xfs_has_ftype(sc->mp) && S_ISDIR(VFS_I(ip)->i_mode)) + obs.children = obs.backrefs; + + total_links = xchk_nlink_total(ip, &obs); + actual_nlink = VFS_I(ip)->i_nlink; + + trace_xchk_nlinks_compare_inode(sc->mp, ip, &obs); + + /* + * If we found so many parents that we'd overflow i_nlink, we must flag + * this as a corruption. The VFS won't let users increase the link + * count, but it will let them decrease it. 
+ */ + if (total_links > XFS_MAXLINK) { + xchk_ino_set_corrupt(sc, ip->i_ino); + goto out_corrupt; + } + + /* Link counts should match. */ + if (total_links != actual_nlink) { + xchk_ino_set_corrupt(sc, ip->i_ino); + goto out_corrupt; + } + + if (S_ISDIR(VFS_I(ip)->i_mode) && actual_nlink > 0) { + /* + * The collection phase ignores directories with zero link + * count, so we ignore them here too. + * + * The number of subdirectory backreferences (dotdot entries) + * pointing towards this directory should be equal to the + * number of subdirectory entries in the directory. + */ + if (obs.children != obs.backrefs) + xchk_ino_xref_set_corrupt(sc, ip->i_ino); + } else { + /* + * Non-directories and unlinked directories should not have + * back references. + */ + if (obs.backrefs != 0) { + xchk_ino_set_corrupt(sc, ip->i_ino); + goto out_corrupt; + } + + /* + * Non-directories and unlinked directories should not have + * children. + */ + if (obs.children != 0) { + xchk_ino_set_corrupt(sc, ip->i_ino); + goto out_corrupt; + } + } + + if (ip == sc->mp->m_rootip) { + /* + * For the root of a directory tree, both the '.' and '..' + * entries should point to the root directory. The dotdot + * entry is counted as a parent of the root /and/ a backref of + * the root directory. + */ + if (obs.parents != 1) { + xchk_ino_set_corrupt(sc, ip->i_ino); + goto out_corrupt; + } + } else if (actual_nlink > 0) { + /* + * Linked files that are not the root directory should have at + * least one parent. + */ + if (obs.parents == 0) { + xchk_ino_set_corrupt(sc, ip->i_ino); + goto out_corrupt; + } + } + +out_corrupt: + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + error = -ECANCELED; +out_scanlock: + mutex_unlock(&xnc->lock); + xfs_iunlock(ip, XFS_ILOCK_SHARED); + return error; +} + +/* + * Check our link count against an inode that wasn't checked previously. This + * is intended to catch directories with dangling links, though we could be + * racing with inode allocation in other threads. + */ +STATIC int +xchk_nlinks_compare_inum( + struct xchk_nlink_ctrs *xnc, + xfs_ino_t ino) +{ + struct xchk_nlink obs; + struct xfs_mount *mp = xnc->sc->mp; + struct xfs_trans *tp = xnc->sc->tp; + struct xfs_buf *agi_bp; + struct xfs_inode *ip; + int error; + + /* + * The first iget failed, so try again with the variant that returns + * either an incore inode or the AGI buffer. If the function returns + * EINVAL/ENOENT, it should have passed us the AGI buffer so that we + * can guarantee that the inode won't be allocated while we check for + * a zero link count in the observed link count data. + */ + error = xchk_iget_agi(xnc->sc, ino, &agi_bp, &ip); + if (!error) { + /* Actually got an inode, so use the inode compare. */ + error = xchk_nlinks_compare_inode(xnc, ip); + xchk_irele(xnc->sc, ip); + return error; + } + if (error == -ENOENT || error == -EINVAL) { + /* No inode was found. Check for zero link count below. */ + error = 0; + } + if (error) + goto out_agi; + + /* Ensure that we have protected against inode allocation/freeing. */ + if (agi_bp == NULL) { + ASSERT(agi_bp != NULL); + xchk_set_incomplete(xnc->sc); + return -ECANCELED; + } + + if (xchk_iscan_aborted(&xnc->collect_iscan)) { + xchk_set_incomplete(xnc->sc); + error = -ECANCELED; + goto out_agi; + } + + mutex_lock(&xnc->lock); + error = xchk_nlinks_comparison_read(xnc, ino, &obs); + if (error) + goto out_scanlock; + + trace_xchk_nlinks_check_zero(mp, ino, &obs); + + /* + * If we can't grab the inode, the link count had better be zero. 
We + * still hold the AGI to prevent inode allocation/freeing. + */ + if (xchk_nlink_total(NULL, &obs) != 0) { + xchk_ino_set_corrupt(xnc->sc, ino); + error = -ECANCELED; + } + +out_scanlock: + mutex_unlock(&xnc->lock); +out_agi: + if (agi_bp) + xfs_trans_brelse(tp, agi_bp); + return error; +} + +/* + * Try to visit every inode in the filesystem to compare the link count. Move + * on if we can't grab an inode, since we'll revisit unchecked nlink records in + * the second part. + */ +static int +xchk_nlinks_compare_iter( + struct xchk_nlink_ctrs *xnc, + struct xfs_inode **ipp) +{ + int error; + + do { + error = xchk_iscan_iter(&xnc->compare_iscan, ipp); + } while (error == -EBUSY); + + return error; +} + +/* Compare the link counts we observed against the live information. */ +STATIC int +xchk_nlinks_compare( + struct xchk_nlink_ctrs *xnc) +{ + struct xchk_nlink nl; + struct xfs_scrub *sc = xnc->sc; + struct xfs_inode *ip; + xfarray_idx_t cur = XFARRAY_CURSOR_INIT; + int error; + + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return 0; + + /* + * Create a new empty transaction so that we can advance the iscan + * cursor without deadlocking if the inobt has a cycle and push on the + * inactivation workqueue. + */ + xchk_trans_cancel(sc); + error = xchk_trans_alloc_empty(sc); + if (error) + return error; + + /* + * Use the inobt to walk all allocated inodes to compare the link + * counts. Inodes skipped by _compare_iter will be tried again in the + * next phase of the scan. + */ + xchk_iscan_start(sc, 0, 0, &xnc->compare_iscan); + while ((error = xchk_nlinks_compare_iter(xnc, &ip)) == 1) { + error = xchk_nlinks_compare_inode(xnc, ip); + xchk_iscan_mark_visited(&xnc->compare_iscan, ip); + xchk_irele(sc, ip); + if (error) + break; + + if (xchk_should_terminate(sc, &error)) + break; + } + xchk_iscan_iter_finish(&xnc->compare_iscan); + xchk_iscan_teardown(&xnc->compare_iscan); + if (error) + return error; + + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return 0; + + /* + * Walk all the non-null nlink observations that weren't checked in the + * previous step. + */ + mutex_lock(&xnc->lock); + while ((error = xfarray_iter(xnc->nlinks, &cur, &nl)) == 1) { + xfs_ino_t ino = cur - 1; + + if (nl.flags & XCHK_NLINK_COMPARE_SCANNED) + continue; + + mutex_unlock(&xnc->lock); + + error = xchk_nlinks_compare_inum(xnc, ino); + if (error) + return error; + + if (xchk_should_terminate(xnc->sc, &error)) + return error; + + mutex_lock(&xnc->lock); + } + mutex_unlock(&xnc->lock); + + return error; +} + +/* Tear down everything associated with a nlinks check. */ +static void +xchk_nlinks_teardown_scan( + void *priv) +{ + struct xchk_nlink_ctrs *xnc = priv; + + xfarray_destroy(xnc->nlinks); + xnc->nlinks = NULL; + + xchk_iscan_teardown(&xnc->collect_iscan); + mutex_destroy(&xnc->lock); + xnc->sc = NULL; +} + +/* + * Scan all inodes in the entire filesystem to generate link count data. If + * the scan is successful, the counts will be left alive for a repair. If any + * error occurs, we'll tear everything down. + */ +STATIC int +xchk_nlinks_setup_scan( + struct xfs_scrub *sc, + struct xchk_nlink_ctrs *xnc) +{ + struct xfs_mount *mp = sc->mp; + char *descr; + unsigned long long max_inos; + xfs_agnumber_t last_agno = mp->m_sb.sb_agcount - 1; + xfs_agino_t first_agino, last_agino; + int error; + + ASSERT(xnc->sc == NULL); + xnc->sc = sc; + + mutex_init(&xnc->lock); + + /* Retry iget every tenth of a second for up to 30 seconds. 
*/ + xchk_iscan_start(sc, 30000, 100, &xnc->collect_iscan); + + /* + * Set up enough space to store an nlink record for the highest + * possible inode number in this system. + */ + xfs_agino_range(mp, last_agno, &first_agino, &last_agino); + max_inos = XFS_AGINO_TO_INO(mp, last_agno, last_agino) + 1; + descr = xchk_xfile_descr(sc, "file link counts"); + error = xfarray_create(descr, min(XFS_MAXINUMBER + 1, max_inos), + sizeof(struct xchk_nlink), &xnc->nlinks); + kfree(descr); + if (error) + goto out_teardown; + + /* Use deferred cleanup to pass the inode link count data to repair. */ + sc->buf_cleanup = xchk_nlinks_teardown_scan; + return 0; + +out_teardown: + xchk_nlinks_teardown_scan(xnc); + return error; +} + +/* Scrub the link count of all inodes on the filesystem. */ +int +xchk_nlinks( + struct xfs_scrub *sc) +{ + struct xchk_nlink_ctrs *xnc = sc->buf; + int error = 0; + + /* Set ourselves up to check link counts on the live filesystem. */ + error = xchk_nlinks_setup_scan(sc, xnc); + if (error) + return error; + + /* Walk all inodes, picking up link count information. */ + error = xchk_nlinks_collect(xnc); + if (!xchk_xref_process_error(sc, 0, 0, &error)) + return error; + + /* Fail fast if we're not playing with a full dataset. */ + if (xchk_iscan_aborted(&xnc->collect_iscan)) + xchk_set_incomplete(sc); + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE) + return 0; + + /* Compare link counts. */ + error = xchk_nlinks_compare(xnc); + if (!xchk_xref_process_error(sc, 0, 0, &error)) + return error; + + /* Check one last time for an incomplete dataset. */ + if (xchk_iscan_aborted(&xnc->collect_iscan)) + xchk_set_incomplete(sc); + + return 0; +} diff --git a/fs/xfs/scrub/nlinks.h b/fs/xfs/scrub/nlinks.h new file mode 100644 index 000000000000..69a3460c5e52 --- /dev/null +++ b/fs/xfs/scrub/nlinks.h @@ -0,0 +1,93 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (c) 2021-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong + */ +#ifndef __XFS_SCRUB_NLINKS_H__ +#define __XFS_SCRUB_NLINKS_H__ + +/* Live link count control structure. */ +struct xchk_nlink_ctrs { + struct xfs_scrub *sc; + + /* Shadow link count data and its mutex. */ + struct xfarray *nlinks; + struct mutex lock; + + /* + * The collection step uses a separate iscan context from the compare + * step because the collection iscan coordinates live updates to the + * observation data while this scanner is running. The compare iscan + * is secondary and can be reinitialized as needed. + */ + struct xchk_iscan collect_iscan; + struct xchk_iscan compare_iscan; +}; + +/* + * In-core link counts for a given inode in the filesystem. + * + * For an empty rootdir, the directory entries and the field to which they are + * accounted are as follows: + * + * Root directory: + * + * . points to self (root.child) + * .. points to self (root.parent) + * f1 points to a child file (f1.parent) + * d1 points to a child dir (d1.parent, root.child) + * + * Subdirectory d1: + * + * . points to self (d1.child) + * .. points to root dir (root.backref) + * f2 points to child file (f2.parent) + * f3 points to root.f1 (f1.parent) + * + * root.nlink == 3 (root.dot, root.dotdot, root.d1) + * d1.nlink == 2 (root.d1, d1.dot) + * f1.nlink == 2 (root.f1, d1.f3) + * f2.nlink == 1 (d1.f2) + */ +struct xchk_nlink { + /* Count of forward links from parent directories to this file. */ + xfs_nlink_t parents; + + /* + * Count of back links to this parent directory from child + * subdirectories. 
+ */ + xfs_nlink_t backrefs; + + /* + * Count of forward links from this directory to all child files and + * the number of dot entries. Should be zero for non-directories. + */ + xfs_nlink_t children; + + /* Record state flags */ + unsigned int flags; +}; + +/* + * This incore link count has been written at least once. We never want to + * store an xchk_nlink that looks uninitialized. + */ +#define XCHK_NLINK_WRITTEN (1U << 0) + +/* This data item was seen by the check-time compare function. */ +#define XCHK_NLINK_COMPARE_SCANNED (1U << 1) + +/* Compute total link count, using large enough variables to detect overflow. */ +static inline uint64_t +xchk_nlink_total(struct xfs_inode *ip, const struct xchk_nlink *live) +{ + uint64_t ret = live->parents; + + /* Add one link count for the dot entry of any linked directory. */ + if (ip && S_ISDIR(VFS_I(ip)->i_mode) && VFS_I(ip)->i_nlink) + ret++; + return ret + live->children; +} + +#endif /* __XFS_SCRUB_NLINKS_H__ */ diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index 9112c0985c62..8c60774d5f34 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -369,6 +369,12 @@ static const struct xchk_meta_ops meta_scrub_ops[] = { .scrub = xchk_quotacheck, .repair = xrep_quotacheck, }, + [XFS_SCRUB_TYPE_NLINKS] = { /* inode link counts */ + .type = ST_FS, + .setup = xchk_setup_nlinks, + .scrub = xchk_nlinks, + .repair = xrep_notsupported, + }, }; static int diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h index 5cd4550155f2..de6b45f99dd5 100644 --- a/fs/xfs/scrub/scrub.h +++ b/fs/xfs/scrub/scrub.h @@ -183,6 +183,7 @@ xchk_quotacheck(struct xfs_scrub *sc) } #endif int xchk_fscounters(struct xfs_scrub *sc); +int xchk_nlinks(struct xfs_scrub *sc); /* cross-referencing helpers */ void xchk_xref_is_used_space(struct xfs_scrub *sc, xfs_agblock_t agbno, diff --git a/fs/xfs/scrub/stats.c b/fs/xfs/scrub/stats.c index d716a432227b..b4ef1ebe28ab 100644 --- a/fs/xfs/scrub/stats.c +++ b/fs/xfs/scrub/stats.c @@ -78,6 +78,7 @@ static const char *name_map[XFS_SCRUB_TYPE_NR] = { [XFS_SCRUB_TYPE_PQUOTA] = "prjquota", [XFS_SCRUB_TYPE_FSCOUNTERS] = "fscounters", [XFS_SCRUB_TYPE_QUOTACHECK] = "quotacheck", + [XFS_SCRUB_TYPE_NLINKS] = "nlinks", }; /* Format the scrub stats into a text buffer, similar to pcp style. */ diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c index 5ed75cc33b92..2d5a330afe10 100644 --- a/fs/xfs/scrub/trace.c +++ b/fs/xfs/scrub/trace.c @@ -17,11 +17,13 @@ #include "xfs_quota.h" #include "xfs_quota_defs.h" #include "xfs_da_format.h" +#include "xfs_dir2.h" #include "scrub/scrub.h" #include "scrub/xfile.h" #include "scrub/xfarray.h" #include "scrub/quota.h" #include "scrub/iscan.h" +#include "scrub/nlinks.h" /* Figure out which block the btree cursor was pointing to. 
*/ static inline xfs_fsblock_t diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index fedcebf90a42..0deea8f18a30 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -23,6 +23,7 @@ struct xfarray; struct xfarray_sortinfo; struct xchk_dqiter; struct xchk_iscan; +struct xchk_nlink; /* * ftrace's __print_symbolic requires that all enum values be wrapped in the @@ -67,6 +68,7 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_GQUOTA); TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_PQUOTA); TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_FSCOUNTERS); TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_QUOTACHECK); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_NLINKS); #define XFS_SCRUB_TYPE_STRINGS \ { XFS_SCRUB_TYPE_PROBE, "probe" }, \ @@ -94,7 +96,8 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_QUOTACHECK); { XFS_SCRUB_TYPE_GQUOTA, "grpquota" }, \ { XFS_SCRUB_TYPE_PQUOTA, "prjquota" }, \ { XFS_SCRUB_TYPE_FSCOUNTERS, "fscounters" }, \ - { XFS_SCRUB_TYPE_QUOTACHECK, "quotacheck" } + { XFS_SCRUB_TYPE_QUOTACHECK, "quotacheck" }, \ + { XFS_SCRUB_TYPE_NLINKS, "nlinks" } #define XFS_SCRUB_FLAG_STRINGS \ { XFS_SCRUB_IFLAG_REPAIR, "repair" }, \ @@ -1318,6 +1321,148 @@ TRACE_EVENT(xchk_iscan_iget_retry_wait, __entry->retry_delay) ); +TRACE_EVENT(xchk_nlinks_collect_dirent, + TP_PROTO(struct xfs_mount *mp, struct xfs_inode *dp, + xfs_ino_t ino, const struct xfs_name *name), + TP_ARGS(mp, dp, ino, name), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, dir) + __field(xfs_ino_t, ino) + __field(unsigned int, namelen) + __dynamic_array(char, name, name->len) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->dir = dp->i_ino; + __entry->ino = ino; + __entry->namelen = name->len; + memcpy(__get_str(name), name->name, name->len); + ), + TP_printk("dev %d:%d dir 0x%llx -> ino 0x%llx name '%.*s'", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->dir, + __entry->ino, + __entry->namelen, + __get_str(name)) +); + +TRACE_EVENT(xchk_nlinks_collect_metafile, + TP_PROTO(struct xfs_mount *mp, xfs_ino_t ino), + TP_ARGS(mp, ino), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->ino = ino; + ), + TP_printk("dev %d:%d ino 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino) +); + +TRACE_EVENT(xchk_nlinks_check_zero, + TP_PROTO(struct xfs_mount *mp, xfs_ino_t ino, + const struct xchk_nlink *live), + TP_ARGS(mp, ino, live), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_nlink_t, parents) + __field(xfs_nlink_t, backrefs) + __field(xfs_nlink_t, children) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->ino = ino; + __entry->parents = live->parents; + __entry->backrefs = live->backrefs; + __entry->children = live->children; + ), + TP_printk("dev %d:%d ino 0x%llx parents %u backrefs %u children %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->parents, + __entry->backrefs, + __entry->children) +); + +TRACE_EVENT(xchk_nlinks_update_incore, + TP_PROTO(struct xfs_mount *mp, xfs_ino_t ino, + const struct xchk_nlink *live, int parents_delta, + int backrefs_delta, int children_delta), + TP_ARGS(mp, ino, live, parents_delta, backrefs_delta, children_delta), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_nlink_t, parents) + __field(xfs_nlink_t, backrefs) + __field(xfs_nlink_t, children) + __field(int, parents_delta) + __field(int, backrefs_delta) + __field(int, children_delta) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + 
__entry->ino = ino; + __entry->parents = live->parents; + __entry->backrefs = live->backrefs; + __entry->children = live->children; + __entry->parents_delta = parents_delta; + __entry->backrefs_delta = backrefs_delta; + __entry->children_delta = children_delta; + ), + TP_printk("dev %d:%d ino 0x%llx parents %d:%u backrefs %d:%u children %d:%u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->parents_delta, + __entry->parents, + __entry->backrefs_delta, + __entry->backrefs, + __entry->children_delta, + __entry->children) +); + +DECLARE_EVENT_CLASS(xchk_nlinks_diff_class, + TP_PROTO(struct xfs_mount *mp, struct xfs_inode *ip, + const struct xchk_nlink *live), + TP_ARGS(mp, ip, live), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(uint8_t, ftype) + __field(xfs_nlink_t, nlink) + __field(xfs_nlink_t, parents) + __field(xfs_nlink_t, backrefs) + __field(xfs_nlink_t, children) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->ino = ip->i_ino; + __entry->ftype = xfs_mode_to_ftype(VFS_I(ip)->i_mode); + __entry->nlink = VFS_I(ip)->i_nlink; + __entry->parents = live->parents; + __entry->backrefs = live->backrefs; + __entry->children = live->children; + ), + TP_printk("dev %d:%d ino 0x%llx ftype %s nlink %u parents %u backrefs %u children %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __print_symbolic(__entry->ftype, XFS_DIR3_FTYPE_STR), + __entry->nlink, + __entry->parents, + __entry->backrefs, + __entry->children) +); +#define DEFINE_SCRUB_NLINKS_DIFF_EVENT(name) \ +DEFINE_EVENT(xchk_nlinks_diff_class, name, \ + TP_PROTO(struct xfs_mount *mp, struct xfs_inode *ip, \ + const struct xchk_nlink *live), \ + TP_ARGS(mp, ip, live)) +DEFINE_SCRUB_NLINKS_DIFF_EVENT(xchk_nlinks_compare_inode); + /* repair tracepoints */ #if IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) -- cgit v1.2.3 From 86a1746eea91c6db983e6ccd3f846708746e47c2 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 22 Feb 2024 12:30:59 -0800 Subject: xfs: track directory entry updates during live nlinks fsck Create the necessary hooks in the directory operations (create/link/unlink/rename) code so that our live nlink scrub code can stay up to date with link count updates in the rest of the filesystem. This will be the means to keep our shadow link count information up to date while the scan runs in real time. In online fsck part 2, we'll use these same hooks to handle repairs to directories and parent pointer information. Signed-off-by: Darrick J. 
Wong Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/common.c | 3 ++ fs/xfs/scrub/nlinks.c | 93 ++++++++++++++++++++++++++++++++++++++- fs/xfs/scrub/nlinks.h | 6 +++ fs/xfs/scrub/scrub.c | 3 ++ fs/xfs/scrub/scrub.h | 4 +- fs/xfs/scrub/trace.h | 33 ++++++++++++++ fs/xfs/xfs_inode.c | 117 ++++++++++++++++++++++++++++++++++++++++++++++++++ fs/xfs/xfs_inode.h | 31 +++++++++++++ fs/xfs/xfs_mount.h | 3 ++ fs/xfs/xfs_super.c | 2 + fs/xfs/xfs_symlink.c | 1 + 11 files changed, 293 insertions(+), 3 deletions(-) diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index c5a6c47d3df2..699092195f41 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -1302,6 +1302,9 @@ xchk_fsgates_enable( if (scrub_fsgates & XCHK_FSGATES_QUOTA) xfs_dqtrx_hook_enable(); + if (scrub_fsgates & XCHK_FSGATES_DIRENTS) + xfs_dir_hook_enable(); + sc->flags |= scrub_fsgates; } diff --git a/fs/xfs/scrub/nlinks.c b/fs/xfs/scrub/nlinks.c index c899a50a83da..341ab737f9c5 100644 --- a/fs/xfs/scrub/nlinks.c +++ b/fs/xfs/scrub/nlinks.c @@ -43,8 +43,7 @@ int xchk_setup_nlinks( struct xfs_scrub *sc) { - /* Not ready for general consumption yet. */ - return -EOPNOTSUPP; + xchk_fsgates_enable(sc, XCHK_FSGATES_DIRENTS); sc->buf = kzalloc(sizeof(struct xchk_nlink_ctrs), XCHK_GFP_FLAGS); if (!sc->buf) @@ -63,6 +62,21 @@ xchk_setup_nlinks( * must be taken with certain errno values (i.e. EFSBADCRC, EFSCORRUPTED, * ECANCELED) that are absorbed into a scrub state flag update by * xchk_*_process_error. + * + * Because we are scanning a live filesystem, it's possible that another thread + * will try to update the link counts for an inode that we've already scanned. + * This will cause our counts to be incorrect. Therefore, we hook all + * directory entry updates because that is when link count updates occur. By + * shadowing transaction updates in this manner, live nlink check can ensure by + * locking the inode and the shadow structure that its own copies are not out + * of date. Because the hook code runs in a different process context from the + * scrub code and the scrub state flags are not accessed atomically, failures + * in the hook code must abort the iscan and the scrubber must notice the + * aborted scan and set the incomplete flag. + * + * Note that we use jump labels and srcu notifier hooks to minimize the + * overhead when live nlinks is /not/ running. Locking order for nlink + * observations is inode ILOCK -> iscan_lock/xchk_nlink_ctrs lock. */ /* @@ -120,6 +134,63 @@ xchk_nlinks_update_incore( return error; } +/* + * Apply a link count change from the regular filesystem into our shadow link + * count structure based on a directory update in progress. + */ +STATIC int +xchk_nlinks_live_update( + struct notifier_block *nb, + unsigned long action, + void *data) +{ + struct xfs_dir_update_params *p = data; + struct xchk_nlink_ctrs *xnc; + int error; + + xnc = container_of(nb, struct xchk_nlink_ctrs, dhook.dirent_hook.nb); + + trace_xchk_nlinks_live_update(xnc->sc->mp, p->dp, action, p->ip->i_ino, + p->delta, p->name->name, p->name->len); + + /* + * If we've already scanned @dp, update the number of parents that link + * to @ip. If @ip is a subdirectory, update the number of child links + * going out of @dp. 
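+ *
+ * For example, a mkdir of a/b arriving here with delta == +1 adds one
+ * to b's parent count and one to a's child count; the similarly-gated
+ * branch below then adds one to a's backref count.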
+ */ + if (xchk_iscan_want_live_update(&xnc->collect_iscan, p->dp->i_ino)) { + mutex_lock(&xnc->lock); + error = xchk_nlinks_update_incore(xnc, p->ip->i_ino, p->delta, + 0, 0); + if (!error && S_ISDIR(VFS_IC(p->ip)->i_mode)) + error = xchk_nlinks_update_incore(xnc, p->dp->i_ino, 0, + 0, p->delta); + mutex_unlock(&xnc->lock); + if (error) + goto out_abort; + } + + /* + * If @ip is a subdirectory and we've already scanned it, update the + * number of backrefs pointing to @dp. + */ + if (S_ISDIR(VFS_IC(p->ip)->i_mode) && + xchk_iscan_want_live_update(&xnc->collect_iscan, p->ip->i_ino)) { + mutex_lock(&xnc->lock); + error = xchk_nlinks_update_incore(xnc, p->dp->i_ino, 0, + p->delta, 0); + mutex_unlock(&xnc->lock); + if (error) + goto out_abort; + } + + return NOTIFY_DONE; + +out_abort: + xchk_iscan_abort(&xnc->collect_iscan); + return NOTIFY_DONE; +} + /* Bump the observed link count for the inode referenced by this entry. */ STATIC int xchk_nlinks_collect_dirent( @@ -747,6 +818,11 @@ xchk_nlinks_teardown_scan( { struct xchk_nlink_ctrs *xnc = priv; + /* Discourage any hook functions that might be running. */ + xchk_iscan_abort(&xnc->collect_iscan); + + xfs_dir_hook_del(xnc->sc->mp, &xnc->dhook); + xfarray_destroy(xnc->nlinks); xnc->nlinks = NULL; @@ -793,6 +869,19 @@ xchk_nlinks_setup_scan( if (error) goto out_teardown; + /* + * Hook into the directory entry code so that we can capture updates to + * file link counts. The hook only triggers for inodes that were + * already scanned, and the scanner thread takes each inode's ILOCK, + * which means that any in-progress inode updates will finish before we + * can scan the inode. + */ + ASSERT(sc->flags & XCHK_FSGATES_DIRENTS); + xfs_dir_hook_setup(&xnc->dhook, xchk_nlinks_live_update); + error = xfs_dir_hook_add(mp, &xnc->dhook); + if (error) + goto out_teardown; + /* Use deferred cleanup to pass the inode link count data to repair. */ sc->buf_cleanup = xchk_nlinks_teardown_scan; return 0; diff --git a/fs/xfs/scrub/nlinks.h b/fs/xfs/scrub/nlinks.h index 69a3460c5e52..ea6408e1c183 100644 --- a/fs/xfs/scrub/nlinks.h +++ b/fs/xfs/scrub/nlinks.h @@ -22,6 +22,12 @@ struct xchk_nlink_ctrs { */ struct xchk_iscan collect_iscan; struct xchk_iscan compare_iscan; + + /* + * Hook into directory updates so that we can receive live updates + * from other writer threads. + */ + struct xfs_dir_hook dhook; }; /* diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index 8c60774d5f34..883c47b6c686 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -160,6 +160,9 @@ xchk_fsgates_disable( if (sc->flags & XCHK_FSGATES_QUOTA) xfs_dqtrx_hook_disable(); + if (sc->flags & XCHK_FSGATES_DIRENTS) + xfs_dir_hook_disable(); + sc->flags &= ~XCHK_FSGATES_ALL; } diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h index de6b45f99dd5..f99a3c21d02e 100644 --- a/fs/xfs/scrub/scrub.h +++ b/fs/xfs/scrub/scrub.h @@ -122,6 +122,7 @@ struct xfs_scrub { #define XCHK_FSGATES_DRAIN (1U << 2) /* defer ops draining enabled */ #define XCHK_NEED_DRAIN (1U << 3) /* scrub needs to drain defer ops */ #define XCHK_FSGATES_QUOTA (1U << 4) /* quota live update enabled */ +#define XCHK_FSGATES_DIRENTS (1U << 5) /* directory live update enabled */ #define XREP_RESET_PERAG_RESV (1U << 30) /* must reset AG space reservation */ #define XREP_ALREADY_FIXED (1U << 31) /* checking our repair work */ @@ -132,7 +133,8 @@ struct xfs_scrub { * must be enabled during scrub setup and can only be torn down afterwards. 
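 *
 * A sketch of the lifecycle for one of these gates (illustrative; the
 * scan body is elided):
 *
 *	xchk_fsgates_enable(sc, XCHK_FSGATES_DIRENTS);	<- setup; key on
 *	...scan runs; xfs_dir_update_hook() feeds the scrubber...
 *	(scrub teardown calls xchk_fsgates_disable())	<- key off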
*/ #define XCHK_FSGATES_ALL (XCHK_FSGATES_DRAIN | \ - XCHK_FSGATES_QUOTA) + XCHK_FSGATES_QUOTA | \ + XCHK_FSGATES_DIRENTS) /* Metadata scrubbers */ int xchk_tester(struct xfs_scrub *sc); diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index 0deea8f18a30..9512170ea9a7 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -116,6 +116,7 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_NLINKS); { XCHK_FSGATES_DRAIN, "fsgates_drain" }, \ { XCHK_NEED_DRAIN, "need_drain" }, \ { XCHK_FSGATES_QUOTA, "fsgates_quota" }, \ + { XCHK_FSGATES_DIRENTS, "fsgates_dirents" }, \ { XREP_RESET_PERAG_RESV, "reset_perag_resv" }, \ { XREP_ALREADY_FIXED, "already_fixed" } @@ -1363,6 +1364,38 @@ TRACE_EVENT(xchk_nlinks_collect_metafile, __entry->ino) ); +TRACE_EVENT(xchk_nlinks_live_update, + TP_PROTO(struct xfs_mount *mp, const struct xfs_inode *dp, + int action, xfs_ino_t ino, int delta, + const char *name, unsigned int namelen), + TP_ARGS(mp, dp, action, ino, delta, name, namelen), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, dir) + __field(int, action) + __field(xfs_ino_t, ino) + __field(int, delta) + __field(unsigned int, namelen) + __dynamic_array(char, name, namelen) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->dir = dp ? dp->i_ino : NULLFSINO; + __entry->action = action; + __entry->ino = ino; + __entry->delta = delta; + __entry->namelen = namelen; + memcpy(__get_str(name), name, namelen); + ), + TP_printk("dev %d:%d dir 0x%llx ino 0x%llx nlink_delta %d name '%.*s'", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->dir, + __entry->ino, + __entry->delta, + __entry->namelen, + __get_str(name)) +); + TRACE_EVENT(xchk_nlinks_check_zero, TP_PROTO(struct xfs_mount *mp, xfs_ino_t ino, const struct xchk_nlink *live), diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index d6635d219527..e8845287debd 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -925,6 +925,81 @@ xfs_bumplink( xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); } +#ifdef CONFIG_XFS_LIVE_HOOKS +/* + * Use a static key here to reduce the overhead of directory live update hooks. + * If the compiler supports jump labels, the static branch will be replaced by + * a nop sled when there are no hook users. Online fsck is currently the only + * caller, so this is a reasonable tradeoff. + * + * Note: Patching the kernel code requires taking the cpu hotplug lock. Other + * parts of the kernel allocate memory with that lock held, which means that + * XFS callers cannot hold any locks that might be used by memory reclaim or + * writeback when calling the static_branch_{inc,dec} functions. + */ +DEFINE_STATIC_XFS_HOOK_SWITCH(xfs_dir_hooks_switch); + +void +xfs_dir_hook_disable(void) +{ + xfs_hooks_switch_off(&xfs_dir_hooks_switch); +} + +void +xfs_dir_hook_enable(void) +{ + xfs_hooks_switch_on(&xfs_dir_hooks_switch); +} + +/* Call hooks for a directory update relating to a child dirent update. */ +inline void +xfs_dir_update_hook( + struct xfs_inode *dp, + struct xfs_inode *ip, + int delta, + const struct xfs_name *name) +{ + if (xfs_hooks_switched_on(&xfs_dir_hooks_switch)) { + struct xfs_dir_update_params p = { + .dp = dp, + .ip = ip, + .delta = delta, + .name = name, + }; + struct xfs_mount *mp = ip->i_mount; + + xfs_hooks_call(&mp->m_dir_update_hooks, 0, &p); + } +} + +/* Call the specified function during a directory update. 
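+ * A minimal client looks something like this (sketch; the notifier
+ * function name is hypothetical):
+ *
+ *	xfs_dir_hook_setup(&hook, my_dirent_notifier);
+ *	error = xfs_dir_hook_add(mp, &hook);
+ *	...my_dirent_notifier() receives struct xfs_dir_update_params...
+ *	xfs_dir_hook_del(mp, &hook);
+ *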
*/ +int +xfs_dir_hook_add( + struct xfs_mount *mp, + struct xfs_dir_hook *hook) +{ + return xfs_hooks_add(&mp->m_dir_update_hooks, &hook->dirent_hook); +} + +/* Stop calling the specified function during a directory update. */ +void +xfs_dir_hook_del( + struct xfs_mount *mp, + struct xfs_dir_hook *hook) +{ + xfs_hooks_del(&mp->m_dir_update_hooks, &hook->dirent_hook); +} + +/* Configure directory update hook functions. */ +void +xfs_dir_hook_setup( + struct xfs_dir_hook *hook, + notifier_fn_t mod_fn) +{ + xfs_hook_setup(&hook->dirent_hook, mod_fn); +} +#endif /* CONFIG_XFS_LIVE_HOOKS */ + int xfs_create( struct mnt_idmap *idmap, @@ -1035,6 +1110,12 @@ xfs_create( xfs_bumplink(tp, dp); } + /* + * Create ip with a reference from dp, and add '.' and '..' references + * if it's a directory. + */ + xfs_dir_update_hook(dp, ip, 1, name); + /* * If this is a synchronous mount, make sure that the * create transaction goes to disk before returning to @@ -1249,6 +1330,7 @@ xfs_link( xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE); xfs_bumplink(tp, sip); + xfs_dir_update_hook(tdp, sip, 1, target_name); /* * If this is a synchronous mount, make sure that the @@ -2562,6 +2644,12 @@ xfs_remove( goto out_trans_cancel; } + /* + * Drop the link from dp to ip, and if ip was a directory, remove the + * '.' and '..' references since we freed the directory. + */ + xfs_dir_update_hook(dp, ip, -1, name); + /* * If this is a synchronous mount, make sure that the * remove transaction goes to disk before returning to @@ -2752,6 +2840,20 @@ xfs_cross_rename( } xfs_trans_ichgtime(tp, dp1, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); xfs_trans_log_inode(tp, dp1, XFS_ILOG_CORE); + + /* + * Inform our hook clients that we've finished an exchange operation as + * follows: removed the source and target files from their directories; + * added the target to the source directory; and added the source to + * the target directory. All inodes are locked, so it's ok to model a + * rename this way so long as we say we deleted entries before we add + * new ones. + */ + xfs_dir_update_hook(dp1, ip1, -1, name1); + xfs_dir_update_hook(dp2, ip2, -1, name2); + xfs_dir_update_hook(dp1, ip2, 1, name1); + xfs_dir_update_hook(dp2, ip1, 1, name2); + return xfs_finish_rename(tp); out_trans_abort: @@ -3135,6 +3237,21 @@ retry: if (new_parent) xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE); + /* + * Inform our hook clients that we've finished a rename operation as + * follows: removed the source and target files from their directories; + * that we've added the source to the target directory; and finally + * that we've added the whiteout, if there was one. All inodes are + * locked, so it's ok to model a rename this way so long as we say we + * deleted entries before we add new ones. 
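+ * Concretely: the victim @target_ip loses one parent; @src_ip's
+ * parent count is unchanged on net (-1 under @src_dp, +1 under
+ * @target_dp); and a whiteout inode gains its first parent link.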
+ */ + if (target_ip) + xfs_dir_update_hook(target_dp, target_ip, -1, target_name); + xfs_dir_update_hook(src_dp, src_ip, -1, src_name); + xfs_dir_update_hook(target_dp, src_ip, 1, target_name); + if (wip) + xfs_dir_update_hook(src_dp, wip, 1, src_name); + error = xfs_finish_rename(tp); if (wip) xfs_irele(wip); diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 7bbdc7009e7d..ab46ffb3ac19 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -171,6 +171,12 @@ static inline struct inode *VFS_I(struct xfs_inode *ip) return &ip->i_vnode; } +/* convert from const xfs inode to const vfs inode */ +static inline const struct inode *VFS_IC(const struct xfs_inode *ip) +{ + return &ip->i_vnode; +} + /* * For regular files we only update the on-disk filesize when actually * writing data back to disk. Until then only the copy in the VFS inode @@ -626,4 +632,29 @@ bool xfs_ifork_zapped(const struct xfs_inode *ip, int whichfork); void xfs_inode_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip, xfs_filblks_t *dblocks, xfs_filblks_t *rblocks); +struct xfs_dir_update_params { + const struct xfs_inode *dp; + const struct xfs_inode *ip; + const struct xfs_name *name; + int delta; +}; + +#ifdef CONFIG_XFS_LIVE_HOOKS +void xfs_dir_update_hook(struct xfs_inode *dp, struct xfs_inode *ip, + int delta, const struct xfs_name *name); + +struct xfs_dir_hook { + struct xfs_hook dirent_hook; +}; + +void xfs_dir_hook_disable(void); +void xfs_dir_hook_enable(void); + +int xfs_dir_hook_add(struct xfs_mount *mp, struct xfs_dir_hook *hook); +void xfs_dir_hook_del(struct xfs_mount *mp, struct xfs_dir_hook *hook); +void xfs_dir_hook_setup(struct xfs_dir_hook *hook, notifier_fn_t mod_fn); +#else +# define xfs_dir_update_hook(dp, ip, delta, name) ((void)0) +#endif /* CONFIG_XFS_LIVE_HOOKS */ + #endif /* __XFS_INODE_H__ */ diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 503fe3c7edbf..e86dfe67894f 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -252,6 +252,9 @@ typedef struct xfs_mount { /* cpus that have inodes queued for inactivation */ struct cpumask m_inodegc_cpumask; + + /* Hook to feed dirent updates to an active online repair. */ + struct xfs_hooks m_dir_update_hooks; } xfs_mount_t; #define M_IGEO(mp) (&(mp)->m_ino_geo) diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index b31652fa7004..74e87ed5eee1 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -2011,6 +2011,8 @@ static int xfs_init_fs_context( mp->m_logbsize = -1; mp->m_allocsize_log = 16; /* 64k */ + xfs_hooks_init(&mp->m_dir_update_hooks); + fc->s_fs_info = mp; fc->ops = &xfs_context_ops; diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index c2dc8c501bdc..e73692fbe179 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -322,6 +322,7 @@ xfs_symlink( goto out_trans_cancel; xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); + xfs_dir_update_hook(dp, ip, 1, link_name); /* * If this is a synchronous mount, make sure that the -- cgit v1.2.3 From 6b631c60c90a1e5264bc9bcdca2c0adb492d7a62 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 22 Feb 2024 12:31:00 -0800 Subject: xfs: teach repair to fix file nlinks Fix the file link counts since we just computed the correct ones. Signed-off-by: Darrick J. 
Wong Reviewed-by: Christoph Hellwig --- fs/xfs/Makefile | 1 + fs/xfs/scrub/nlinks.c | 4 +- fs/xfs/scrub/nlinks.h | 5 +- fs/xfs/scrub/nlinks_repair.c | 223 +++++++++++++++++++++++++++++++++++++++++++ fs/xfs/scrub/repair.h | 2 + fs/xfs/scrub/scrub.c | 2 +- fs/xfs/scrub/trace.h | 3 + 7 files changed, 237 insertions(+), 3 deletions(-) create mode 100644 fs/xfs/scrub/nlinks_repair.c diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index e53a26b58046..253744092915 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -194,6 +194,7 @@ xfs-y += $(addprefix scrub/, \ ialloc_repair.o \ inode_repair.o \ newbt.o \ + nlinks_repair.o \ reap.o \ refcount_repair.o \ repair.o \ diff --git a/fs/xfs/scrub/nlinks.c b/fs/xfs/scrub/nlinks.c index 341ab737f9c5..8a7d9557897c 100644 --- a/fs/xfs/scrub/nlinks.c +++ b/fs/xfs/scrub/nlinks.c @@ -61,7 +61,9 @@ xchk_setup_nlinks( * set the INCOMPLETE flag even when a negative errno is returned. This care * must be taken with certain errno values (i.e. EFSBADCRC, EFSCORRUPTED, * ECANCELED) that are absorbed into a scrub state flag update by - * xchk_*_process_error. + * xchk_*_process_error. Scrub and repair share the same incore data + * structures, so the INCOMPLETE flag is critical to prevent a repair based on + * insufficient information. * * Because we are scanning a live filesystem, it's possible that another thread * will try to update the link counts for an inode that we've already scanned. diff --git a/fs/xfs/scrub/nlinks.h b/fs/xfs/scrub/nlinks.h index ea6408e1c183..a950f3daf204 100644 --- a/fs/xfs/scrub/nlinks.h +++ b/fs/xfs/scrub/nlinks.h @@ -81,9 +81,12 @@ struct xchk_nlink { */ #define XCHK_NLINK_WRITTEN (1U << 0) -/* This data item was seen by the check-time compare function. */ +/* Already checked this link count record. */ #define XCHK_NLINK_COMPARE_SCANNED (1U << 1) +/* Already made a repair with this link count record. */ +#define XREP_NLINK_DIRTY (1U << 2) + /* Compute total link count, using large enough variables to detect overflow. */ static inline uint64_t xchk_nlink_total(struct xfs_inode *ip, const struct xchk_nlink *live) diff --git a/fs/xfs/scrub/nlinks_repair.c b/fs/xfs/scrub/nlinks_repair.c new file mode 100644 index 000000000000..b87618322f55 --- /dev/null +++ b/fs/xfs/scrub/nlinks_repair.c @@ -0,0 +1,223 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2021-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_inode.h" +#include "xfs_icache.h" +#include "xfs_bmap_util.h" +#include "xfs_iwalk.h" +#include "xfs_ialloc.h" +#include "xfs_sb.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/repair.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" +#include "scrub/iscan.h" +#include "scrub/nlinks.h" +#include "scrub/trace.h" + +/* + * Live Inode Link Count Repair + * ============================ + * + * Use the live inode link count information that we collected to replace the + * nlink values of the incore inodes. A scrub->repair cycle should have left + * the live data and hooks active, so this is safe so long as we make sure the + * inode is locked. + */ + +/* + * Correct the link count of the given inode. Because we have to grab locks + * and resources in a certain order, it's possible that this will be a no-op. 
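+ *
+ * The order used below: take the IOLOCK, allocate a transaction, take
+ * the ILOCK, then hold the xchk_nlink_ctrs mutex only long enough to
+ * copy out the shadow record.  The ILOCK then pins the link count until
+ * the transaction commits or cancels.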
+ */ +STATIC int +xrep_nlinks_repair_inode( + struct xchk_nlink_ctrs *xnc) +{ + struct xchk_nlink obs; + struct xfs_scrub *sc = xnc->sc; + struct xfs_mount *mp = sc->mp; + struct xfs_inode *ip = sc->ip; + uint64_t total_links; + uint64_t actual_nlink; + bool dirty = false; + int error; + + xchk_ilock(sc, XFS_IOLOCK_EXCL); + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, 0, 0, 0, &sc->tp); + if (error) + return error; + + xchk_ilock(sc, XFS_ILOCK_EXCL); + xfs_trans_ijoin(sc->tp, ip, 0); + + mutex_lock(&xnc->lock); + + if (xchk_iscan_aborted(&xnc->collect_iscan)) { + error = -ECANCELED; + goto out_scanlock; + } + + error = xfarray_load_sparse(xnc->nlinks, ip->i_ino, &obs); + if (error) + goto out_scanlock; + + /* + * We're done accessing the shared scan data, so we can drop the lock. + * We still hold @ip's ILOCK, so its link count cannot change. + */ + mutex_unlock(&xnc->lock); + + total_links = xchk_nlink_total(ip, &obs); + actual_nlink = VFS_I(ip)->i_nlink; + + /* + * Non-directories cannot have directories pointing up to them. + * + * We previously set error to zero, but set it again because one static + * checker author fears that programmers will fail to maintain this + * invariant and built their tool to flag this as a security risk. A + * different tool author made their bot complain about the redundant + * store. This is a never-ending and stupid battle; both tools missed + * *actual bugs* elsewhere; and I no longer care. + */ + if (!S_ISDIR(VFS_I(ip)->i_mode) && obs.children != 0) { + trace_xrep_nlinks_unfixable_inode(mp, ip, &obs); + error = 0; + goto out_trans; + } + + /* + * We did not find any links to this inode. If the inode agrees, we + * have nothing further to do. If not, the inode has a nonzero link + * count and we don't have anywhere to graft the child onto. Dropping + * a live inode's link count to zero can cause unexpected shutdowns in + * inactivation, so leave it alone. + */ + if (total_links == 0) { + if (actual_nlink != 0) + trace_xrep_nlinks_unfixable_inode(mp, ip, &obs); + goto out_trans; + } + + /* Commit the new link count if it changed. */ + if (total_links != actual_nlink) { + if (total_links > XFS_MAXLINK) { + trace_xrep_nlinks_unfixable_inode(mp, ip, &obs); + goto out_trans; + } + + trace_xrep_nlinks_update_inode(mp, ip, &obs); + + set_nlink(VFS_I(ip), total_links); + dirty = true; + } + + if (!dirty) { + error = 0; + goto out_trans; + } + + xfs_trans_log_inode(sc->tp, ip, XFS_ILOG_CORE); + + error = xrep_trans_commit(sc); + xchk_iunlock(sc, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); + return error; + +out_scanlock: + mutex_unlock(&xnc->lock); +out_trans: + xchk_trans_cancel(sc); + xchk_iunlock(sc, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); + return error; +} + +/* + * Try to visit every inode in the filesystem for repairs. Move on if we can't + * grab an inode, since we're still making forward progress. + */ +static int +xrep_nlinks_iter( + struct xchk_nlink_ctrs *xnc, + struct xfs_inode **ipp) +{ + int error; + + do { + error = xchk_iscan_iter(&xnc->compare_iscan, ipp); + } while (error == -EBUSY); + + return error; +} + +/* Commit the new inode link counters. */ +int +xrep_nlinks( + struct xfs_scrub *sc) +{ + struct xchk_nlink_ctrs *xnc = sc->buf; + int error; + + /* + * We need ftype for an accurate count of the number of child + * subdirectory links. Child subdirectories with a back link (dotdot + * entry) but no forward link are unfixable, so we cannot repair the + * link count of the parent directory based on the back link count + * alone. 
Filesystems without ftype support are rare (old V4) so we + * just skip out here. + */ + if (!xfs_has_ftype(sc->mp)) + return -EOPNOTSUPP; + + /* + * Use the inobt to walk all allocated inodes to compare and fix the + * link counts. Retry iget every tenth of a second for up to 30 + * seconds -- even if repair misses a few inodes, we still try to fix + * as many of them as we can. + */ + xchk_iscan_start(sc, 30000, 100, &xnc->compare_iscan); + ASSERT(sc->ip == NULL); + + while ((error = xrep_nlinks_iter(xnc, &sc->ip)) == 1) { + /* + * Commit the scrub transaction so that we can create repair + * transactions with the correct reservations. + */ + xchk_trans_cancel(sc); + + error = xrep_nlinks_repair_inode(xnc); + xchk_iscan_mark_visited(&xnc->compare_iscan, sc->ip); + xchk_irele(sc, sc->ip); + sc->ip = NULL; + if (error) + break; + + if (xchk_should_terminate(sc, &error)) + break; + + /* + * Create a new empty transaction so that we can advance the + * iscan cursor without deadlocking if the inobt has a cycle. + * We can only push the inactivation workqueues with an empty + * transaction. + */ + error = xchk_trans_alloc_empty(sc); + if (error) + break; + } + xchk_iscan_iter_finish(&xnc->compare_iscan); + xchk_iscan_teardown(&xnc->compare_iscan); + + return error; +} diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h index fdfa06699921..8edac0150e96 100644 --- a/fs/xfs/scrub/repair.h +++ b/fs/xfs/scrub/repair.h @@ -116,6 +116,7 @@ int xrep_inode(struct xfs_scrub *sc); int xrep_bmap_data(struct xfs_scrub *sc); int xrep_bmap_attr(struct xfs_scrub *sc); int xrep_bmap_cow(struct xfs_scrub *sc); +int xrep_nlinks(struct xfs_scrub *sc); #ifdef CONFIG_XFS_RT int xrep_rtbitmap(struct xfs_scrub *sc); @@ -196,6 +197,7 @@ xrep_setup_nothing( #define xrep_rtbitmap xrep_notsupported #define xrep_quota xrep_notsupported #define xrep_quotacheck xrep_notsupported +#define xrep_nlinks xrep_notsupported #endif /* CONFIG_XFS_ONLINE_REPAIR */ diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index 883c47b6c686..c0b99184bb3e 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -376,7 +376,7 @@ static const struct xchk_meta_ops meta_scrub_ops[] = { .type = ST_FS, .setup = xchk_setup_nlinks, .scrub = xchk_nlinks, - .repair = xrep_notsupported, + .repair = xrep_nlinks, }, }; diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index 9512170ea9a7..c9b6b0e0bf11 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -2185,6 +2185,9 @@ DEFINE_XREP_DQUOT_EVENT(xrep_dquot_item_fill_bmap_hole); DEFINE_XREP_DQUOT_EVENT(xrep_quotacheck_dquot); #endif /* CONFIG_XFS_QUOTA */ +DEFINE_SCRUB_NLINKS_DIFF_EVENT(xrep_nlinks_update_inode); +DEFINE_SCRUB_NLINKS_DIFF_EVENT(xrep_nlinks_unfixable_inode); + #endif /* IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) */ #endif /* _TRACE_XFS_SCRUB_TRACE_H */ -- cgit v1.2.3 From 0b8686f19879d896bbe2d3e893f433a08160452d Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 22 Feb 2024 12:31:01 -0800 Subject: xfs: separate the marking of sick and checked metadata Split the setting of the sick and checked masks into separate functions as part of preparing to add the ability for regular runtime fs code (i.e. not scrub) to mark metadata structures sick when corruptions are found. Improve the documentation of libxfs' requirements for helper behavior. Signed-off-by: Darrick J. 
Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_health.h | 23 +++++++++++++++- fs/xfs/scrub/health.c | 8 +++--- fs/xfs/xfs_health.c | 65 ++++++++++++++++++++++++++++++++++++++++++++++ fs/xfs/xfs_trace.h | 4 +++ 4 files changed, 95 insertions(+), 5 deletions(-) diff --git a/fs/xfs/libxfs/xfs_health.h b/fs/xfs/libxfs/xfs_health.h index 2bfe2dc404a1..bec7adf9fcf7 100644 --- a/fs/xfs/libxfs/xfs_health.h +++ b/fs/xfs/libxfs/xfs_health.h @@ -111,24 +111,45 @@ struct xfs_fsop_geom; XFS_SICK_INO_DIR_ZAPPED | \ XFS_SICK_INO_SYMLINK_ZAPPED) -/* These functions must be provided by the xfs implementation. */ +/* + * These functions must be provided by the xfs implementation. Function + * behavior with respect to the first argument should be as follows: + * + * xfs_*_mark_sick: Set the sick flags and do not set checked flags. + * Runtime code should call this upon encountering + * a corruption. + * + * xfs_*_mark_corrupt: Set the sick and checked flags simultaneously. + * Fsck tools should call this when corruption is + * found. + * + * xfs_*_mark_healthy: Clear the sick flags and set the checked flags. + * Fsck tools should call this after correcting errors. + * + * xfs_*_measure_sickness: Return the sick and check status in the provided + * out parameters. + */ void xfs_fs_mark_sick(struct xfs_mount *mp, unsigned int mask); +void xfs_fs_mark_corrupt(struct xfs_mount *mp, unsigned int mask); void xfs_fs_mark_healthy(struct xfs_mount *mp, unsigned int mask); void xfs_fs_measure_sickness(struct xfs_mount *mp, unsigned int *sick, unsigned int *checked); void xfs_rt_mark_sick(struct xfs_mount *mp, unsigned int mask); +void xfs_rt_mark_corrupt(struct xfs_mount *mp, unsigned int mask); void xfs_rt_mark_healthy(struct xfs_mount *mp, unsigned int mask); void xfs_rt_measure_sickness(struct xfs_mount *mp, unsigned int *sick, unsigned int *checked); void xfs_ag_mark_sick(struct xfs_perag *pag, unsigned int mask); +void xfs_ag_mark_corrupt(struct xfs_perag *pag, unsigned int mask); void xfs_ag_mark_healthy(struct xfs_perag *pag, unsigned int mask); void xfs_ag_measure_sickness(struct xfs_perag *pag, unsigned int *sick, unsigned int *checked); void xfs_inode_mark_sick(struct xfs_inode *ip, unsigned int mask); +void xfs_inode_mark_corrupt(struct xfs_inode *ip, unsigned int mask); void xfs_inode_mark_healthy(struct xfs_inode *ip, unsigned int mask); void xfs_inode_measure_sickness(struct xfs_inode *ip, unsigned int *sick, unsigned int *checked); diff --git a/fs/xfs/scrub/health.c b/fs/xfs/scrub/health.c index 34519fbc2d40..c682c7093353 100644 --- a/fs/xfs/scrub/health.c +++ b/fs/xfs/scrub/health.c @@ -175,7 +175,7 @@ xchk_update_health( case XHG_AG: pag = xfs_perag_get(sc->mp, sc->sm->sm_agno); if (bad) - xfs_ag_mark_sick(pag, sc->sick_mask); + xfs_ag_mark_corrupt(pag, sc->sick_mask); else xfs_ag_mark_healthy(pag, sc->sick_mask); xfs_perag_put(pag); @@ -184,19 +184,19 @@ xchk_update_health( if (!sc->ip) return; if (bad) - xfs_inode_mark_sick(sc->ip, sc->sick_mask); + xfs_inode_mark_corrupt(sc->ip, sc->sick_mask); else xfs_inode_mark_healthy(sc->ip, sc->sick_mask); break; case XHG_FS: if (bad) - xfs_fs_mark_sick(sc->mp, sc->sick_mask); + xfs_fs_mark_corrupt(sc->mp, sc->sick_mask); else xfs_fs_mark_healthy(sc->mp, sc->sick_mask); break; case XHG_RT: if (bad) - xfs_rt_mark_sick(sc->mp, sc->sick_mask); + xfs_rt_mark_corrupt(sc->mp, sc->sick_mask); else xfs_rt_mark_healthy(sc->mp, sc->sick_mask); break; diff --git a/fs/xfs/xfs_health.c b/fs/xfs/xfs_health.c index 111c27a6b107..e727a46a95a4 100644 --- 
a/fs/xfs/xfs_health.c +++ b/fs/xfs/xfs_health.c @@ -96,6 +96,20 @@ xfs_fs_mark_sick( ASSERT(!(mask & ~XFS_SICK_FS_PRIMARY)); trace_xfs_fs_mark_sick(mp, mask); + spin_lock(&mp->m_sb_lock); + mp->m_fs_sick |= mask; + spin_unlock(&mp->m_sb_lock); +} + +/* Mark per-fs metadata as having been checked and found unhealthy by fsck. */ +void +xfs_fs_mark_corrupt( + struct xfs_mount *mp, + unsigned int mask) +{ + ASSERT(!(mask & ~XFS_SICK_FS_PRIMARY)); + trace_xfs_fs_mark_corrupt(mp, mask); + spin_lock(&mp->m_sb_lock); mp->m_fs_sick |= mask; mp->m_fs_checked |= mask; @@ -139,6 +153,20 @@ xfs_rt_mark_sick( ASSERT(!(mask & ~XFS_SICK_RT_PRIMARY)); trace_xfs_rt_mark_sick(mp, mask); + spin_lock(&mp->m_sb_lock); + mp->m_rt_sick |= mask; + spin_unlock(&mp->m_sb_lock); +} + +/* Mark realtime metadata as having been checked and found unhealthy by fsck. */ +void +xfs_rt_mark_corrupt( + struct xfs_mount *mp, + unsigned int mask) +{ + ASSERT(!(mask & ~XFS_SICK_RT_PRIMARY)); + trace_xfs_rt_mark_corrupt(mp, mask); + spin_lock(&mp->m_sb_lock); mp->m_rt_sick |= mask; mp->m_rt_checked |= mask; @@ -182,6 +210,20 @@ xfs_ag_mark_sick( ASSERT(!(mask & ~XFS_SICK_AG_PRIMARY)); trace_xfs_ag_mark_sick(pag->pag_mount, pag->pag_agno, mask); + spin_lock(&pag->pag_state_lock); + pag->pag_sick |= mask; + spin_unlock(&pag->pag_state_lock); +} + +/* Mark per-ag metadata as having been checked and found unhealthy by fsck. */ +void +xfs_ag_mark_corrupt( + struct xfs_perag *pag, + unsigned int mask) +{ + ASSERT(!(mask & ~XFS_SICK_AG_PRIMARY)); + trace_xfs_ag_mark_corrupt(pag->pag_mount, pag->pag_agno, mask); + spin_lock(&pag->pag_state_lock); pag->pag_sick |= mask; pag->pag_checked |= mask; @@ -225,6 +267,29 @@ xfs_inode_mark_sick( ASSERT(!(mask & ~(XFS_SICK_INO_PRIMARY | XFS_SICK_INO_ZAPPED))); trace_xfs_inode_mark_sick(ip, mask); + spin_lock(&ip->i_flags_lock); + ip->i_sick |= mask; + spin_unlock(&ip->i_flags_lock); + + /* + * Keep this inode around so we don't lose the sickness report. Scrub + * grabs inodes with DONTCACHE assuming that most inode are ok, which + * is not the case here. + */ + spin_lock(&VFS_I(ip)->i_lock); + VFS_I(ip)->i_state &= ~I_DONTCACHE; + spin_unlock(&VFS_I(ip)->i_lock); +} + +/* Mark inode metadata as having been checked and found unhealthy by fsck. 
*/ +void +xfs_inode_mark_corrupt( + struct xfs_inode *ip, + unsigned int mask) +{ + ASSERT(!(mask & ~(XFS_SICK_INO_PRIMARY | XFS_SICK_INO_ZAPPED))); + trace_xfs_inode_mark_corrupt(ip, mask); + spin_lock(&ip->i_flags_lock); ip->i_sick |= mask; ip->i_checked |= mask; diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index c7e57efe0356..667ecae469cc 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -3926,9 +3926,11 @@ DEFINE_EVENT(xfs_fs_corrupt_class, name, \ TP_PROTO(struct xfs_mount *mp, unsigned int flags), \ TP_ARGS(mp, flags)) DEFINE_FS_CORRUPT_EVENT(xfs_fs_mark_sick); +DEFINE_FS_CORRUPT_EVENT(xfs_fs_mark_corrupt); DEFINE_FS_CORRUPT_EVENT(xfs_fs_mark_healthy); DEFINE_FS_CORRUPT_EVENT(xfs_fs_unfixed_corruption); DEFINE_FS_CORRUPT_EVENT(xfs_rt_mark_sick); +DEFINE_FS_CORRUPT_EVENT(xfs_rt_mark_corrupt); DEFINE_FS_CORRUPT_EVENT(xfs_rt_mark_healthy); DEFINE_FS_CORRUPT_EVENT(xfs_rt_unfixed_corruption); @@ -3955,6 +3957,7 @@ DEFINE_EVENT(xfs_ag_corrupt_class, name, \ unsigned int flags), \ TP_ARGS(mp, agno, flags)) DEFINE_AG_CORRUPT_EVENT(xfs_ag_mark_sick); +DEFINE_AG_CORRUPT_EVENT(xfs_ag_mark_corrupt); DEFINE_AG_CORRUPT_EVENT(xfs_ag_mark_healthy); DEFINE_AG_CORRUPT_EVENT(xfs_ag_unfixed_corruption); @@ -3980,6 +3983,7 @@ DEFINE_EVENT(xfs_inode_corrupt_class, name, \ TP_PROTO(struct xfs_inode *ip, unsigned int flags), \ TP_ARGS(ip, flags)) DEFINE_INODE_CORRUPT_EVENT(xfs_inode_mark_sick); +DEFINE_INODE_CORRUPT_EVENT(xfs_inode_mark_corrupt); DEFINE_INODE_CORRUPT_EVENT(xfs_inode_mark_healthy); TRACE_EVENT(xfs_iwalk_ag, -- cgit v1.2.3 From 50645ce8822d23ae3e002d3bee775fa8c315f957 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 22 Feb 2024 12:31:02 -0800 Subject: xfs: report fs corruption errors to the health tracking system Whenever we encounter corrupt fs metadata, we should report that to the health monitoring system for later reporting. A convenient program for identifying places to insert xfs_*_mark_sick calls is as follows: #!/bin/bash # Detect missing calls to xfs_*_mark_sick filter=cat tty -s && filter=less git grep -B3 EFSCORRUPTED fs/xfs/*.[ch] fs/xfs/libxfs/*.[ch] fs/xfs/scrub/*.[ch] | awk ' BEGIN { ignore = 0; lineno = 0; delete lines; } { if ($0 == "--") { if (!ignore) { for (i = 0; i < lineno; i++) { print(lines[i]); } printf("--\n"); } delete lines; lineno = 0; ignore = 0; } else if ($0 ~ /mark_sick/) { ignore = 1; } else if ($0 ~ /if .fa/) { ignore = 1; } else if ($0 ~ /failaddr/) { ignore = 1; } else if ($0 ~ /_verifier_error/) { ignore = 1; } else if ($0 ~ /^ \* .*EFSCORRUPTED/) { ignore = 1; } else if ($0 ~ /== -EFSCORRUPTED/) { ignore = 1; } else if ($0 ~ /!= -EFSCORRUPTED/) { ignore = 1; } else { lines[lineno++] = $0; } } ' | $filter Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_ag.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index 036f4ee43fd3..d23800c45292 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -217,6 +217,7 @@ xfs_initialize_perag_data( */ if (fdblocks > sbp->sb_dblocks || ifree > ialloc) { xfs_alert(mp, "AGF corruption. Please run xfs_repair."); + xfs_fs_mark_sick(mp, XFS_SICK_FS_COUNTERS); error = -EFSCORRUPTED; goto out; } -- cgit v1.2.3 From de6077ec4198b9313c6e09e4c6acbe9179d057c1 Mon Sep 17 00:00:00 2001 From: "Darrick J. 
Wong" Date: Thu, 22 Feb 2024 12:31:03 -0800 Subject: xfs: report ag header corruption errors to the health tracking system Whenever we encounter a corrupt AG header, we should report that to the health monitoring system for later reporting. Buffer readers that don't respond to corruption events with a _mark_sick call can be detected with the following script: #!/bin/bash # Detect missing calls to xfs_*_mark_sick filter=cat tty -s && filter=less git grep -A10 -E '( = xfs_trans_read_buf| = xfs_buf_read\()' fs/xfs/*.[ch] fs/xfs/libxfs/*.[ch] | awk ' BEGIN { ignore = 0; lineno = 0; delete lines; } { if ($0 == "--") { if (!ignore) { for (i = 0; i < lineno; i++) { print(lines[i]); } printf("--\n"); } delete lines; lineno = 0; ignore = 0; } else if ($0 ~ /mark_sick/) { ignore = 1; } else { lines[lineno++] = $0; } } ' | $filter Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_alloc.c | 6 ++++++ fs/xfs/libxfs/xfs_health.h | 13 ++++++++++--- fs/xfs/libxfs/xfs_ialloc.c | 3 +++ fs/xfs/libxfs/xfs_sb.c | 2 ++ fs/xfs/xfs_health.c | 17 +++++++++++++++++ fs/xfs/xfs_inode.c | 14 ++++++++++++-- 6 files changed, 50 insertions(+), 5 deletions(-) diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 3bd0a33fee0a..1cb2569c43cf 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -26,6 +26,7 @@ #include "xfs_ag.h" #include "xfs_ag_resv.h" #include "xfs_bmap.h" +#include "xfs_health.h" struct kmem_cache *xfs_extfree_item_cache; @@ -755,6 +756,8 @@ xfs_alloc_read_agfl( mp, tp, mp->m_ddev_targp, XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGFL_DADDR(mp)), XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_agfl_buf_ops); + if (xfs_metadata_is_sick(error)) + xfs_ag_mark_sick(pag, XFS_SICK_AG_AGFL); if (error) return error; xfs_buf_set_ref(bp, XFS_AGFL_REF); @@ -776,6 +779,7 @@ xfs_alloc_update_counters( if (unlikely(be32_to_cpu(agf->agf_freeblks) > be32_to_cpu(agf->agf_length))) { xfs_buf_mark_corrupt(agbp); + xfs_ag_mark_sick(agbp->b_pag, XFS_SICK_AG_AGF); return -EFSCORRUPTED; } @@ -3268,6 +3272,8 @@ xfs_read_agf( error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGF_DADDR(mp)), XFS_FSS_TO_BB(mp, 1), flags, agfbpp, &xfs_agf_buf_ops); + if (xfs_metadata_is_sick(error)) + xfs_ag_mark_sick(pag, XFS_SICK_AG_AGF); if (error) return error; diff --git a/fs/xfs/libxfs/xfs_health.h b/fs/xfs/libxfs/xfs_health.h index bec7adf9fcf7..fb3f2b49087d 100644 --- a/fs/xfs/libxfs/xfs_health.h +++ b/fs/xfs/libxfs/xfs_health.h @@ -26,9 +26,11 @@ * and the "sick" field tells us if that piece was found to need repairs. 
* Therefore we can conclude that for a given sick flag value: * - * - checked && sick => metadata needs repair - * - checked && !sick => metadata is ok - * - !checked => has not been examined since mount + * - checked && sick => metadata needs repair + * - checked && !sick => metadata is ok + * - !checked && sick => errors have been observed during normal operation, + * but the metadata has not been checked thoroughly + * - !checked && !sick => has not been examined since mount */ struct xfs_mount; @@ -142,6 +144,8 @@ void xfs_rt_mark_healthy(struct xfs_mount *mp, unsigned int mask); void xfs_rt_measure_sickness(struct xfs_mount *mp, unsigned int *sick, unsigned int *checked); +void xfs_agno_mark_sick(struct xfs_mount *mp, xfs_agnumber_t agno, + unsigned int mask); void xfs_ag_mark_sick(struct xfs_perag *pag, unsigned int mask); void xfs_ag_mark_corrupt(struct xfs_perag *pag, unsigned int mask); void xfs_ag_mark_healthy(struct xfs_perag *pag, unsigned int mask); @@ -222,4 +226,7 @@ void xfs_fsop_geom_health(struct xfs_mount *mp, struct xfs_fsop_geom *geo); void xfs_ag_geom_health(struct xfs_perag *pag, struct xfs_ag_geometry *ageo); void xfs_bulkstat_health(struct xfs_inode *ip, struct xfs_bulkstat *bs); +#define xfs_metadata_is_sick(error) \ + (unlikely((error) == -EFSCORRUPTED || (error) == -EFSBADCRC)) + #endif /* __XFS_HEALTH_H__ */ diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 2361a22035b0..2531b4c08915 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -27,6 +27,7 @@ #include "xfs_log.h" #include "xfs_rmap.h" #include "xfs_ag.h" +#include "xfs_health.h" /* * Lookup a record by ino in the btree given by cur. @@ -2604,6 +2605,8 @@ xfs_read_agi( error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGI_DADDR(mp)), XFS_FSS_TO_BB(mp, 1), 0, agibpp, &xfs_agi_buf_ops); + if (xfs_metadata_is_sick(error)) + xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI); if (error) return error; if (tp) diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 5bb6e2bd6dee..d991eec05436 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -1290,6 +1290,8 @@ xfs_sb_read_secondary( error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, XFS_AG_DADDR(mp, agno, XFS_SB_BLOCK(mp)), XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_sb_buf_ops); + if (xfs_metadata_is_sick(error)) + xfs_agno_mark_sick(mp, agno, XFS_SICK_AG_SB); if (error) return error; xfs_buf_set_ref(bp, XFS_SSB_REF); diff --git a/fs/xfs/xfs_health.c b/fs/xfs/xfs_health.c index e727a46a95a4..91a29a8610f6 100644 --- a/fs/xfs/xfs_health.c +++ b/fs/xfs/xfs_health.c @@ -201,6 +201,23 @@ xfs_rt_measure_sickness( spin_unlock(&mp->m_sb_lock); } +/* Mark unhealthy per-ag metadata given a raw AG number. */ +void +xfs_agno_mark_sick( + struct xfs_mount *mp, + xfs_agnumber_t agno, + unsigned int mask) +{ + struct xfs_perag *pag = xfs_perag_get(mp, agno); + + /* per-ag structure not set up yet? */ + if (!pag) + return; + + xfs_ag_mark_sick(pag, mask); + xfs_perag_put(pag); +} + /* Mark unhealthy per-ag metadata. 
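 * Runtime code calls this when it trips over corruption; fsck tools use
 * xfs_ag_mark_corrupt() so that the checked state advances as well.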
*/ void xfs_ag_mark_sick( diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index e8845287debd..ab2d891b8d14 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -780,6 +780,8 @@ xfs_init_new_inode( */ if ((pip && ino == pip->i_ino) || !xfs_verify_dir_ino(mp, ino)) { xfs_alert(mp, "Allocated a known in-use inode 0x%llx!", ino); + xfs_agno_mark_sick(mp, XFS_INO_TO_AGNO(mp, ino), + XFS_SICK_AG_INOBT); return -EFSCORRUPTED; } @@ -1970,6 +1972,7 @@ xfs_iunlink_update_bucket( */ if (old_value == new_agino) { xfs_buf_mark_corrupt(agibp); + xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI); return -EFSCORRUPTED; } @@ -2019,11 +2022,14 @@ xfs_iunlink_reload_next( */ ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, next_agino); error = xfs_iget(mp, tp, ino, XFS_IGET_UNTRUSTED, 0, &next_ip); - if (error) + if (error) { + xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI); return error; + } /* If this is not an unlinked inode, something is very wrong. */ if (VFS_I(next_ip)->i_nlink != 0) { + xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI); error = -EFSCORRUPTED; goto rele; } @@ -2061,6 +2067,7 @@ xfs_iunlink_insert_inode( if (next_agino == agino || !xfs_verify_agino_or_null(pag, next_agino)) { xfs_buf_mark_corrupt(agibp); + xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI); return -EFSCORRUPTED; } @@ -2148,6 +2155,7 @@ xfs_iunlink_remove_inode( if (!xfs_verify_agino(pag, head_agino)) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agi, sizeof(*agi)); + xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI); return -EFSCORRUPTED; } @@ -2176,8 +2184,10 @@ xfs_iunlink_remove_inode( struct xfs_inode *prev_ip; prev_ip = xfs_iunlink_lookup(pag, ip->i_prev_unlinked); - if (!prev_ip) + if (!prev_ip) { + xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE); return -EFSCORRUPTED; + } error = xfs_iunlink_log_inode(tp, prev_ip, pag, ip->i_next_unlinked); -- cgit v1.2.3 From 1196f3f5abf736809cafac1696967ac318a44ca0 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 22 Feb 2024 12:31:51 -0800 Subject: xfs: report block map corruption errors to the health tracking system Whenever we encounter a corrupt block mapping, we should report that to the health monitoring system for later reporting. Signed-off-by: Darrick J. 
Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_bmap.c | 35 +++++++++++++++++++++++++++++------ fs/xfs/libxfs/xfs_health.h | 1 + fs/xfs/xfs_health.c | 26 ++++++++++++++++++++++++++ fs/xfs/xfs_iomap.c | 15 ++++++++++++--- fs/xfs/xfs_reflink.c | 6 +++++- 5 files changed, 73 insertions(+), 10 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 68ab22a2a453..7e632f13af07 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -36,6 +36,7 @@ #include "xfs_refcount.h" #include "xfs_icache.h" #include "xfs_iomap.h" +#include "xfs_health.h" struct kmem_cache *xfs_bmap_intent_cache; @@ -960,6 +961,7 @@ xfs_bmap_add_attrfork_local( /* should only be called for types that support local format data */ ASSERT(0); + xfs_bmap_mark_sick(ip, XFS_ATTR_FORK); return -EFSCORRUPTED; } @@ -1143,6 +1145,7 @@ xfs_iread_bmbt_block( (unsigned long long)ip->i_ino); xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, block, sizeof(*block), __this_address); + xfs_bmap_mark_sick(ip, whichfork); return -EFSCORRUPTED; } @@ -1158,6 +1161,7 @@ xfs_iread_bmbt_block( xfs_inode_verifier_error(ip, -EFSCORRUPTED, "xfs_iread_extents(2)", frp, sizeof(*frp), fa); + xfs_bmap_mark_sick(ip, whichfork); return xfs_bmap_complain_bad_rec(ip, whichfork, fa, &new); } @@ -1213,6 +1217,8 @@ xfs_iread_extents( smp_store_release(&ifp->if_needextents, 0); return 0; out: + if (xfs_metadata_is_sick(error)) + xfs_bmap_mark_sick(ip, whichfork); xfs_iext_destroy(ifp); return error; } @@ -1292,6 +1298,7 @@ xfs_bmap_last_before( break; default: ASSERT(0); + xfs_bmap_mark_sick(ip, whichfork); return -EFSCORRUPTED; } @@ -3900,12 +3907,16 @@ xfs_bmapi_read( ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK | XFS_BMAPI_ENTIRE))); xfs_assert_ilocked(ip, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL); - if (WARN_ON_ONCE(!ifp)) + if (WARN_ON_ONCE(!ifp)) { + xfs_bmap_mark_sick(ip, whichfork); return -EFSCORRUPTED; + } if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) || - XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) + XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { + xfs_bmap_mark_sick(ip, whichfork); return -EFSCORRUPTED; + } if (xfs_is_shutdown(mp)) return -EIO; @@ -4386,6 +4397,7 @@ xfs_bmapi_write( if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) || XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { + xfs_bmap_mark_sick(ip, whichfork); return -EFSCORRUPTED; } @@ -4613,9 +4625,11 @@ xfs_bmapi_convert_delalloc( error = -ENOSPC; if (WARN_ON_ONCE(bma.blkno == NULLFSBLOCK)) goto out_finish; - error = -EFSCORRUPTED; - if (WARN_ON_ONCE(!xfs_valid_startblock(ip, bma.got.br_startblock))) + if (WARN_ON_ONCE(!xfs_valid_startblock(ip, bma.got.br_startblock))) { + xfs_bmap_mark_sick(ip, whichfork); + error = -EFSCORRUPTED; goto out_finish; + } XFS_STATS_ADD(mp, xs_xstrat_bytes, XFS_FSB_TO_B(mp, bma.length)); XFS_STATS_INC(mp, xs_xstrat_quick); @@ -4674,6 +4688,7 @@ xfs_bmapi_remap( if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) || XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { + xfs_bmap_mark_sick(ip, whichfork); return -EFSCORRUPTED; } @@ -5286,8 +5301,10 @@ __xfs_bunmapi( whichfork = xfs_bmapi_whichfork(flags); ASSERT(whichfork != XFS_COW_FORK); ifp = xfs_ifork_ptr(ip, whichfork); - if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp))) + if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp))) { + xfs_bmap_mark_sick(ip, whichfork); return -EFSCORRUPTED; + } if (xfs_is_shutdown(mp)) return -EIO; @@ -5757,6 +5774,7 @@ xfs_bmap_collapse_extents( if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) || 
XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { + xfs_bmap_mark_sick(ip, whichfork); return -EFSCORRUPTED; } @@ -5872,6 +5890,7 @@ xfs_bmap_insert_extents( if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) || XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { + xfs_bmap_mark_sick(ip, whichfork); return -EFSCORRUPTED; } @@ -5975,6 +5994,7 @@ xfs_bmap_split_extent( if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) || XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { + xfs_bmap_mark_sick(ip, whichfork); return -EFSCORRUPTED; } @@ -6157,8 +6177,10 @@ xfs_bmap_finish_one( bmap->br_startoff, bmap->br_blockcount, bmap->br_state); - if (WARN_ON_ONCE(bi->bi_whichfork != XFS_DATA_FORK)) + if (WARN_ON_ONCE(bi->bi_whichfork != XFS_DATA_FORK)) { + xfs_bmap_mark_sick(bi->bi_owner, bi->bi_whichfork); return -EFSCORRUPTED; + } if (XFS_TEST_ERROR(false, tp->t_mountp, XFS_ERRTAG_BMAP_FINISH_ONE)) @@ -6176,6 +6198,7 @@ xfs_bmap_finish_one( break; default: ASSERT(0); + xfs_bmap_mark_sick(bi->bi_owner, bi->bi_whichfork); error = -EFSCORRUPTED; } diff --git a/fs/xfs/libxfs/xfs_health.h b/fs/xfs/libxfs/xfs_health.h index fb3f2b49087d..3c8fd060744f 100644 --- a/fs/xfs/libxfs/xfs_health.h +++ b/fs/xfs/libxfs/xfs_health.h @@ -159,6 +159,7 @@ void xfs_inode_measure_sickness(struct xfs_inode *ip, unsigned int *sick, unsigned int *checked); void xfs_health_unmount(struct xfs_mount *mp); +void xfs_bmap_mark_sick(struct xfs_inode *ip, int whichfork); /* Now some helpers. */ diff --git a/fs/xfs/xfs_health.c b/fs/xfs/xfs_health.c index 91a29a8610f6..349360a708b4 100644 --- a/fs/xfs/xfs_health.c +++ b/fs/xfs/xfs_health.c @@ -481,3 +481,29 @@ xfs_bulkstat_health( bs->bs_sick |= m->ioctl_mask; } } + +/* Mark a block mapping sick. */ +void +xfs_bmap_mark_sick( + struct xfs_inode *ip, + int whichfork) +{ + unsigned int mask; + + switch (whichfork) { + case XFS_DATA_FORK: + mask = XFS_SICK_INO_BMBTD; + break; + case XFS_ATTR_FORK: + mask = XFS_SICK_INO_BMBTA; + break; + case XFS_COW_FORK: + mask = XFS_SICK_INO_BMBTC; + break; + default: + ASSERT(0); + return; + } + + xfs_inode_mark_sick(ip, mask); +} diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 055cdec2e9ad..4087af7f3c9f 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -27,6 +27,7 @@ #include "xfs_dquot_item.h" #include "xfs_dquot.h" #include "xfs_reflink.h" +#include "xfs_health.h" #define XFS_ALLOC_ALIGN(mp, off) \ (((off) >> mp->m_allocsize_log) << mp->m_allocsize_log) @@ -45,6 +46,7 @@ xfs_alert_fsblock_zero( (unsigned long long)imap->br_startoff, (unsigned long long)imap->br_blockcount, imap->br_state); + xfs_bmap_mark_sick(ip, XFS_DATA_FORK); return -EFSCORRUPTED; } @@ -99,8 +101,10 @@ xfs_bmbt_to_iomap( struct xfs_mount *mp = ip->i_mount; struct xfs_buftarg *target = xfs_inode_buftarg(ip); - if (unlikely(!xfs_valid_startblock(ip, imap->br_startblock))) + if (unlikely(!xfs_valid_startblock(ip, imap->br_startblock))) { + xfs_bmap_mark_sick(ip, XFS_DATA_FORK); return xfs_alert_fsblock_zero(ip, imap); + } if (imap->br_startblock == HOLESTARTBLOCK) { iomap->addr = IOMAP_NULL_ADDR; @@ -325,8 +329,10 @@ xfs_iomap_write_direct( goto out_unlock; } - if (unlikely(!xfs_valid_startblock(ip, imap->br_startblock))) + if (unlikely(!xfs_valid_startblock(ip, imap->br_startblock))) { + xfs_bmap_mark_sick(ip, XFS_DATA_FORK); error = xfs_alert_fsblock_zero(ip, imap); + } out_unlock: *seq = xfs_iomap_inode_sequence(ip, 0); @@ -639,8 +645,10 @@ xfs_iomap_write_unwritten( if (error) return error; - if (unlikely(!xfs_valid_startblock(ip, 
imap.br_startblock))) + if (unlikely(!xfs_valid_startblock(ip, imap.br_startblock))) { + xfs_bmap_mark_sick(ip, XFS_DATA_FORK); return xfs_alert_fsblock_zero(ip, &imap); + } if ((numblks_fsb = imap.br_blockcount) == 0) { /* @@ -986,6 +994,7 @@ xfs_buffered_write_iomap_begin( if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(&ip->i_df)) || XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { + xfs_bmap_mark_sick(ip, XFS_DATA_FORK); error = -EFSCORRUPTED; goto out_unlock; } diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index e64ef2a293b6..cd9a00fd16e7 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -29,6 +29,7 @@ #include "xfs_iomap.h" #include "xfs_ag.h" #include "xfs_ag_resv.h" +#include "xfs_health.h" /* * Copy on Write of Shared Blocks @@ -1227,8 +1228,10 @@ xfs_reflink_remap_extent( * extent if they're both holes or both the same physical extent. */ if (dmap->br_startblock == smap.br_startblock) { - if (dmap->br_state != smap.br_state) + if (dmap->br_state != smap.br_state) { + xfs_bmap_mark_sick(ip, XFS_DATA_FORK); error = -EFSCORRUPTED; + } goto out_cancel; } @@ -1391,6 +1394,7 @@ xfs_reflink_remap_blocks( ASSERT(nimaps == 1 && imap.br_startoff == srcoff); if (imap.br_startblock == DELAYSTARTBLOCK) { ASSERT(imap.br_startblock != DELAYSTARTBLOCK); + xfs_bmap_mark_sick(src, XFS_DATA_FORK); error = -EFSCORRUPTED; break; } -- cgit v1.2.3 From a78d10f45b23149f1b23019a4f4fb57dcf852e39 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 22 Feb 2024 12:32:09 -0800 Subject: xfs: report btree block corruption errors to the health system Whenever we encounter corrupt btree blocks, we should report that to the health monitoring system for later reporting. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_alloc.c | 2 ++ fs/xfs/libxfs/xfs_bmap.c | 6 ++++++ fs/xfs/libxfs/xfs_btree.c | 25 ++++++++++++++++++++++--- fs/xfs/libxfs/xfs_health.h | 2 ++ fs/xfs/libxfs/xfs_ialloc.c | 1 + fs/xfs/libxfs/xfs_refcount.c | 6 +++++- fs/xfs/libxfs/xfs_rmap.c | 6 +++++- fs/xfs/xfs_health.c | 38 ++++++++++++++++++++++++++++++++++++++ 8 files changed, 81 insertions(+), 5 deletions(-) diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 1cb2569c43cf..2464b64b1cb4 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -275,6 +275,7 @@ xfs_alloc_complain_bad_rec( xfs_warn(mp, "start block 0x%x block count 0x%x", irec->ar_startblock, irec->ar_blockcount); + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; } @@ -2702,6 +2703,7 @@ xfs_exact_minlen_extent_available( goto out; if (*stat == 0) { + xfs_btree_mark_sick(cnt_cur); error = -EFSCORRUPTED; goto out; } diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 7e632f13af07..78d2d3393a86 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -368,6 +368,8 @@ xfs_bmap_check_leaf_extents( error = xfs_btree_read_bufl(mp, NULL, bno, &bp, XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops); + if (xfs_metadata_is_sick(error)) + xfs_btree_mark_sick(cur); if (error) goto error_norelse; } @@ -454,6 +456,8 @@ xfs_bmap_check_leaf_extents( error = xfs_btree_read_bufl(mp, NULL, bno, &bp, XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops); + if (xfs_metadata_is_sick(error)) + xfs_btree_mark_sick(cur); if (error) goto error_norelse; } @@ -568,6 +572,8 @@ xfs_bmap_btree_to_extents( #endif error = xfs_btree_read_bufl(mp, tp, cbno, &cbp, XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops); + if (xfs_metadata_is_sick(error)) + xfs_btree_mark_sick(cur); if (error) return error; cblock = 
XFS_BUF_TO_BLOCK(cbp); diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 1adfc35c99c9..a4784e56b589 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -27,6 +27,7 @@ #include "xfs_bmap_btree.h" #include "xfs_rmap_btree.h" #include "xfs_refcount_btree.h" +#include "xfs_health.h" /* * Btree magic numbers. @@ -177,6 +178,7 @@ xfs_btree_check_lblock( XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BTREE_CHECK_LBLOCK)) { if (bp) trace_xfs_btree_corrupt(bp, _RET_IP_); + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; } return 0; @@ -243,6 +245,7 @@ xfs_btree_check_sblock( XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BTREE_CHECK_SBLOCK)) { if (bp) trace_xfs_btree_corrupt(bp, _RET_IP_); + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; } return 0; @@ -318,6 +321,7 @@ xfs_btree_check_ptr( level, index); } + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; } @@ -498,6 +502,8 @@ xfs_btree_dup_cursor( xfs_buf_daddr(bp), mp->m_bsize, 0, &bp, cur->bc_ops->buf_ops); + if (xfs_metadata_is_sick(error)) + xfs_btree_mark_sick(new); if (error) { xfs_btree_del_cursor(new, error); *ncur = NULL; @@ -1351,6 +1357,8 @@ xfs_btree_read_buf_block( error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d, mp->m_bsize, flags, bpp, cur->bc_ops->buf_ops); + if (xfs_metadata_is_sick(error)) + xfs_btree_mark_sick(cur); if (error) return error; @@ -1661,6 +1669,7 @@ xfs_btree_increment( if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) goto out0; ASSERT(0); + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } @@ -1754,6 +1763,7 @@ xfs_btree_decrement( if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) goto out0; ASSERT(0); + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } @@ -1846,6 +1856,7 @@ out_bad: *blkp = NULL; xfs_buf_mark_corrupt(bp); xfs_trans_brelse(cur->bc_tp, bp); + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; } @@ -1892,8 +1903,10 @@ xfs_btree_lookup( XFS_BTREE_STATS_INC(cur, lookup); /* No such thing as a zero-level tree. */ - if (XFS_IS_CORRUPT(cur->bc_mp, cur->bc_nlevels == 0)) + if (XFS_IS_CORRUPT(cur->bc_mp, cur->bc_nlevels == 0)) { + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; + } block = NULL; keyno = 0; @@ -1936,6 +1949,7 @@ xfs_btree_lookup( XFS_ERRLEVEL_LOW, cur->bc_mp, block, sizeof(*block)); + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; } @@ -4369,12 +4383,16 @@ xfs_btree_visit_block( */ if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { if (be64_to_cpu(rptr.l) == XFS_DADDR_TO_FSB(cur->bc_mp, - xfs_buf_daddr(bp))) + xfs_buf_daddr(bp))) { + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; + } } else { if (be32_to_cpu(rptr.s) == xfs_daddr_to_agbno(cur->bc_mp, - xfs_buf_daddr(bp))) + xfs_buf_daddr(bp))) { + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; + } } return xfs_btree_lookup_get_block(cur, level, &rptr, &block); } @@ -5233,6 +5251,7 @@ xfs_btree_goto_left_edge( return error; if (stat != 0) { ASSERT(0); + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; } diff --git a/fs/xfs/libxfs/xfs_health.h b/fs/xfs/libxfs/xfs_health.h index 3c8fd060744f..8f566a78737f 100644 --- a/fs/xfs/libxfs/xfs_health.h +++ b/fs/xfs/libxfs/xfs_health.h @@ -37,6 +37,7 @@ struct xfs_mount; struct xfs_perag; struct xfs_inode; struct xfs_fsop_geom; +struct xfs_btree_cur; /* Observable health issues for metadata spanning the entire filesystem. 
*/ #define XFS_SICK_FS_COUNTERS (1 << 0) /* summary counters */ @@ -160,6 +161,7 @@ void xfs_inode_measure_sickness(struct xfs_inode *ip, unsigned int *sick, void xfs_health_unmount(struct xfs_mount *mp); void xfs_bmap_mark_sick(struct xfs_inode *ip, int whichfork); +void xfs_btree_mark_sick(struct xfs_btree_cur *cur); /* Now some helpers. */ diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 2531b4c08915..91584e96f05f 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -148,6 +148,7 @@ xfs_inobt_complain_bad_rec( "start inode 0x%x, count 0x%x, free 0x%x freemask 0x%llx, holemask 0x%x", irec->ir_startino, irec->ir_count, irec->ir_freecount, irec->ir_free, irec->ir_holemask); + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; } diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 7df52daa22cf..c7fda1a4950a 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -23,6 +23,7 @@ #include "xfs_refcount.h" #include "xfs_rmap.h" #include "xfs_ag.h" +#include "xfs_health.h" struct kmem_cache *xfs_refcount_intent_cache; @@ -156,6 +157,7 @@ xfs_refcount_complain_bad_rec( xfs_warn(mp, "Start block 0x%x, block count 0x%x, references 0x%x", irec->rc_startblock, irec->rc_blockcount, irec->rc_refcount); + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; } @@ -1889,8 +1891,10 @@ xfs_refcount_recover_extent( struct xfs_refcount_recovery *rr; if (XFS_IS_CORRUPT(cur->bc_mp, - be32_to_cpu(rec->refc.rc_refcount) != 1)) + be32_to_cpu(rec->refc.rc_refcount) != 1)) { + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; + } rr = kmalloc(sizeof(struct xfs_refcount_recovery), GFP_KERNEL | __GFP_NOFAIL); diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index 0bd1f47b2c2b..8c4d30a0febb 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -23,6 +23,7 @@ #include "xfs_error.h" #include "xfs_inode.h" #include "xfs_ag.h" +#include "xfs_health.h" struct kmem_cache *xfs_rmap_intent_cache; @@ -56,8 +57,10 @@ xfs_rmap_lookup_le( error = xfs_rmap_get_rec(cur, irec, &get_stat); if (error) return error; - if (!get_stat) + if (!get_stat) { + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; + } return 0; } @@ -277,6 +280,7 @@ xfs_rmap_complain_bad_rec( "Owner 0x%llx, flags 0x%x, start block 0x%x block count 0x%x", irec->rm_owner, irec->rm_flags, irec->rm_startblock, irec->rm_blockcount); + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; } diff --git a/fs/xfs/xfs_health.c b/fs/xfs/xfs_health.c index 349360a708b4..e9338c05ea23 100644 --- a/fs/xfs/xfs_health.c +++ b/fs/xfs/xfs_health.c @@ -14,6 +14,7 @@ #include "xfs_trace.h" #include "xfs_health.h" #include "xfs_ag.h" +#include "xfs_btree.h" /* * Warn about metadata corruption that we detected but haven't fixed, and @@ -507,3 +508,40 @@ xfs_bmap_mark_sick( xfs_inode_mark_sick(ip, mask); } + +/* Record observations of btree corruption with the health tracking system. 
*/ +void +xfs_btree_mark_sick( + struct xfs_btree_cur *cur) +{ + unsigned int mask; + + switch (cur->bc_btnum) { + case XFS_BTNUM_BMAP: + xfs_bmap_mark_sick(cur->bc_ino.ip, cur->bc_ino.whichfork); + return; + case XFS_BTNUM_BNO: + mask = XFS_SICK_AG_BNOBT; + break; + case XFS_BTNUM_CNT: + mask = XFS_SICK_AG_CNTBT; + break; + case XFS_BTNUM_INO: + mask = XFS_SICK_AG_INOBT; + break; + case XFS_BTNUM_FINO: + mask = XFS_SICK_AG_FINOBT; + break; + case XFS_BTNUM_RMAP: + mask = XFS_SICK_AG_RMAPBT; + break; + case XFS_BTNUM_REFC: + mask = XFS_SICK_AG_REFCNTBT; + break; + default: + ASSERT(0); + return; + } + + xfs_ag_mark_sick(cur->bc_ag.pag, mask); +} -- cgit v1.2.3 From ca14c0968c1f693ab4bcb5368c800c33e7a2ad7e Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 22 Feb 2024 12:32:18 -0800 Subject: xfs: report dir/attr block corruption errors to the health system Whenever we encounter corrupt directory or extended attribute blocks, we should report that to the health monitoring system for later reporting. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_attr_leaf.c | 4 ++++ fs/xfs/libxfs/xfs_attr_remote.c | 27 ++++++++++++++++----------- fs/xfs/libxfs/xfs_da_btree.c | 37 ++++++++++++++++++++++++++++++++----- fs/xfs/libxfs/xfs_dir2.c | 5 ++++- fs/xfs/libxfs/xfs_dir2_block.c | 2 ++ fs/xfs/libxfs/xfs_dir2_data.c | 3 +++ fs/xfs/libxfs/xfs_dir2_leaf.c | 3 +++ fs/xfs/libxfs/xfs_dir2_node.c | 7 +++++++ fs/xfs/libxfs/xfs_health.h | 3 +++ fs/xfs/xfs_attr_inactive.c | 4 ++++ fs/xfs/xfs_attr_list.c | 9 ++++++++- fs/xfs/xfs_health.c | 39 +++++++++++++++++++++++++++++++++++++++ 12 files changed, 125 insertions(+), 18 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 192d9938a231..ac904cc1a97b 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -29,6 +29,7 @@ #include "xfs_log.h" #include "xfs_ag.h" #include "xfs_errortag.h" +#include "xfs_health.h" /* @@ -2343,6 +2344,7 @@ xfs_attr3_leaf_lookup_int( entries = xfs_attr3_leaf_entryp(leaf); if (ichdr.count >= args->geo->blksize / 8) { xfs_buf_mark_corrupt(bp); + xfs_da_mark_sick(args); return -EFSCORRUPTED; } @@ -2362,10 +2364,12 @@ xfs_attr3_leaf_lookup_int( } if (!(probe >= 0 && (!ichdr.count || probe < ichdr.count))) { xfs_buf_mark_corrupt(bp); + xfs_da_mark_sick(args); return -EFSCORRUPTED; } if (!(span <= 4 || be32_to_cpu(entry->hashval) == hashval)) { xfs_buf_mark_corrupt(bp); + xfs_da_mark_sick(args); return -EFSCORRUPTED; } diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c index 1c007ebf153a..43d66fbbdf6f 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.c +++ b/fs/xfs/libxfs/xfs_attr_remote.c @@ -22,6 +22,7 @@ #include "xfs_attr_remote.h" #include "xfs_trace.h" #include "xfs_error.h" +#include "xfs_health.h" #define ATTR_RMTVALUE_MAPSIZE 1 /* # of map entries at once */ @@ -276,17 +277,18 @@ xfs_attr3_rmt_hdr_set( */ STATIC int xfs_attr_rmtval_copyout( - struct xfs_mount *mp, - struct xfs_buf *bp, - xfs_ino_t ino, - int *offset, - int *valuelen, - uint8_t **dst) + struct xfs_mount *mp, + struct xfs_buf *bp, + struct xfs_inode *dp, + int *offset, + int *valuelen, + uint8_t **dst) { - char *src = bp->b_addr; - xfs_daddr_t bno = xfs_buf_daddr(bp); - int len = BBTOB(bp->b_length); - int blksize = mp->m_attr_geo->blksize; + char *src = bp->b_addr; + xfs_ino_t ino = dp->i_ino; + xfs_daddr_t bno = xfs_buf_daddr(bp); + int len = BBTOB(bp->b_length); + int blksize = mp->m_attr_geo->blksize; ASSERT(len >= blksize); @@ -302,6 
+304,7 @@ xfs_attr_rmtval_copyout( xfs_alert(mp, "remote attribute header mismatch bno/off/len/owner (0x%llx/0x%x/Ox%x/0x%llx)", bno, *offset, byte_cnt, ino); + xfs_dirattr_mark_sick(dp, XFS_ATTR_FORK); return -EFSCORRUPTED; } hdr_size = sizeof(struct xfs_attr3_rmt_hdr); @@ -418,10 +421,12 @@ xfs_attr_rmtval_get( dblkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount); error = xfs_buf_read(mp->m_ddev_targp, dblkno, dblkcnt, 0, &bp, &xfs_attr3_rmt_buf_ops); + if (xfs_metadata_is_sick(error)) + xfs_dirattr_mark_sick(args->dp, XFS_ATTR_FORK); if (error) return error; - error = xfs_attr_rmtval_copyout(mp, bp, args->dp->i_ino, + error = xfs_attr_rmtval_copyout(mp, bp, args->dp, &offset, &valuelen, &dst); xfs_buf_relse(bp); diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index 444ec1560f43..718d071bb21a 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -23,6 +23,7 @@ #include "xfs_buf_item.h" #include "xfs_log.h" #include "xfs_errortag.h" +#include "xfs_health.h" /* * xfs_da_btree.c @@ -353,6 +354,8 @@ const struct xfs_buf_ops xfs_da3_node_buf_ops = { static int xfs_da3_node_set_type( struct xfs_trans *tp, + struct xfs_inode *dp, + int whichfork, struct xfs_buf *bp) { struct xfs_da_blkinfo *info = bp->b_addr; @@ -374,6 +377,7 @@ xfs_da3_node_set_type( XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, tp->t_mountp, info, sizeof(*info)); xfs_trans_brelse(tp, bp); + xfs_dirattr_mark_sick(dp, whichfork); return -EFSCORRUPTED; } } @@ -392,7 +396,7 @@ xfs_da3_node_read( &xfs_da3_node_buf_ops); if (error || !*bpp || !tp) return error; - return xfs_da3_node_set_type(tp, *bpp); + return xfs_da3_node_set_type(tp, dp, whichfork, *bpp); } int @@ -409,6 +413,8 @@ xfs_da3_node_read_mapped( error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, mappedbno, XFS_FSB_TO_BB(mp, xfs_dabuf_nfsb(mp, whichfork)), 0, bpp, &xfs_da3_node_buf_ops); + if (xfs_metadata_is_sick(error)) + xfs_dirattr_mark_sick(dp, whichfork); if (error || !*bpp) return error; @@ -419,7 +425,7 @@ xfs_da3_node_read_mapped( if (!tp) return 0; - return xfs_da3_node_set_type(tp, *bpp); + return xfs_da3_node_set_type(tp, dp, whichfork, *bpp); } /* @@ -632,6 +638,7 @@ xfs_da3_split( if (node->hdr.info.forw) { if (be32_to_cpu(node->hdr.info.forw) != addblk->blkno) { xfs_buf_mark_corrupt(oldblk->bp); + xfs_da_mark_sick(state->args); error = -EFSCORRUPTED; goto out; } @@ -645,6 +652,7 @@ xfs_da3_split( if (node->hdr.info.back) { if (be32_to_cpu(node->hdr.info.back) != addblk->blkno) { xfs_buf_mark_corrupt(oldblk->bp); + xfs_da_mark_sick(state->args); error = -EFSCORRUPTED; goto out; } @@ -1636,6 +1644,7 @@ xfs_da3_node_lookup_int( if (magic != XFS_DA_NODE_MAGIC && magic != XFS_DA3_NODE_MAGIC) { xfs_buf_mark_corrupt(blk->bp); + xfs_da_mark_sick(args); return -EFSCORRUPTED; } @@ -1651,6 +1660,7 @@ xfs_da3_node_lookup_int( /* Tree taller than we can handle; bail out! */ if (nodehdr.level >= XFS_DA_NODE_MAXDEPTH) { xfs_buf_mark_corrupt(blk->bp); + xfs_da_mark_sick(args); return -EFSCORRUPTED; } @@ -1659,6 +1669,7 @@ xfs_da3_node_lookup_int( expected_level = nodehdr.level - 1; else if (expected_level != nodehdr.level) { xfs_buf_mark_corrupt(blk->bp); + xfs_da_mark_sick(args); return -EFSCORRUPTED; } else expected_level--; @@ -1710,12 +1721,16 @@ xfs_da3_node_lookup_int( } /* We can't point back to the root. 
*/ - if (XFS_IS_CORRUPT(dp->i_mount, blkno == args->geo->leafblk)) + if (XFS_IS_CORRUPT(dp->i_mount, blkno == args->geo->leafblk)) { + xfs_da_mark_sick(args); return -EFSCORRUPTED; + } } - if (XFS_IS_CORRUPT(dp->i_mount, expected_level != 0)) + if (XFS_IS_CORRUPT(dp->i_mount, expected_level != 0)) { + xfs_da_mark_sick(args); return -EFSCORRUPTED; + } /* * A leaf block that ends in the hashval that we are interested in @@ -1733,6 +1748,7 @@ xfs_da3_node_lookup_int( args->blkno = blk->blkno; } else { ASSERT(0); + xfs_da_mark_sick(args); return -EFSCORRUPTED; } if (((retval == -ENOENT) || (retval == -ENOATTR)) && @@ -2299,8 +2315,10 @@ xfs_da3_swap_lastblock( error = xfs_bmap_last_before(tp, dp, &lastoff, w); if (error) return error; - if (XFS_IS_CORRUPT(mp, lastoff == 0)) + if (XFS_IS_CORRUPT(mp, lastoff == 0)) { + xfs_da_mark_sick(args); return -EFSCORRUPTED; + } /* * Read the last block in the btree space. */ @@ -2350,6 +2368,7 @@ xfs_da3_swap_lastblock( if (XFS_IS_CORRUPT(mp, be32_to_cpu(sib_info->forw) != last_blkno || sib_info->magic != dead_info->magic)) { + xfs_da_mark_sick(args); error = -EFSCORRUPTED; goto done; } @@ -2370,6 +2389,7 @@ xfs_da3_swap_lastblock( if (XFS_IS_CORRUPT(mp, be32_to_cpu(sib_info->back) != last_blkno || sib_info->magic != dead_info->magic)) { + xfs_da_mark_sick(args); error = -EFSCORRUPTED; goto done; } @@ -2392,6 +2412,7 @@ xfs_da3_swap_lastblock( xfs_da3_node_hdr_from_disk(dp->i_mount, &par_hdr, par_node); if (XFS_IS_CORRUPT(mp, level >= 0 && level != par_hdr.level + 1)) { + xfs_da_mark_sick(args); error = -EFSCORRUPTED; goto done; } @@ -2403,6 +2424,7 @@ xfs_da3_swap_lastblock( entno++) continue; if (XFS_IS_CORRUPT(mp, entno == par_hdr.count)) { + xfs_da_mark_sick(args); error = -EFSCORRUPTED; goto done; } @@ -2428,6 +2450,7 @@ xfs_da3_swap_lastblock( xfs_trans_brelse(tp, par_buf); par_buf = NULL; if (XFS_IS_CORRUPT(mp, par_blkno == 0)) { + xfs_da_mark_sick(args); error = -EFSCORRUPTED; goto done; } @@ -2437,6 +2460,7 @@ xfs_da3_swap_lastblock( par_node = par_buf->b_addr; xfs_da3_node_hdr_from_disk(dp->i_mount, &par_hdr, par_node); if (XFS_IS_CORRUPT(mp, par_hdr.level != level)) { + xfs_da_mark_sick(args); error = -EFSCORRUPTED; goto done; } @@ -2567,6 +2591,7 @@ out_free_irecs: invalid_mapping: /* Caller ok with no mapping. 
*/ if (XFS_IS_CORRUPT(mp, !(flags & XFS_DABUF_MAP_HOLE_OK))) { + xfs_dirattr_mark_sick(dp, whichfork); error = -EFSCORRUPTED; if (xfs_error_level >= XFS_ERRLEVEL_LOW) { xfs_alert(mp, "%s: bno %u inode %llu", @@ -2648,6 +2673,8 @@ xfs_da_read_buf( error = xfs_trans_read_buf_map(mp, tp, mp->m_ddev_targp, mapp, nmap, 0, &bp, ops); + if (xfs_metadata_is_sick(error)) + xfs_dirattr_mark_sick(dp, whichfork); if (error) goto out_free; diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index 9018abb38d7f..4821519efad4 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -18,6 +18,7 @@ #include "xfs_errortag.h" #include "xfs_error.h" #include "xfs_trace.h" +#include "xfs_health.h" const struct xfs_name xfs_name_dotdot = { .name = (const unsigned char *)"..", @@ -626,8 +627,10 @@ xfs_dir2_isblock( return 0; *isblock = true; - if (XFS_IS_CORRUPT(mp, args->dp->i_disk_size != args->geo->blksize)) + if (XFS_IS_CORRUPT(mp, args->dp->i_disk_size != args->geo->blksize)) { + xfs_da_mark_sick(args); return -EFSCORRUPTED; + } return 0; } diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c index fde46081a824..a2da007adb46 100644 --- a/fs/xfs/libxfs/xfs_dir2_block.c +++ b/fs/xfs/libxfs/xfs_dir2_block.c @@ -20,6 +20,7 @@ #include "xfs_error.h" #include "xfs_trace.h" #include "xfs_log.h" +#include "xfs_health.h" /* * Local function prototypes. @@ -152,6 +153,7 @@ xfs_dir3_block_read( __xfs_buf_mark_corrupt(*bpp, fa); xfs_trans_brelse(tp, *bpp); *bpp = NULL; + xfs_dirattr_mark_sick(dp, XFS_DATA_FORK); return -EFSCORRUPTED; } diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c index dbcf58979a59..7a6d965bea71 100644 --- a/fs/xfs/libxfs/xfs_dir2_data.c +++ b/fs/xfs/libxfs/xfs_dir2_data.c @@ -18,6 +18,7 @@ #include "xfs_trans.h" #include "xfs_buf_item.h" #include "xfs_log.h" +#include "xfs_health.h" static xfs_failaddr_t xfs_dir2_data_freefind_verify( struct xfs_dir2_data_hdr *hdr, struct xfs_dir2_data_free *bf, @@ -433,6 +434,7 @@ xfs_dir3_data_read( __xfs_buf_mark_corrupt(*bpp, fa); xfs_trans_brelse(tp, *bpp); *bpp = NULL; + xfs_dirattr_mark_sick(dp, XFS_DATA_FORK); return -EFSCORRUPTED; } @@ -1198,6 +1200,7 @@ xfs_dir2_data_use_free( corrupt: xfs_corruption_error(__func__, XFS_ERRLEVEL_LOW, args->dp->i_mount, hdr, sizeof(*hdr), __FILE__, __LINE__, fa); + xfs_da_mark_sick(args); return -EFSCORRUPTED; } diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c index cb9e950a911d..08dda5ce9d91 100644 --- a/fs/xfs/libxfs/xfs_dir2_leaf.c +++ b/fs/xfs/libxfs/xfs_dir2_leaf.c @@ -19,6 +19,7 @@ #include "xfs_trace.h" #include "xfs_trans.h" #include "xfs_buf_item.h" +#include "xfs_health.h" /* * Local function declarations. @@ -1393,8 +1394,10 @@ xfs_dir2_leaf_removename( bestsp = xfs_dir2_leaf_bests_p(ltp); if (be16_to_cpu(bestsp[db]) != oldbest) { xfs_buf_mark_corrupt(lbp); + xfs_da_mark_sick(args); return -EFSCORRUPTED; } + /* * Mark the former data entry unused. */ diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c index 7a03aeb9f4c9..be0b8834028c 100644 --- a/fs/xfs/libxfs/xfs_dir2_node.c +++ b/fs/xfs/libxfs/xfs_dir2_node.c @@ -20,6 +20,7 @@ #include "xfs_trans.h" #include "xfs_buf_item.h" #include "xfs_log.h" +#include "xfs_health.h" /* * Function declarations. 
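/*
 * A note for readers: the read paths in this series gate their sick marking
 * on xfs_metadata_is_sick(error).  That predicate is not added by these
 * patches; it is assumed to be the existing helper in fs/xfs/xfs_error.h,
 * sketched below.  It deliberately matches only the two error codes that
 * buffer verifiers produce for on-disk corruption, so transient failures
 * such as -EIO or -ENOMEM never get recorded as filesystem sickness.
 */
static inline bool
xfs_metadata_is_sick(
	int			error)
{
	return unlikely(error == -EFSCORRUPTED || error == -EFSBADCRC);
}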
@@ -231,6 +232,7 @@ __xfs_dir3_free_read( __xfs_buf_mark_corrupt(*bpp, fa); xfs_trans_brelse(tp, *bpp); *bpp = NULL; + xfs_dirattr_mark_sick(dp, XFS_DATA_FORK); return -EFSCORRUPTED; } @@ -443,6 +445,7 @@ xfs_dir2_leaf_to_node( if (be32_to_cpu(ltp->bestcount) > (uint)dp->i_disk_size / args->geo->blksize) { xfs_buf_mark_corrupt(lbp); + xfs_da_mark_sick(args); return -EFSCORRUPTED; } @@ -517,6 +520,7 @@ xfs_dir2_leafn_add( */ if (index < 0) { xfs_buf_mark_corrupt(bp); + xfs_da_mark_sick(args); return -EFSCORRUPTED; } @@ -736,6 +740,7 @@ xfs_dir2_leafn_lookup_for_addname( cpu_to_be16(NULLDATAOFF))) { if (curfdb != newfdb) xfs_trans_brelse(tp, curbp); + xfs_da_mark_sick(args); return -EFSCORRUPTED; } curfdb = newfdb; @@ -804,6 +809,7 @@ xfs_dir2_leafn_lookup_for_entry( xfs_dir3_leaf_check(dp, bp); if (leafhdr.count <= 0) { xfs_buf_mark_corrupt(bp); + xfs_da_mark_sick(args); return -EFSCORRUPTED; } @@ -1739,6 +1745,7 @@ xfs_dir2_node_add_datablk( } else { xfs_alert(mp, " ... fblk is NULL"); } + xfs_da_mark_sick(args); return -EFSCORRUPTED; } diff --git a/fs/xfs/libxfs/xfs_health.h b/fs/xfs/libxfs/xfs_health.h index 8f566a78737f..ff98c03212b8 100644 --- a/fs/xfs/libxfs/xfs_health.h +++ b/fs/xfs/libxfs/xfs_health.h @@ -38,6 +38,7 @@ struct xfs_perag; struct xfs_inode; struct xfs_fsop_geom; struct xfs_btree_cur; +struct xfs_da_args; /* Observable health issues for metadata spanning the entire filesystem. */ #define XFS_SICK_FS_COUNTERS (1 << 0) /* summary counters */ @@ -162,6 +163,8 @@ void xfs_inode_measure_sickness(struct xfs_inode *ip, unsigned int *sick, void xfs_health_unmount(struct xfs_mount *mp); void xfs_bmap_mark_sick(struct xfs_inode *ip, int whichfork); void xfs_btree_mark_sick(struct xfs_btree_cur *cur); +void xfs_dirattr_mark_sick(struct xfs_inode *ip, int whichfork); +void xfs_da_mark_sick(struct xfs_da_args *args); /* Now some helpers. */ diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c index 89c7a9f4f930..24fb12986a56 100644 --- a/fs/xfs/xfs_attr_inactive.c +++ b/fs/xfs/xfs_attr_inactive.c @@ -23,6 +23,7 @@ #include "xfs_quota.h" #include "xfs_dir2.h" #include "xfs_error.h" +#include "xfs_health.h" /* * Invalidate any incore buffers associated with this remote attribute value @@ -147,6 +148,7 @@ xfs_attr3_node_inactive( if (level > XFS_DA_NODE_MAXDEPTH) { xfs_buf_mark_corrupt(bp); xfs_trans_brelse(*trans, bp); /* no locks for later trans */ + xfs_dirattr_mark_sick(dp, XFS_ATTR_FORK); return -EFSCORRUPTED; } @@ -197,6 +199,7 @@ xfs_attr3_node_inactive( default: xfs_buf_mark_corrupt(child_bp); xfs_trans_brelse(*trans, child_bp); + xfs_dirattr_mark_sick(dp, XFS_ATTR_FORK); error = -EFSCORRUPTED; break; } @@ -286,6 +289,7 @@ xfs_attr3_root_inactive( error = xfs_attr3_leaf_inactive(trans, dp, bp); break; default: + xfs_dirattr_mark_sick(dp, XFS_ATTR_FORK); error = -EFSCORRUPTED; xfs_buf_mark_corrupt(bp); xfs_trans_brelse(*trans, bp); diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c index 56021403f447..0ca4dc62f814 100644 --- a/fs/xfs/xfs_attr_list.c +++ b/fs/xfs/xfs_attr_list.c @@ -22,6 +22,7 @@ #include "xfs_error.h" #include "xfs_trace.h" #include "xfs_dir2.h" +#include "xfs_health.h" STATIC int xfs_attr_shortform_compare(const void *a, const void *b) @@ -125,6 +126,7 @@ xfs_attr_shortform_list( context->dp->i_mount, sfe, sizeof(*sfe)); kfree(sbuf); + xfs_dirattr_mark_sick(dp, XFS_ATTR_FORK); return -EFSCORRUPTED; } @@ -262,8 +264,10 @@ xfs_attr_node_list_lookup( return 0; /* We can't point back to the root. 
*/ - if (XFS_IS_CORRUPT(mp, cursor->blkno == 0)) + if (XFS_IS_CORRUPT(mp, cursor->blkno == 0)) { + xfs_dirattr_mark_sick(dp, XFS_ATTR_FORK); return -EFSCORRUPTED; + } } if (expected_level != 0) @@ -275,6 +279,7 @@ xfs_attr_node_list_lookup( out_corruptbuf: xfs_buf_mark_corrupt(bp); xfs_trans_brelse(tp, bp); + xfs_dirattr_mark_sick(dp, XFS_ATTR_FORK); return -EFSCORRUPTED; } @@ -304,6 +309,8 @@ xfs_attr_node_list( if (cursor->blkno > 0) { error = xfs_da3_node_read(context->tp, dp, cursor->blkno, &bp, XFS_ATTR_FORK); + if (xfs_metadata_is_sick(error)) + xfs_dirattr_mark_sick(dp, XFS_ATTR_FORK); if ((error != 0) && (error != -EFSCORRUPTED)) return error; if (bp) { diff --git a/fs/xfs/xfs_health.c b/fs/xfs/xfs_health.c index e9338c05ea23..568300ae8555 100644 --- a/fs/xfs/xfs_health.c +++ b/fs/xfs/xfs_health.c @@ -15,6 +15,8 @@ #include "xfs_health.h" #include "xfs_ag.h" #include "xfs_btree.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" /* * Warn about metadata corruption that we detected but haven't fixed, and @@ -545,3 +547,40 @@ xfs_btree_mark_sick( xfs_ag_mark_sick(cur->bc_ag.pag, mask); } + +/* + * Record observations of dir/attr btree corruption with the health tracking + * system. + */ +void +xfs_dirattr_mark_sick( + struct xfs_inode *ip, + int whichfork) +{ + unsigned int mask; + + switch (whichfork) { + case XFS_DATA_FORK: + mask = XFS_SICK_INO_DIR; + break; + case XFS_ATTR_FORK: + mask = XFS_SICK_INO_XATTR; + break; + default: + ASSERT(0); + return; + } + + xfs_inode_mark_sick(ip, mask); +} + +/* + * Record observations of dir/attr btree corruption with the health tracking + * system. + */ +void +xfs_da_mark_sick( + struct xfs_da_args *args) +{ + xfs_dirattr_mark_sick(args->dp, args->whichfork); +} -- cgit v1.2.3 From b280fb0cbf48cea962d90bbe9c080ee1e77c3b4c Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 22 Feb 2024 12:32:42 -0800 Subject: xfs: report symlink block corruption errors to the health system Whenever we encounter corrupt symbolic link blocks, we should report that to the health monitoring system for later reporting. Signed-off-by: Darrick J. 
Wong Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_symlink.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index e73692fbe179..fb8c57b9d13d 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -58,6 +58,8 @@ xfs_readlink_bmap_ilocked( error = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0, &bp, &xfs_symlink_buf_ops); + if (xfs_metadata_is_sick(error)) + xfs_inode_mark_sick(ip, XFS_SICK_INO_SYMLINK); if (error) return error; byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt); @@ -68,6 +70,7 @@ xfs_readlink_bmap_ilocked( if (xfs_has_crc(mp)) { if (!xfs_symlink_hdr_ok(ip->i_ino, offset, byte_cnt, bp)) { + xfs_inode_mark_sick(ip, XFS_SICK_INO_SYMLINK); error = -EFSCORRUPTED; xfs_alert(mp, "symlink header does not match required off/len/owner (0x%x/Ox%x,0x%llx)", @@ -103,7 +106,7 @@ xfs_readlink( { struct xfs_mount *mp = ip->i_mount; xfs_fsize_t pathlen; - int error = -EFSCORRUPTED; + int error; trace_xfs_readlink(ip); @@ -116,14 +119,14 @@ xfs_readlink( pathlen = ip->i_disk_size; if (!pathlen) - goto out; + goto out_corrupt; if (pathlen < 0 || pathlen > XFS_SYMLINK_MAXLEN) { xfs_alert(mp, "%s: inode (%llu) bad symlink length (%lld)", __func__, (unsigned long long) ip->i_ino, (long long) pathlen); ASSERT(0); - goto out; + goto out_corrupt; } if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL) { @@ -132,7 +135,7 @@ xfs_readlink( * if if_data is junk. */ if (XFS_IS_CORRUPT(ip->i_mount, !ip->i_df.if_data)) - goto out; + goto out_corrupt; memcpy(link, ip->i_df.if_data, pathlen + 1); error = 0; @@ -140,9 +143,12 @@ xfs_readlink( error = xfs_readlink_bmap_ilocked(ip, link); } - out: xfs_iunlock(ip, XFS_ILOCK_SHARED); return error; + out_corrupt: + xfs_iunlock(ip, XFS_ILOCK_SHARED); + xfs_inode_mark_sick(ip, XFS_SICK_INO_SYMLINK); + return -EFSCORRUPTED; } int @@ -497,6 +503,7 @@ xfs_inactive_symlink( __func__, (unsigned long long)ip->i_ino, pathlen); xfs_iunlock(ip, XFS_ILOCK_EXCL); ASSERT(0); + xfs_inode_mark_sick(ip, XFS_SICK_INO_SYMLINK); return -EFSCORRUPTED; } -- cgit v1.2.3 From baf44fa5c37a2357a7ae92889f74bc1824f33fd4 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 22 Feb 2024 12:32:43 -0800 Subject: xfs: report inode corruption errors to the health system Whenever we encounter corrupt inode records, we should report that to the health monitoring system for later reporting. Signed-off-by: Darrick J. 
Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_ialloc.c | 1 + fs/xfs/libxfs/xfs_inode_buf.c | 12 +++++++++--- fs/xfs/libxfs/xfs_inode_fork.c | 8 ++++++++ fs/xfs/xfs_icache.c | 9 +++++++++ fs/xfs/xfs_inode.c | 2 ++ 5 files changed, 29 insertions(+), 3 deletions(-) diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 91584e96f05f..56f82b8af07e 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -2999,6 +2999,7 @@ xfs_ialloc_check_shrink( goto out; if (!has) { + xfs_ag_mark_sick(pag, XFS_SICK_AG_INOBT); error = -EFSCORRUPTED; goto out; } diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 137a65bda95d..1280d6acd1c1 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -18,6 +18,7 @@ #include "xfs_trans.h" #include "xfs_ialloc.h" #include "xfs_dir2.h" +#include "xfs_health.h" #include <linux/iversion.h> @@ -132,9 +133,14 @@ xfs_imap_to_bp( struct xfs_imap *imap, struct xfs_buf **bpp) { - return xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno, - imap->im_len, XBF_UNMAPPED, bpp, - &xfs_inode_buf_ops); + int error; + + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno, + imap->im_len, XBF_UNMAPPED, bpp, &xfs_inode_buf_ops); + if (xfs_metadata_is_sick(error)) + xfs_agno_mark_sick(mp, xfs_daddr_to_agno(mp, imap->im_blkno), + XFS_SICK_AG_INOBT); + return error; } static inline struct timespec64 xfs_inode_decode_bigtime(uint64_t ts) diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 60758f75182b..83194edcb0ba 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -25,6 +25,7 @@ #include "xfs_attr_leaf.h" #include "xfs_types.h" #include "xfs_errortag.h" +#include "xfs_health.h" struct kmem_cache *xfs_ifork_cache; @@ -88,6 +89,7 @@ xfs_iformat_local( xfs_inode_verifier_error(ip, -EFSCORRUPTED, "xfs_iformat_local", dip, sizeof(*dip), __this_address); + xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE); return -EFSCORRUPTED; } @@ -125,6 +127,7 @@ xfs_iformat_extents( xfs_inode_verifier_error(ip, -EFSCORRUPTED, "xfs_iformat_extents(1)", dip, sizeof(*dip), __this_address); + xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE); return -EFSCORRUPTED; } @@ -144,6 +147,7 @@ xfs_iformat_extents( xfs_inode_verifier_error(ip, -EFSCORRUPTED, "xfs_iformat_extents(2)", dp, sizeof(*dp), fa); + xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE); return xfs_bmap_complain_bad_rec(ip, whichfork, fa, &new); } @@ -202,6 +206,7 @@ xfs_iformat_btree( xfs_inode_verifier_error(ip, -EFSCORRUPTED, "xfs_iformat_btree", dfp, size, __this_address); + xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE); return -EFSCORRUPTED; } @@ -267,12 +272,14 @@ xfs_iformat_data_fork( default: xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, dip, sizeof(*dip), __this_address); + xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE); return -EFSCORRUPTED; } break; default: xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, dip, sizeof(*dip), __this_address); + xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE); return -EFSCORRUPTED; } } @@ -344,6 +351,7 @@ xfs_iformat_attr_fork( default: xfs_inode_verifier_error(ip, error, __func__, dip, sizeof(*dip), __this_address); + xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE); error = -EFSCORRUPTED; break; } diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 06046827b5fe..e64265bc0b33 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -24,6 +24,7 @@ #include "xfs_ialloc.h" #include "xfs_ag.h" #include "xfs_log_priv.h" +#include "xfs_health.h" #include <linux/iversion.h>
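/*
 * For reference, a minimal sketch of xfs_inode_mark_sick(), the pre-existing
 * primitive that the hunks below call into; it is not part of this patch.
 * This assumes the in-core health state lives in ip->i_sick/ip->i_checked,
 * consistent with the rest of the series; the upstream helper additionally
 * keeps the inode cached so the sickness report is not lost on eviction.
 */
void
xfs_inode_mark_sick(
	struct xfs_inode	*ip,
	unsigned int		mask)
{
	trace_xfs_inode_mark_sick(ip, mask);

	spin_lock(&ip->i_flags_lock);
	ip->i_sick |= mask;	/* this metadata is known to be damaged */
	ip->i_checked |= mask;	/* and its state has been determined */
	spin_unlock(&ip->i_flags_lock);
}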
@@ -415,6 +416,9 @@ xfs_iget_check_free_state( xfs_warn(ip->i_mount, "Corruption detected! Free inode 0x%llx not marked free! (mode 0x%x)", ip->i_ino, VFS_I(ip)->i_mode); + xfs_agno_mark_sick(ip->i_mount, + XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), + XFS_SICK_AG_INOBT); return -EFSCORRUPTED; } @@ -422,6 +426,9 @@ xfs_iget_check_free_state( xfs_warn(ip->i_mount, "Corruption detected! Free inode 0x%llx has blocks allocated!", ip->i_ino); + xfs_agno_mark_sick(ip->i_mount, + XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), + XFS_SICK_AG_INOBT); return -EFSCORRUPTED; } return 0; @@ -640,6 +647,8 @@ xfs_iget_cache_miss( xfs_buf_offset(bp, ip->i_imap.im_boffset)); if (!error) xfs_buf_set_ref(bp, XFS_INO_REF); + else + xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE); xfs_trans_brelse(tp, bp); if (error) diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index ab2d891b8d14..5d0ac5c733f3 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -3422,6 +3422,8 @@ flush_out: /* generate the checksum. */ xfs_dinode_calc_crc(mp, dip); + if (error) + xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE); return error; } -- cgit v1.2.3 From 841a5f87e2d08c60836859da69575b36caadb37d Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 22 Feb 2024 12:32:44 -0800 Subject: xfs: report quota block corruption errors to the health system Whenever we encounter corrupt quota blocks, we should report that to the health monitoring system for later reporting. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_dquot.c | 30 ++++++++++++++++++++++++++++++ fs/xfs/xfs_health.c | 1 + fs/xfs/xfs_qm.c | 8 ++++++-- 3 files changed, 37 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 308b5f33b712..30d36596a2e4 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -24,6 +24,7 @@ #include "xfs_log.h" #include "xfs_bmap_btree.h" #include "xfs_error.h" +#include "xfs_health.h" /* * Lock order: @@ -44,6 +45,29 @@ static struct kmem_cache *xfs_dquot_cache; static struct lock_class_key xfs_dquot_group_class; static struct lock_class_key xfs_dquot_project_class; +/* Record observations of quota corruption with the health tracking system. 
*/ +static void +xfs_dquot_mark_sick( + struct xfs_dquot *dqp) +{ + struct xfs_mount *mp = dqp->q_mount; + + switch (dqp->q_type) { + case XFS_DQTYPE_USER: + xfs_fs_mark_sick(mp, XFS_SICK_FS_UQUOTA); + break; + case XFS_DQTYPE_GROUP: + xfs_fs_mark_sick(mp, XFS_SICK_FS_GQUOTA); + break; + case XFS_DQTYPE_PROJ: + xfs_fs_mark_sick(mp, XFS_SICK_FS_PQUOTA); + break; + default: + ASSERT(0); + break; + } +} + /* * This is called to free all the memory associated with a dquot */ @@ -451,6 +475,8 @@ xfs_dquot_disk_read( error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno, mp->m_quotainfo->qi_dqchunklen, 0, &bp, &xfs_dquot_buf_ops); + if (xfs_metadata_is_sick(error)) + xfs_dquot_mark_sick(dqp); if (error) { ASSERT(bp == NULL); return error; } @@ -574,6 +600,7 @@ xfs_dquot_from_disk( "Metadata corruption detected at %pS, quota %u", __this_address, dqp->q_id); xfs_alert(bp->b_mount, "Unmount and run xfs_repair"); + xfs_dquot_mark_sick(dqp); return -EFSCORRUPTED; } @@ -1238,6 +1265,8 @@ xfs_qm_dqflush( &bp, &xfs_dquot_buf_ops); if (error == -EAGAIN) goto out_unlock; + if (xfs_metadata_is_sick(error)) + xfs_dquot_mark_sick(dqp); if (error) goto out_abort; @@ -1246,6 +1275,7 @@ xfs_qm_dqflush( xfs_alert(mp, "corrupt dquot ID 0x%x in memory at %pS", dqp->q_id, fa); xfs_buf_relse(bp); + xfs_dquot_mark_sick(dqp); error = -EFSCORRUPTED; goto out_abort; } diff --git a/fs/xfs/xfs_health.c b/fs/xfs/xfs_health.c index 568300ae8555..bbdc621e5ee5 100644 --- a/fs/xfs/xfs_health.c +++ b/fs/xfs/xfs_health.c @@ -17,6 +17,7 @@ #include "xfs_btree.h" #include "xfs_da_format.h" #include "xfs_da_btree.h" +#include "xfs_quota_defs.h" /* * Warn about metadata corruption that we detected but haven't fixed, and diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 991be1eacb6c..0f4cf4170c35 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -763,14 +763,18 @@ xfs_qm_qino_alloc( (mp->m_sb.sb_gquotino != NULLFSINO)) { ino = mp->m_sb.sb_gquotino; if (XFS_IS_CORRUPT(mp, - mp->m_sb.sb_pquotino != NULLFSINO)) + mp->m_sb.sb_pquotino != NULLFSINO)) { + xfs_fs_mark_sick(mp, XFS_SICK_FS_PQUOTA); return -EFSCORRUPTED; + } } else if ((flags & XFS_QMOPT_GQUOTA) && (mp->m_sb.sb_pquotino != NULLFSINO)) { ino = mp->m_sb.sb_pquotino; if (XFS_IS_CORRUPT(mp, - mp->m_sb.sb_gquotino != NULLFSINO)) + mp->m_sb.sb_gquotino != NULLFSINO)) { + xfs_fs_mark_sick(mp, XFS_SICK_FS_GQUOTA); return -EFSCORRUPTED; + } } if (ino != NULLFSINO) { error = xfs_iget(mp, NULL, ino, 0, 0, ipp); -- cgit v1.2.3 From 8368ad49aaf771a6283840140149440b958b20fb Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 22 Feb 2024 12:32:44 -0800 Subject: xfs: report realtime metadata corruption errors to the health system Whenever we encounter corrupt realtime metadata blocks, we should report that to the health monitoring system for later reporting. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_rtbitmap.c | 9 ++++++++- fs/xfs/xfs_rtalloc.c | 6 ++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index 1f6e05fc359b..f246d6dbf4ec 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -17,6 +17,7 @@ #include "xfs_rtalloc.h" #include "xfs_error.h" #include "xfs_rtbitmap.h" +#include "xfs_health.h" /* * Realtime allocator bitmap functions shared with userspace.
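/*
 * Likewise, xfs_rt_mark_sick() predates this patch and is not shown here.
 * Realtime metadata health is tracked per-mount rather than per-AG or
 * per-inode, so the assumed shape is simply a mask update under the
 * superblock lock:
 */
void
xfs_rt_mark_sick(
	struct xfs_mount	*mp,
	unsigned int		mask)
{
	trace_xfs_rt_mark_sick(mp, mask);

	spin_lock(&mp->m_sb_lock);
	mp->m_rt_sick |= mask;
	spin_unlock(&mp->m_sb_lock);
}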
@@ -115,13 +116,19 @@ xfs_rtbuf_get( if (error) return error; - if (XFS_IS_CORRUPT(mp, nmap == 0 || !xfs_bmap_is_written_extent(&map))) + if (XFS_IS_CORRUPT(mp, nmap == 0 || !xfs_bmap_is_written_extent(&map))) { + xfs_rt_mark_sick(mp, issum ? XFS_SICK_RT_SUMMARY : + XFS_SICK_RT_BITMAP); return -EFSCORRUPTED; + } ASSERT(map.br_startblock != NULLFSBLOCK); error = xfs_trans_read_buf(mp, args->tp, mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, map.br_startblock), mp->m_bsize, 0, &bp, &xfs_rtbuf_ops); + if (xfs_metadata_is_sick(error)) + xfs_rt_mark_sick(mp, issum ? XFS_SICK_RT_SUMMARY : + XFS_SICK_RT_BITMAP); if (error) return error; diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index b62b5c34413e..e66f9bd5de5c 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -22,6 +22,8 @@ #include "xfs_sb.h" #include "xfs_rtbitmap.h" #include "xfs_quota.h" +#include "xfs_log_priv.h" +#include "xfs_health.h" /* * Return whether there are any free extents in the size range given @@ -1202,6 +1204,8 @@ xfs_rtmount_inodes( sbp = &mp->m_sb; error = xfs_iget(mp, NULL, sbp->sb_rbmino, 0, 0, &mp->m_rbmip); + if (xfs_metadata_is_sick(error)) + xfs_rt_mark_sick(mp, XFS_SICK_RT_BITMAP); if (error) return error; ASSERT(mp->m_rbmip != NULL); @@ -1211,6 +1215,8 @@ xfs_rtmount_inodes( goto out_rele_bitmap; error = xfs_iget(mp, NULL, sbp->sb_rsumino, 0, 0, &mp->m_rsumip); + if (xfs_metadata_is_sick(error)) + xfs_rt_mark_sick(mp, XFS_SICK_RT_SUMMARY); if (error) goto out_rele_bitmap; ASSERT(mp->m_rsumip != NULL); -- cgit v1.2.3 From 989d5ec3175be7c0012d7744c667ae6a266fab06 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 22 Feb 2024 12:32:55 -0800 Subject: xfs: report XFS_IS_CORRUPT errors to the health system Whenever we encounter XFS_IS_CORRUPT failures, we should report that to the health monitoring system for later reporting. I started with this semantic patch and massaged everything until it built: @@ expression mp, test; @@ - if (XFS_IS_CORRUPT(mp, test)) return -EFSCORRUPTED; + if (XFS_IS_CORRUPT(mp, test)) { xfs_btree_mark_sick(cur); return -EFSCORRUPTED; } @@ expression mp, test; identifier label, error; @@ - if (XFS_IS_CORRUPT(mp, test)) { error = -EFSCORRUPTED; goto label; } + if (XFS_IS_CORRUPT(mp, test)) { xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto label; } Signed-off-by: Darrick J. 
Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_ag.c | 4 +- fs/xfs/libxfs/xfs_alloc.c | 97 ++++++++++++++++++++++++++++++++++------- fs/xfs/libxfs/xfs_attr_remote.c | 8 +++- fs/xfs/libxfs/xfs_bmap.c | 94 ++++++++++++++++++++++++++++++++++----- fs/xfs/libxfs/xfs_btree.c | 14 +++++- fs/xfs/libxfs/xfs_ialloc.c | 52 ++++++++++++++++++---- fs/xfs/libxfs/xfs_refcount.c | 37 +++++++++++++++- fs/xfs/libxfs/xfs_rmap.c | 77 ++++++++++++++++++++++++++++++-- fs/xfs/scrub/refcount_repair.c | 9 +++- fs/xfs/xfs_attr_list.c | 9 +++- fs/xfs/xfs_dir2_readdir.c | 6 ++- fs/xfs/xfs_discard.c | 2 + fs/xfs/xfs_iwalk.c | 5 ++- 13 files changed, 364 insertions(+), 50 deletions(-) diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index d23800c45292..fadd00011237 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -951,8 +951,10 @@ xfs_ag_shrink_space( agf = agfbp->b_addr; aglen = be32_to_cpu(agi->agi_length); /* some extra paranoid checks before we shrink the ag */ - if (XFS_IS_CORRUPT(mp, agf->agf_length != agi->agi_length)) + if (XFS_IS_CORRUPT(mp, agf->agf_length != agi->agi_length)) { + xfs_ag_mark_sick(pag, XFS_SICK_AG_AGF); return -EFSCORRUPTED; + } if (delta >= aglen) return -EINVAL; diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 2464b64b1cb4..ac31b62e7017 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -499,14 +499,18 @@ xfs_alloc_fixup_trees( if (XFS_IS_CORRUPT(mp, i != 1 || nfbno1 != fbno || - nflen1 != flen)) + nflen1 != flen)) { + xfs_btree_mark_sick(cnt_cur); return -EFSCORRUPTED; + } #endif } else { if ((error = xfs_alloc_lookup_eq(cnt_cur, fbno, flen, &i))) return error; - if (XFS_IS_CORRUPT(mp, i != 1)) + if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cnt_cur); return -EFSCORRUPTED; + } } /* * Look up the record in the by-block tree if necessary. @@ -518,14 +522,18 @@ xfs_alloc_fixup_trees( if (XFS_IS_CORRUPT(mp, i != 1 || nfbno1 != fbno || - nflen1 != flen)) + nflen1 != flen)) { + xfs_btree_mark_sick(bno_cur); return -EFSCORRUPTED; + } #endif } else { if ((error = xfs_alloc_lookup_eq(bno_cur, fbno, flen, &i))) return error; - if (XFS_IS_CORRUPT(mp, i != 1)) + if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(bno_cur); return -EFSCORRUPTED; + } } #ifdef DEBUG @@ -538,8 +546,10 @@ xfs_alloc_fixup_trees( if (XFS_IS_CORRUPT(mp, bnoblock->bb_numrecs != - cntblock->bb_numrecs)) + cntblock->bb_numrecs)) { + xfs_btree_mark_sick(bno_cur); return -EFSCORRUPTED; + } } #endif @@ -569,30 +579,40 @@ xfs_alloc_fixup_trees( */ if ((error = xfs_btree_delete(cnt_cur, &i))) return error; - if (XFS_IS_CORRUPT(mp, i != 1)) + if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cnt_cur); return -EFSCORRUPTED; + } /* * Add new by-size btree entry(s). 
*/ if (nfbno1 != NULLAGBLOCK) { if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno1, nflen1, &i))) return error; - if (XFS_IS_CORRUPT(mp, i != 0)) + if (XFS_IS_CORRUPT(mp, i != 0)) { + xfs_btree_mark_sick(cnt_cur); return -EFSCORRUPTED; + } if ((error = xfs_btree_insert(cnt_cur, &i))) return error; - if (XFS_IS_CORRUPT(mp, i != 1)) + if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cnt_cur); return -EFSCORRUPTED; + } } if (nfbno2 != NULLAGBLOCK) { if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno2, nflen2, &i))) return error; - if (XFS_IS_CORRUPT(mp, i != 0)) + if (XFS_IS_CORRUPT(mp, i != 0)) { + xfs_btree_mark_sick(cnt_cur); return -EFSCORRUPTED; + } if ((error = xfs_btree_insert(cnt_cur, &i))) return error; - if (XFS_IS_CORRUPT(mp, i != 1)) + if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cnt_cur); return -EFSCORRUPTED; + } } /* * Fix up the by-block btree entry(s). @@ -603,8 +623,10 @@ xfs_alloc_fixup_trees( */ if ((error = xfs_btree_delete(bno_cur, &i))) return error; - if (XFS_IS_CORRUPT(mp, i != 1)) + if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(bno_cur); return -EFSCORRUPTED; + } } else { /* * Update the by-block entry to start later|be shorter. @@ -618,12 +640,16 @@ xfs_alloc_fixup_trees( */ if ((error = xfs_alloc_lookup_eq(bno_cur, nfbno2, nflen2, &i))) return error; - if (XFS_IS_CORRUPT(mp, i != 0)) + if (XFS_IS_CORRUPT(mp, i != 0)) { + xfs_btree_mark_sick(bno_cur); return -EFSCORRUPTED; + } if ((error = xfs_btree_insert(bno_cur, &i))) return error; - if (XFS_IS_CORRUPT(mp, i != 1)) + if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(bno_cur); return -EFSCORRUPTED; + } } return 0; } @@ -896,8 +922,10 @@ xfs_alloc_cur_check( error = xfs_alloc_get_rec(cur, &bno, &len, &i); if (error) return error; - if (XFS_IS_CORRUPT(args->mp, i != 1)) + if (XFS_IS_CORRUPT(args->mp, i != 1)) { + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; + } /* * Check minlen and deactivate a cntbt cursor if out of acceptable size @@ -1103,6 +1131,7 @@ xfs_alloc_ag_vextent_small( if (error) goto error; if (XFS_IS_CORRUPT(args->mp, i != 1)) { + xfs_btree_mark_sick(ccur); error = -EFSCORRUPTED; goto error; } @@ -1137,6 +1166,7 @@ xfs_alloc_ag_vextent_small( *fbnop = args->agbno = fbno; *flenp = args->len = 1; if (XFS_IS_CORRUPT(args->mp, fbno >= be32_to_cpu(agf->agf_length))) { + xfs_btree_mark_sick(ccur); error = -EFSCORRUPTED; goto error; } @@ -1223,6 +1253,7 @@ xfs_alloc_ag_vextent_exact( if (error) goto error0; if (XFS_IS_CORRUPT(args->mp, i != 1)) { + xfs_btree_mark_sick(bno_cur); error = -EFSCORRUPTED; goto error0; } @@ -1502,8 +1533,10 @@ xfs_alloc_ag_vextent_lastblock( error = xfs_alloc_get_rec(acur->cnt, bno, len, &i); if (error) return error; - if (XFS_IS_CORRUPT(args->mp, i != 1)) + if (XFS_IS_CORRUPT(args->mp, i != 1)) { + xfs_btree_mark_sick(acur->cnt); return -EFSCORRUPTED; + } if (*len >= args->minlen) break; error = xfs_btree_increment(acur->cnt, 0, &i); @@ -1715,6 +1748,7 @@ restart: if (error) goto error0; if (XFS_IS_CORRUPT(args->mp, i != 1)) { + xfs_btree_mark_sick(cnt_cur); error = -EFSCORRUPTED; goto error0; } @@ -1761,6 +1795,7 @@ restart: rlen != 0 && (rlen > flen || rbno + rlen > fbno + flen))) { + xfs_btree_mark_sick(cnt_cur); error = -EFSCORRUPTED; goto error0; } @@ -1783,6 +1818,7 @@ restart: &i))) goto error0; if (XFS_IS_CORRUPT(args->mp, i != 1)) { + xfs_btree_mark_sick(cnt_cur); error = -EFSCORRUPTED; goto error0; } @@ -1795,6 +1831,7 @@ restart: rlen != 0 && (rlen > flen || rbno + rlen > fbno + flen))) { + xfs_btree_mark_sick(cnt_cur); error = 
-EFSCORRUPTED; goto error0; } @@ -1811,6 +1848,7 @@ restart: &i))) goto error0; if (XFS_IS_CORRUPT(args->mp, i != 1)) { + xfs_btree_mark_sick(cnt_cur); error = -EFSCORRUPTED; goto error0; } @@ -1849,6 +1887,7 @@ restart: rlen = args->len; if (XFS_IS_CORRUPT(args->mp, rlen > flen)) { + xfs_btree_mark_sick(cnt_cur); error = -EFSCORRUPTED; goto error0; } @@ -1868,6 +1907,7 @@ restart: if (XFS_IS_CORRUPT(args->mp, args->agbno + args->len > be32_to_cpu(agf->agf_length))) { + xfs_ag_mark_sick(args->pag, XFS_SICK_AG_BNOBT); error = -EFSCORRUPTED; goto error0; } @@ -1943,6 +1983,7 @@ xfs_free_ag_extent( if ((error = xfs_alloc_get_rec(bno_cur, <bno, <len, &i))) goto error0; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(bno_cur); error = -EFSCORRUPTED; goto error0; } @@ -1958,6 +1999,7 @@ xfs_free_ag_extent( * Very bad. */ if (XFS_IS_CORRUPT(mp, ltbno + ltlen > bno)) { + xfs_btree_mark_sick(bno_cur); error = -EFSCORRUPTED; goto error0; } @@ -1976,6 +2018,7 @@ xfs_free_ag_extent( if ((error = xfs_alloc_get_rec(bno_cur, >bno, >len, &i))) goto error0; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(bno_cur); error = -EFSCORRUPTED; goto error0; } @@ -1991,6 +2034,7 @@ xfs_free_ag_extent( * Very bad. */ if (XFS_IS_CORRUPT(mp, bno + len > gtbno)) { + xfs_btree_mark_sick(bno_cur); error = -EFSCORRUPTED; goto error0; } @@ -2011,12 +2055,14 @@ xfs_free_ag_extent( if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i))) goto error0; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cnt_cur); error = -EFSCORRUPTED; goto error0; } if ((error = xfs_btree_delete(cnt_cur, &i))) goto error0; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cnt_cur); error = -EFSCORRUPTED; goto error0; } @@ -2026,12 +2072,14 @@ xfs_free_ag_extent( if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i))) goto error0; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cnt_cur); error = -EFSCORRUPTED; goto error0; } if ((error = xfs_btree_delete(cnt_cur, &i))) goto error0; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cnt_cur); error = -EFSCORRUPTED; goto error0; } @@ -2041,6 +2089,7 @@ xfs_free_ag_extent( if ((error = xfs_btree_delete(bno_cur, &i))) goto error0; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(bno_cur); error = -EFSCORRUPTED; goto error0; } @@ -2050,6 +2099,7 @@ xfs_free_ag_extent( if ((error = xfs_btree_decrement(bno_cur, 0, &i))) goto error0; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(bno_cur); error = -EFSCORRUPTED; goto error0; } @@ -2069,6 +2119,7 @@ xfs_free_ag_extent( i != 1 || xxbno != ltbno || xxlen != ltlen)) { + xfs_btree_mark_sick(bno_cur); error = -EFSCORRUPTED; goto error0; } @@ -2093,12 +2144,14 @@ xfs_free_ag_extent( if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i))) goto error0; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cnt_cur); error = -EFSCORRUPTED; goto error0; } if ((error = xfs_btree_delete(cnt_cur, &i))) goto error0; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cnt_cur); error = -EFSCORRUPTED; goto error0; } @@ -2109,6 +2162,7 @@ xfs_free_ag_extent( if ((error = xfs_btree_decrement(bno_cur, 0, &i))) goto error0; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(bno_cur); error = -EFSCORRUPTED; goto error0; } @@ -2128,12 +2182,14 @@ xfs_free_ag_extent( if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i))) goto error0; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cnt_cur); error = -EFSCORRUPTED; goto error0; } if ((error = xfs_btree_delete(cnt_cur, &i))) goto 
error0; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cnt_cur); error = -EFSCORRUPTED; goto error0; } @@ -2156,6 +2212,7 @@ xfs_free_ag_extent( if ((error = xfs_btree_insert(bno_cur, &i))) goto error0; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(bno_cur); error = -EFSCORRUPTED; goto error0; } @@ -2168,12 +2225,14 @@ xfs_free_ag_extent( if ((error = xfs_alloc_lookup_eq(cnt_cur, nbno, nlen, &i))) goto error0; if (XFS_IS_CORRUPT(mp, i != 0)) { + xfs_btree_mark_sick(cnt_cur); error = -EFSCORRUPTED; goto error0; } if ((error = xfs_btree_insert(cnt_cur, &i))) goto error0; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cnt_cur); error = -EFSCORRUPTED; goto error0; } @@ -3903,17 +3962,23 @@ __xfs_free_extent( return -EIO; error = xfs_free_extent_fix_freelist(tp, pag, &agbp); - if (error) + if (error) { + if (xfs_metadata_is_sick(error)) + xfs_ag_mark_sick(pag, XFS_SICK_AG_BNOBT); return error; + } + agf = agbp->b_addr; if (XFS_IS_CORRUPT(mp, agbno >= mp->m_sb.sb_agblocks)) { + xfs_ag_mark_sick(pag, XFS_SICK_AG_BNOBT); error = -EFSCORRUPTED; goto err_release; } /* validate the extent size is legal now we have the agf locked */ if (XFS_IS_CORRUPT(mp, agbno + len > be32_to_cpu(agf->agf_length))) { + xfs_ag_mark_sick(pag, XFS_SICK_AG_BNOBT); error = -EFSCORRUPTED; goto err_release; } diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c index 43d66fbbdf6f..ff0412828772 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.c +++ b/fs/xfs/libxfs/xfs_attr_remote.c @@ -553,8 +553,10 @@ xfs_attr_rmtval_stale( xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); if (XFS_IS_CORRUPT(mp, map->br_startblock == DELAYSTARTBLOCK) || - XFS_IS_CORRUPT(mp, map->br_startblock == HOLESTARTBLOCK)) + XFS_IS_CORRUPT(mp, map->br_startblock == HOLESTARTBLOCK)) { + xfs_bmap_mark_sick(ip, XFS_ATTR_FORK); return -EFSCORRUPTED; + } error = xfs_buf_incore(mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, map->br_startblock), @@ -664,8 +666,10 @@ xfs_attr_rmtval_invalidate( blkcnt, &map, &nmap, XFS_BMAPI_ATTRFORK); if (error) return error; - if (XFS_IS_CORRUPT(args->dp->i_mount, nmap != 1)) + if (XFS_IS_CORRUPT(args->dp->i_mount, nmap != 1)) { + xfs_bmap_mark_sick(args->dp, XFS_ATTR_FORK); return -EFSCORRUPTED; + } error = xfs_attr_rmtval_stale(args->dp, &map, XBF_TRYLOCK); if (error) return error; diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 78d2d3393a86..314a2144622a 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -386,6 +386,7 @@ xfs_bmap_check_leaf_extents( pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]); bno = be64_to_cpu(*pp); if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbno(mp, bno))) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } @@ -567,8 +568,10 @@ xfs_bmap_btree_to_extents( pp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, ifp->if_broot_bytes); cbno = be64_to_cpu(*pp); #ifdef DEBUG - if (XFS_IS_CORRUPT(cur->bc_mp, !xfs_btree_check_lptr(cur, cbno, 1))) + if (XFS_IS_CORRUPT(cur->bc_mp, !xfs_btree_check_lptr(cur, cbno, 1))) { + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; + } #endif error = xfs_btree_read_bufl(mp, tp, cbno, &cbp, XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops); @@ -885,6 +888,7 @@ xfs_bmap_add_attrfork_btree( goto error0; /* must be at least one entry */ if (XFS_IS_CORRUPT(mp, stat != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } @@ -1211,6 +1215,7 @@ xfs_iread_extents( goto out; if (XFS_IS_CORRUPT(mp, ir.loaded != ifp->if_nextents)) { + xfs_bmap_mark_sick(ip, whichfork); error = -EFSCORRUPTED; goto 
out; } @@ -1401,8 +1406,10 @@ xfs_bmap_last_offset( if (ifp->if_format == XFS_DINODE_FMT_LOCAL) return 0; - if (XFS_IS_CORRUPT(ip->i_mount, !xfs_ifork_has_extents(ifp))) + if (XFS_IS_CORRUPT(ip->i_mount, !xfs_ifork_has_extents(ifp))) { + xfs_bmap_mark_sick(ip, whichfork); return -EFSCORRUPTED; + } error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, &is_empty); if (error || is_empty) @@ -1541,6 +1548,7 @@ xfs_bmap_add_extent_delay_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(bma->cur); error = -EFSCORRUPTED; goto done; } @@ -1548,6 +1556,7 @@ xfs_bmap_add_extent_delay_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(bma->cur); error = -EFSCORRUPTED; goto done; } @@ -1555,6 +1564,7 @@ xfs_bmap_add_extent_delay_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(bma->cur); error = -EFSCORRUPTED; goto done; } @@ -1584,6 +1594,7 @@ xfs_bmap_add_extent_delay_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(bma->cur); error = -EFSCORRUPTED; goto done; } @@ -1617,6 +1628,7 @@ xfs_bmap_add_extent_delay_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(bma->cur); error = -EFSCORRUPTED; goto done; } @@ -1645,6 +1657,7 @@ xfs_bmap_add_extent_delay_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 0)) { + xfs_btree_mark_sick(bma->cur); error = -EFSCORRUPTED; goto done; } @@ -1652,6 +1665,7 @@ xfs_bmap_add_extent_delay_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(bma->cur); error = -EFSCORRUPTED; goto done; } @@ -1686,6 +1700,7 @@ xfs_bmap_add_extent_delay_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(bma->cur); error = -EFSCORRUPTED; goto done; } @@ -1711,6 +1726,7 @@ xfs_bmap_add_extent_delay_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 0)) { + xfs_btree_mark_sick(bma->cur); error = -EFSCORRUPTED; goto done; } @@ -1718,6 +1734,7 @@ xfs_bmap_add_extent_delay_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(bma->cur); error = -EFSCORRUPTED; goto done; } @@ -1762,6 +1779,7 @@ xfs_bmap_add_extent_delay_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(bma->cur); error = -EFSCORRUPTED; goto done; } @@ -1798,6 +1816,7 @@ xfs_bmap_add_extent_delay_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 0)) { + xfs_btree_mark_sick(bma->cur); error = -EFSCORRUPTED; goto done; } @@ -1805,6 +1824,7 @@ xfs_bmap_add_extent_delay_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(bma->cur); error = -EFSCORRUPTED; goto done; } @@ -1884,6 +1904,7 @@ xfs_bmap_add_extent_delay_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 0)) { + xfs_btree_mark_sick(bma->cur); error = -EFSCORRUPTED; goto done; } @@ -1891,6 +1912,7 @@ xfs_bmap_add_extent_delay_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(bma->cur); error = -EFSCORRUPTED; goto done; } @@ -2087,30 +2109,35 @@ xfs_bmap_add_extent_unwritten_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } if ((error = xfs_btree_delete(cur, &i))) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } if ((error = xfs_btree_decrement(cur, 0, &i))) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto 
done; } if ((error = xfs_btree_delete(cur, &i))) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } if ((error = xfs_btree_decrement(cur, 0, &i))) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -2139,18 +2166,21 @@ xfs_bmap_add_extent_unwritten_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } if ((error = xfs_btree_delete(cur, &i))) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } if ((error = xfs_btree_decrement(cur, 0, &i))) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -2182,18 +2212,21 @@ xfs_bmap_add_extent_unwritten_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } if ((error = xfs_btree_delete(cur, &i))) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } if ((error = xfs_btree_decrement(cur, 0, &i))) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -2220,6 +2253,7 @@ xfs_bmap_add_extent_unwritten_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -2253,6 +2287,7 @@ xfs_bmap_add_extent_unwritten_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -2290,6 +2325,7 @@ xfs_bmap_add_extent_unwritten_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -2300,6 +2336,7 @@ xfs_bmap_add_extent_unwritten_real( if ((error = xfs_btree_insert(cur, &i))) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -2330,6 +2367,7 @@ xfs_bmap_add_extent_unwritten_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -2366,6 +2404,7 @@ xfs_bmap_add_extent_unwritten_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -2376,12 +2415,14 @@ xfs_bmap_add_extent_unwritten_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 0)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } if ((error = xfs_btree_insert(cur, &i))) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -2418,6 +2459,7 @@ xfs_bmap_add_extent_unwritten_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -2430,6 +2472,7 @@ xfs_bmap_add_extent_unwritten_real( if ((error = xfs_btree_insert(cur, &i))) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -2442,6 +2485,7 @@ xfs_bmap_add_extent_unwritten_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 0)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -2449,6 +2493,7 @@ xfs_bmap_add_extent_unwritten_real( if ((error = xfs_btree_insert(cur, &i))) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -2734,6 +2779,7 @@ xfs_bmap_add_extent_hole_real( if (error) goto 
done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -2741,6 +2787,7 @@ xfs_bmap_add_extent_hole_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -2748,6 +2795,7 @@ xfs_bmap_add_extent_hole_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -2777,6 +2825,7 @@ xfs_bmap_add_extent_hole_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -2807,6 +2856,7 @@ xfs_bmap_add_extent_hole_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -2833,6 +2883,7 @@ xfs_bmap_add_extent_hole_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 0)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -2840,6 +2891,7 @@ xfs_bmap_add_extent_hole_real( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -5109,8 +5161,10 @@ xfs_bmap_del_extent_real( error = xfs_bmbt_lookup_eq(cur, &got, &i); if (error) return error; - if (XFS_IS_CORRUPT(mp, i != 1)) + if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; + } } if (got.br_startoff == del->br_startoff) @@ -5134,8 +5188,10 @@ xfs_bmap_del_extent_real( } if ((error = xfs_btree_delete(cur, &i))) return error; - if (XFS_IS_CORRUPT(mp, i != 1)) + if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; + } break; case BMAP_LEFT_FILLING: /* @@ -5207,8 +5263,10 @@ xfs_bmap_del_extent_real( error = xfs_bmbt_lookup_eq(cur, &got, &i); if (error) return error; - if (XFS_IS_CORRUPT(mp, i != 1)) + if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; + } /* * Update the btree record back * to the original value. 
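Every hunk in this run applies the same mechanical transformation to the corruption checks in xfs_bmap.c and xfs_btree.c. As a condensed before/after sketch of the pattern (mp, cur, i, and the done label stand in for whatever the enclosing function provides):

	/* Before: corruption is reported, but no health state is recorded. */
	if (XFS_IS_CORRUPT(mp, i != 1)) {
		error = -EFSCORRUPTED;
		goto done;
	}

	/*
	 * After: the btree under the cursor is also marked sick, so the
	 * health tracking system remembers the corruption after the error
	 * has propagated up the call chain.
	 */
	if (XFS_IS_CORRUPT(mp, i != 1)) {
		xfs_btree_mark_sick(cur);
		error = -EFSCORRUPTED;
		goto done;
	}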
@@ -5224,8 +5282,10 @@ xfs_bmap_del_extent_real( *logflagsp = 0; return -ENOSPC; } - if (XFS_IS_CORRUPT(mp, i != 1)) + if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; + } } else *logflagsp |= xfs_ilog_fext(whichfork); @@ -5679,21 +5739,27 @@ xfs_bmse_merge( error = xfs_bmbt_lookup_eq(cur, got, &i); if (error) return error; - if (XFS_IS_CORRUPT(mp, i != 1)) + if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; + } error = xfs_btree_delete(cur, &i); if (error) return error; - if (XFS_IS_CORRUPT(mp, i != 1)) + if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; + } /* lookup and update size of the previous extent */ error = xfs_bmbt_lookup_eq(cur, left, &i); if (error) return error; - if (XFS_IS_CORRUPT(mp, i != 1)) + if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; + } error = xfs_bmbt_update(cur, &new); if (error) @@ -5741,8 +5807,10 @@ xfs_bmap_shift_update_extent( error = xfs_bmbt_lookup_eq(cur, &prev, &i); if (error) return error; - if (XFS_IS_CORRUPT(mp, i != 1)) + if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; + } error = xfs_bmbt_update(cur, got); if (error) @@ -5803,6 +5871,7 @@ xfs_bmap_collapse_extents( goto del_cursor; } if (XFS_IS_CORRUPT(mp, isnullstartblock(got.br_startblock))) { + xfs_bmap_mark_sick(ip, whichfork); error = -EFSCORRUPTED; goto del_cursor; } @@ -5928,11 +5997,13 @@ xfs_bmap_insert_extents( } } if (XFS_IS_CORRUPT(mp, isnullstartblock(got.br_startblock))) { + xfs_bmap_mark_sick(ip, whichfork); error = -EFSCORRUPTED; goto del_cursor; } if (XFS_IS_CORRUPT(mp, stop_fsb > got.br_startoff)) { + xfs_bmap_mark_sick(ip, whichfork); error = -EFSCORRUPTED; goto del_cursor; } @@ -6032,6 +6103,7 @@ xfs_bmap_split_extent( if (error) goto del_cursor; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto del_cursor; } @@ -6059,6 +6131,7 @@ xfs_bmap_split_extent( if (error) goto del_cursor; if (XFS_IS_CORRUPT(mp, i != 0)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto del_cursor; } @@ -6066,6 +6139,7 @@ xfs_bmap_split_extent( if (error) goto del_cursor; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto del_cursor; } diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index a4784e56b589..70d406dbb3e4 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -2026,8 +2026,10 @@ xfs_btree_lookup( error = xfs_btree_increment(cur, 0, &i); if (error) goto error0; - if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) + if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; + } *stat = 1; return 0; } @@ -2480,6 +2482,7 @@ xfs_btree_lshift( goto error0; i = xfs_btree_firstrec(tcur, level); if (XFS_IS_CORRUPT(tcur->bc_mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } @@ -2650,6 +2653,7 @@ xfs_btree_rshift( goto error0; i = xfs_btree_lastrec(tcur, level); if (XFS_IS_CORRUPT(tcur->bc_mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } @@ -3538,6 +3542,7 @@ xfs_btree_insert( } if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } @@ -3945,6 +3950,7 @@ xfs_btree_delrec( */ i = xfs_btree_lastrec(tcur, level); if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } @@ -3953,12 +3959,14 @@ xfs_btree_delrec( if (error) goto 
error0; if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } i = xfs_btree_lastrec(tcur, level); if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } @@ -4006,6 +4014,7 @@ xfs_btree_delrec( if (!xfs_btree_ptr_is_null(cur, &lptr)) { i = xfs_btree_firstrec(tcur, level); if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } @@ -4014,6 +4023,7 @@ xfs_btree_delrec( if (error) goto error0; if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } @@ -4031,6 +4041,7 @@ xfs_btree_delrec( */ i = xfs_btree_firstrec(tcur, level); if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } @@ -4040,6 +4051,7 @@ xfs_btree_delrec( goto error0; i = xfs_btree_firstrec(tcur, level); if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 56f82b8af07e..1ff867075026 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -573,6 +573,7 @@ xfs_inobt_insert_sprec( if (error) goto error; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error; } @@ -589,10 +590,12 @@ xfs_inobt_insert_sprec( if (error) goto error; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error; } if (XFS_IS_CORRUPT(mp, rec.ir_startino != nrec->ir_startino)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error; } @@ -602,6 +605,7 @@ xfs_inobt_insert_sprec( * cannot merge, something is seriously wrong. */ if (XFS_IS_CORRUPT(mp, !__xfs_inobt_can_merge(nrec, &rec))) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error; } @@ -951,8 +955,10 @@ xfs_ialloc_next_rec( error = xfs_inobt_get_rec(cur, rec, &i); if (error) return error; - if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) + if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; + } } return 0; @@ -976,8 +982,10 @@ xfs_ialloc_get_rec( error = xfs_inobt_get_rec(cur, rec, &i); if (error) return error; - if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) + if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; + } } return 0; @@ -1055,6 +1063,7 @@ xfs_dialloc_ag_inobt( if (error) goto error0; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } @@ -1063,6 +1072,7 @@ xfs_dialloc_ag_inobt( if (error) goto error0; if (XFS_IS_CORRUPT(mp, j != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } @@ -1221,6 +1231,7 @@ xfs_dialloc_ag_inobt( if (error) goto error0; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } @@ -1230,6 +1241,7 @@ xfs_dialloc_ag_inobt( if (error) goto error0; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } @@ -1239,6 +1251,7 @@ xfs_dialloc_ag_inobt( if (error) goto error0; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } @@ -1299,8 +1312,10 @@ xfs_dialloc_ag_finobt_near( error = xfs_inobt_get_rec(lcur, rec, &i); if (error) return error; - if (XFS_IS_CORRUPT(lcur->bc_mp, i != 1)) + if (XFS_IS_CORRUPT(lcur->bc_mp, i != 1)) { + xfs_btree_mark_sick(lcur); return -EFSCORRUPTED; + } /* * See if we've landed in the 
parent inode record. The finobt @@ -1324,12 +1339,14 @@ xfs_dialloc_ag_finobt_near( if (error) goto error_rcur; if (XFS_IS_CORRUPT(lcur->bc_mp, j != 1)) { + xfs_btree_mark_sick(lcur); error = -EFSCORRUPTED; goto error_rcur; } } if (XFS_IS_CORRUPT(lcur->bc_mp, i != 1 && j != 1)) { + xfs_btree_mark_sick(lcur); error = -EFSCORRUPTED; goto error_rcur; } @@ -1385,8 +1402,10 @@ xfs_dialloc_ag_finobt_newino( error = xfs_inobt_get_rec(cur, rec, &i); if (error) return error; - if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) + if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; + } return 0; } } @@ -1397,14 +1416,18 @@ xfs_dialloc_ag_finobt_newino( error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i); if (error) return error; - if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) + if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; + } error = xfs_inobt_get_rec(cur, rec, &i); if (error) return error; - if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) + if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; + } return 0; } @@ -1426,14 +1449,18 @@ xfs_dialloc_ag_update_inobt( error = xfs_inobt_lookup(cur, frec->ir_startino, XFS_LOOKUP_EQ, &i); if (error) return error; - if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) + if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; + } error = xfs_inobt_get_rec(cur, &rec, &i); if (error) return error; - if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) + if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; + } ASSERT((XFS_AGINO_TO_OFFSET(cur->bc_mp, rec.ir_startino) % XFS_INODES_PER_CHUNK) == 0); @@ -1442,8 +1469,10 @@ xfs_dialloc_ag_update_inobt( if (XFS_IS_CORRUPT(cur->bc_mp, rec.ir_free != frec->ir_free || - rec.ir_freecount != frec->ir_freecount)) + rec.ir_freecount != frec->ir_freecount)) { + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; + } return xfs_inobt_update(cur, &rec); } @@ -1960,6 +1989,7 @@ xfs_difree_inobt( goto error0; } if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } @@ -1970,6 +2000,7 @@ xfs_difree_inobt( goto error0; } if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error0; } @@ -2082,6 +2113,7 @@ xfs_difree_finobt( * something is out of sync. 
*/ if (XFS_IS_CORRUPT(mp, ibtrec->ir_freecount != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error; } @@ -2108,6 +2140,7 @@ xfs_difree_finobt( if (error) goto error; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error; } @@ -2118,6 +2151,7 @@ xfs_difree_finobt( if (XFS_IS_CORRUPT(mp, rec.ir_free != ibtrec->ir_free || rec.ir_freecount != ibtrec->ir_freecount)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto error; } diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index c7fda1a4950a..db394125d783 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -240,6 +240,7 @@ xfs_refcount_insert( if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, *i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -270,12 +271,14 @@ xfs_refcount_delete( if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } trace_xfs_refcount_delete(cur->bc_mp, cur->bc_ag.pag->pag_agno, &irec); error = xfs_btree_delete(cur, i); if (XFS_IS_CORRUPT(cur->bc_mp, *i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -400,6 +403,7 @@ xfs_refcount_split_extent( if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -427,6 +431,7 @@ xfs_refcount_split_extent( if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -472,6 +477,7 @@ xfs_refcount_merge_center_extents( if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -480,6 +486,7 @@ xfs_refcount_merge_center_extents( if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -489,6 +496,7 @@ xfs_refcount_merge_center_extents( if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -500,6 +508,7 @@ xfs_refcount_merge_center_extents( if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -544,6 +553,7 @@ xfs_refcount_merge_left_extent( if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -552,6 +562,7 @@ xfs_refcount_merge_left_extent( if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -563,6 +574,7 @@ xfs_refcount_merge_left_extent( if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -610,6 +622,7 @@ xfs_refcount_merge_right_extent( if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -618,6 +631,7 @@ xfs_refcount_merge_right_extent( if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -629,6 +643,7 @@ xfs_refcount_merge_right_extent( if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, 
found_rec != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -676,6 +691,7 @@ xfs_refcount_find_left_extents( if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -695,6 +711,7 @@ xfs_refcount_find_left_extents( if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -769,6 +786,7 @@ xfs_refcount_find_right_extents( if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -788,6 +806,7 @@ xfs_refcount_find_right_extents( if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -1144,6 +1163,7 @@ xfs_refcount_adjust_extents( goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_tmp != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -1182,6 +1202,7 @@ xfs_refcount_adjust_extents( */ if (XFS_IS_CORRUPT(cur->bc_mp, ext.rc_blockcount == 0) || XFS_IS_CORRUPT(cur->bc_mp, ext.rc_blockcount > *aglen)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -1205,6 +1226,7 @@ xfs_refcount_adjust_extents( if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -1329,8 +1351,10 @@ xfs_refcount_continue_op( struct xfs_perag *pag = cur->bc_ag.pag; if (XFS_IS_CORRUPT(mp, !xfs_verify_agbext(pag, new_agbno, - ri->ri_blockcount))) + ri->ri_blockcount))) { + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; + } ri->ri_startblock = XFS_AGB_TO_FSB(mp, pag->pag_agno, new_agbno); @@ -1537,6 +1561,7 @@ xfs_refcount_find_shared( if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -1554,6 +1579,7 @@ xfs_refcount_find_shared( if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -1587,6 +1613,7 @@ xfs_refcount_find_shared( if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -1684,6 +1711,7 @@ xfs_refcount_adjust_cow_extents( goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_rec && ext.rc_domain != XFS_REFC_DOMAIN_COW)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -1699,6 +1727,7 @@ xfs_refcount_adjust_cow_extents( /* Adding a CoW reservation, there should be nothing here. */ if (XFS_IS_CORRUPT(cur->bc_mp, agbno + aglen > ext.rc_startblock)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -1716,6 +1745,7 @@ xfs_refcount_adjust_cow_extents( if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_tmp != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -1723,14 +1753,17 @@ xfs_refcount_adjust_cow_extents( case XFS_REFCOUNT_ADJUST_COW_FREE: /* Removing a CoW reservation, there should be one extent. 
*/ if (XFS_IS_CORRUPT(cur->bc_mp, ext.rc_startblock != agbno)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } if (XFS_IS_CORRUPT(cur->bc_mp, ext.rc_blockcount != aglen)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } if (XFS_IS_CORRUPT(cur->bc_mp, ext.rc_refcount != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -1742,6 +1775,7 @@ xfs_refcount_adjust_cow_extents( if (error) goto out_error; if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -1904,6 +1938,7 @@ xfs_refcount_recover_extent( if (xfs_refcount_check_irec(cur->bc_ag.pag, &rr->rr_rrec) != NULL || XFS_IS_CORRUPT(cur->bc_mp, rr->rr_rrec.rc_domain != XFS_REFC_DOMAIN_COW)) { + xfs_btree_mark_sick(cur); kfree(rr); return -EFSCORRUPTED; } diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index 8c4d30a0febb..c9e12533c813 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -135,6 +135,7 @@ xfs_rmap_insert( if (error) goto done; if (XFS_IS_CORRUPT(rcur->bc_mp, i != 0)) { + xfs_btree_mark_sick(rcur); error = -EFSCORRUPTED; goto done; } @@ -148,6 +149,7 @@ xfs_rmap_insert( if (error) goto done; if (XFS_IS_CORRUPT(rcur->bc_mp, i != 1)) { + xfs_btree_mark_sick(rcur); error = -EFSCORRUPTED; goto done; } @@ -177,6 +179,7 @@ xfs_rmap_delete( if (error) goto done; if (XFS_IS_CORRUPT(rcur->bc_mp, i != 1)) { + xfs_btree_mark_sick(rcur); error = -EFSCORRUPTED; goto done; } @@ -185,6 +188,7 @@ xfs_rmap_delete( if (error) goto done; if (XFS_IS_CORRUPT(rcur->bc_mp, i != 1)) { + xfs_btree_mark_sick(rcur); error = -EFSCORRUPTED; goto done; } @@ -516,7 +520,7 @@ xfs_rmap_lookup_le_range( */ static int xfs_rmap_free_check_owner( - struct xfs_mount *mp, + struct xfs_btree_cur *cur, uint64_t ltoff, struct xfs_rmap_irec *rec, xfs_filblks_t len, @@ -524,6 +528,7 @@ xfs_rmap_free_check_owner( uint64_t offset, unsigned int flags) { + struct xfs_mount *mp = cur->bc_mp; int error = 0; if (owner == XFS_RMAP_OWN_UNKNOWN) @@ -533,12 +538,14 @@ xfs_rmap_free_check_owner( if (XFS_IS_CORRUPT(mp, (flags & XFS_RMAP_UNWRITTEN) != (rec->rm_flags & XFS_RMAP_UNWRITTEN))) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out; } /* Make sure the owner matches what we expect to find in the tree. 
*/ if (XFS_IS_CORRUPT(mp, owner != rec->rm_owner)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out; } @@ -550,16 +557,19 @@ xfs_rmap_free_check_owner( if (flags & XFS_RMAP_BMBT_BLOCK) { if (XFS_IS_CORRUPT(mp, !(rec->rm_flags & XFS_RMAP_BMBT_BLOCK))) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out; } } else { if (XFS_IS_CORRUPT(mp, rec->rm_offset > offset)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out; } if (XFS_IS_CORRUPT(mp, offset + len > ltoff + rec->rm_blockcount)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out; } @@ -622,6 +632,7 @@ xfs_rmap_unmap( if (error) goto out_error; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -643,6 +654,7 @@ xfs_rmap_unmap( if (XFS_IS_CORRUPT(mp, bno < ltrec.rm_startblock + ltrec.rm_blockcount)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -669,6 +681,7 @@ xfs_rmap_unmap( if (error) goto out_error; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -681,12 +694,13 @@ xfs_rmap_unmap( ltrec.rm_startblock > bno || ltrec.rm_startblock + ltrec.rm_blockcount < bno + len)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } /* Check owner information. */ - error = xfs_rmap_free_check_owner(mp, ltoff, <rec, len, owner, + error = xfs_rmap_free_check_owner(cur, ltoff, <rec, len, owner, offset, flags); if (error) goto out_error; @@ -701,6 +715,7 @@ xfs_rmap_unmap( if (error) goto out_error; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -904,6 +919,7 @@ xfs_rmap_map( if (XFS_IS_CORRUPT(mp, have_lt != 0 && ltrec.rm_startblock + ltrec.rm_blockcount > bno)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -921,10 +937,12 @@ xfs_rmap_map( if (error) goto out_error; if (XFS_IS_CORRUPT(mp, have_gt != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } if (XFS_IS_CORRUPT(mp, bno + len > gtrec.rm_startblock)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -978,6 +996,7 @@ xfs_rmap_map( if (error) goto out_error; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -1025,6 +1044,7 @@ xfs_rmap_map( if (error) goto out_error; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -1120,6 +1140,7 @@ xfs_rmap_convert( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -1157,12 +1178,14 @@ xfs_rmap_convert( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } if (XFS_IS_CORRUPT(mp, LEFT.rm_startblock + LEFT.rm_blockcount > bno)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -1185,6 +1208,7 @@ xfs_rmap_convert( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -1197,10 +1221,12 @@ xfs_rmap_convert( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } if (XFS_IS_CORRUPT(mp, bno + len > RIGHT.rm_startblock)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -1231,6 +1257,7 @@ xfs_rmap_convert( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ 
-1250,6 +1277,7 @@ xfs_rmap_convert( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -1261,6 +1289,7 @@ xfs_rmap_convert( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -1268,6 +1297,7 @@ xfs_rmap_convert( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -1279,6 +1309,7 @@ xfs_rmap_convert( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -1286,6 +1317,7 @@ xfs_rmap_convert( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -1309,6 +1341,7 @@ xfs_rmap_convert( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -1316,6 +1349,7 @@ xfs_rmap_convert( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -1335,6 +1369,7 @@ xfs_rmap_convert( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -1346,6 +1381,7 @@ xfs_rmap_convert( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -1353,6 +1389,7 @@ xfs_rmap_convert( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -1423,6 +1460,7 @@ xfs_rmap_convert( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -1465,6 +1503,7 @@ xfs_rmap_convert( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 0)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -1480,6 +1519,7 @@ xfs_rmap_convert( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -1513,6 +1553,7 @@ xfs_rmap_convert( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -1526,6 +1567,7 @@ xfs_rmap_convert( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 0)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -1538,6 +1580,7 @@ xfs_rmap_convert( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -1610,6 +1653,7 @@ xfs_rmap_convert_shared( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -1638,6 +1682,7 @@ xfs_rmap_convert_shared( if (XFS_IS_CORRUPT(mp, LEFT.rm_startblock + LEFT.rm_blockcount > bno)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -1656,10 +1701,12 @@ xfs_rmap_convert_shared( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } if (XFS_IS_CORRUPT(mp, bno + len > RIGHT.rm_startblock)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -1710,6 +1757,7 @@ xfs_rmap_convert_shared( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -1736,6 +1784,7 @@ xfs_rmap_convert_shared( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -1762,6 +1811,7 
@@ xfs_rmap_convert_shared( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -1785,6 +1835,7 @@ xfs_rmap_convert_shared( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -1820,6 +1871,7 @@ xfs_rmap_convert_shared( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -1865,6 +1917,7 @@ xfs_rmap_convert_shared( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -1900,6 +1953,7 @@ xfs_rmap_convert_shared( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -1938,6 +1992,7 @@ xfs_rmap_convert_shared( if (error) goto done; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto done; } @@ -2027,6 +2082,7 @@ xfs_rmap_unmap_shared( if (error) goto out_error; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -2037,12 +2093,14 @@ xfs_rmap_unmap_shared( ltrec.rm_startblock > bno || ltrec.rm_startblock + ltrec.rm_blockcount < bno + len)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } /* Make sure the owner matches what we expect to find in the tree. */ if (XFS_IS_CORRUPT(mp, owner != ltrec.rm_owner)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -2051,16 +2109,19 @@ xfs_rmap_unmap_shared( if (XFS_IS_CORRUPT(mp, (flags & XFS_RMAP_UNWRITTEN) != (ltrec.rm_flags & XFS_RMAP_UNWRITTEN))) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } /* Check the offset. */ if (XFS_IS_CORRUPT(mp, ltrec.rm_offset > offset)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } if (XFS_IS_CORRUPT(mp, offset > ltoff + ltrec.rm_blockcount)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -2117,6 +2178,7 @@ xfs_rmap_unmap_shared( if (error) goto out_error; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -2146,6 +2208,7 @@ xfs_rmap_unmap_shared( if (error) goto out_error; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -2225,6 +2288,7 @@ xfs_rmap_map_shared( if (error) goto out_error; if (XFS_IS_CORRUPT(mp, have_gt != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -2277,6 +2341,7 @@ xfs_rmap_map_shared( if (error) goto out_error; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out_error; } @@ -2480,10 +2545,14 @@ xfs_rmap_finish_one( * allocate blocks. 
*/ error = xfs_free_extent_fix_freelist(tp, ri->ri_pag, &agbp); - if (error) + if (error) { + xfs_ag_mark_sick(ri->ri_pag, XFS_SICK_AG_AGFL); return error; - if (XFS_IS_CORRUPT(tp->t_mountp, !agbp)) + } + if (XFS_IS_CORRUPT(tp->t_mountp, !agbp)) { + xfs_ag_mark_sick(ri->ri_pag, XFS_SICK_AG_AGFL); return -EFSCORRUPTED; + } rcur = xfs_rmapbt_init_cursor(mp, tp, agbp, ri->ri_pag); } diff --git a/fs/xfs/scrub/refcount_repair.c b/fs/xfs/scrub/refcount_repair.c index f38fccc42a20..9c39af03ee1d 100644 --- a/fs/xfs/scrub/refcount_repair.c +++ b/fs/xfs/scrub/refcount_repair.c @@ -25,6 +25,7 @@ #include "xfs_refcount_btree.h" #include "xfs_error.h" #include "xfs_ag.h" +#include "xfs_health.h" #include "scrub/xfs_scrub.h" #include "scrub/scrub.h" #include "scrub/common.h" @@ -253,8 +254,10 @@ xrep_refc_walk_rmaps( error = xfs_rmap_get_rec(cur, &rmap, &have_gt); if (error) return error; - if (XFS_IS_CORRUPT(mp, !have_gt)) + if (XFS_IS_CORRUPT(mp, !have_gt)) { + xfs_btree_mark_sick(cur); return -EFSCORRUPTED; + } if (rmap.rm_owner == XFS_RMAP_OWN_COW) { error = xrep_refc_stash_cow(rr, rmap.rm_startblock, @@ -425,8 +428,10 @@ xrep_refc_push_rmaps_at( error = xfs_btree_decrement(sc->sa.rmap_cur, 0, &have_gt); if (error) return error; - if (XFS_IS_CORRUPT(sc->mp, !have_gt)) + if (XFS_IS_CORRUPT(sc->mp, !have_gt)) { + xfs_btree_mark_sick(sc->sa.rmap_cur); return -EFSCORRUPTED; + } return 0; } diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c index 0ca4dc62f814..a6819a642cc0 100644 --- a/fs/xfs/xfs_attr_list.c +++ b/fs/xfs/xfs_attr_list.c @@ -83,8 +83,10 @@ xfs_attr_shortform_list( for (i = 0, sfe = xfs_attr_sf_firstentry(sf); i < sf->count; i++) { if (XFS_IS_CORRUPT(context->dp->i_mount, !xfs_attr_namecheck(sfe->nameval, - sfe->namelen))) + sfe->namelen))) { + xfs_dirattr_mark_sick(context->dp, XFS_ATTR_FORK); return -EFSCORRUPTED; + } context->put_listent(context, sfe->flags, sfe->nameval, @@ -177,6 +179,7 @@ xfs_attr_shortform_list( if (XFS_IS_CORRUPT(context->dp->i_mount, !xfs_attr_namecheck(sbp->name, sbp->namelen))) { + xfs_dirattr_mark_sick(context->dp, XFS_ATTR_FORK); error = -EFSCORRUPTED; goto out; } @@ -471,8 +474,10 @@ xfs_attr3_leaf_list_int( } if (XFS_IS_CORRUPT(context->dp->i_mount, - !xfs_attr_namecheck(name, namelen))) + !xfs_attr_namecheck(name, namelen))) { + xfs_dirattr_mark_sick(context->dp, XFS_ATTR_FORK); return -EFSCORRUPTED; + } context->put_listent(context, entry->flags, name, namelen, valuelen); if (context->seen_enough) diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c index e82dd5d65cde..cf9296b7e06f 100644 --- a/fs/xfs/xfs_dir2_readdir.c +++ b/fs/xfs/xfs_dir2_readdir.c @@ -118,8 +118,10 @@ xfs_dir2_sf_getdents( ctx->pos = off & 0x7fffffff; if (XFS_IS_CORRUPT(dp->i_mount, !xfs_dir2_namecheck(sfep->name, - sfep->namelen))) + sfep->namelen))) { + xfs_dirattr_mark_sick(dp, XFS_DATA_FORK); return -EFSCORRUPTED; + } if (!dir_emit(ctx, (char *)sfep->name, sfep->namelen, ino, xfs_dir3_get_dtype(mp, filetype))) return 0; @@ -211,6 +213,7 @@ xfs_dir2_block_getdents( if (XFS_IS_CORRUPT(dp->i_mount, !xfs_dir2_namecheck(dep->name, dep->namelen))) { + xfs_dirattr_mark_sick(dp, XFS_DATA_FORK); error = -EFSCORRUPTED; goto out_rele; } @@ -465,6 +468,7 @@ xfs_dir2_leaf_getdents( if (XFS_IS_CORRUPT(dp->i_mount, !xfs_dir2_namecheck(dep->name, dep->namelen))) { + xfs_dirattr_mark_sick(dp, XFS_DATA_FORK); error = -EFSCORRUPTED; break; } diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index 299b8f907292..f0bf1cc98548 100644 --- a/fs/xfs/xfs_discard.c +++ 
b/fs/xfs/xfs_discard.c @@ -19,6 +19,7 @@ #include "xfs_trace.h" #include "xfs_log.h" #include "xfs_ag.h" +#include "xfs_health.h" /* * Notes on an efficient, low latency fstrim algorithm @@ -210,6 +211,7 @@ xfs_trim_gather_extents( if (error) break; if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; break; } diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c index c1e9c7bcb6a9..dc3d83d7dee1 100644 --- a/fs/xfs/xfs_iwalk.c +++ b/fs/xfs/xfs_iwalk.c @@ -297,8 +297,10 @@ xfs_iwalk_ag_start( error = xfs_inobt_get_rec(*curpp, irec, has_more); if (error) return error; - if (XFS_IS_CORRUPT(mp, *has_more != 1)) + if (XFS_IS_CORRUPT(mp, *has_more != 1)) { + xfs_btree_mark_sick(*curpp); return -EFSCORRUPTED; + } iwag->lastino = XFS_AGINO_TO_INO(mp, pag->pag_agno, irec->ir_startino + XFS_INODES_PER_CHUNK - 1); @@ -425,6 +427,7 @@ xfs_iwalk_ag( rec_fsino = XFS_AGINO_TO_INO(mp, pag->pag_agno, irec->ir_startino); if (iwag->lastino != NULLFSINO && XFS_IS_CORRUPT(mp, iwag->lastino >= rec_fsino)) { + xfs_btree_mark_sick(cur); error = -EFSCORRUPTED; goto out; } -- cgit v1.2.3 From 4e587917ee1cc28ac3a04cd55937419b9e65d81d Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 22 Feb 2024 12:33:03 -0800 Subject: xfs: add secondary and indirect classes to the health tracking system Establish two more classes of health tracking bits: * Indirect problems, which suggest problems in other health domains that we weren't able to preserve. * Secondary problems, which track state that's related to primary evidence of health problems. The first class we'll use in an upcoming patch to record in the AG health status the fact that we ran out of memory and had to inactivate an inode with defective metadata. The second class we use to indicate that repair knows that an inode is bad and we need to fix it later. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_health.h | 43 +++++++++++++++++++++++++++++++++++++++++++ fs/xfs/xfs_health.c | 32 ++++++++++++++++++++------------ 2 files changed, 63 insertions(+), 12 deletions(-) diff --git a/fs/xfs/libxfs/xfs_health.h b/fs/xfs/libxfs/xfs_health.h index ff98c03212b8..032d45fcbd51 100644 --- a/fs/xfs/libxfs/xfs_health.h +++ b/fs/xfs/libxfs/xfs_health.h @@ -31,6 +31,19 @@ * - !checked && sick => errors have been observed during normal operation, * but the metadata has not been checked thoroughly * - !checked && !sick => has not been examined since mount + * + * Evidence of health problems can be sorted into three basic categories: + * + * a) Primary evidence, which signals that something is defective within the + * general grouping of metadata. + * + * b) Secondary evidence, which are side effects of a primary problem but are + * not themselves problems. These can be forgotten when the primary + * health problems are addressed. + * + * c) Indirect evidence, which points to something being wrong in another + * group, but we had to release resources and this is all that's left of + * that state. */ struct xfs_mount; @@ -115,6 +128,36 @@ struct xfs_da_args; XFS_SICK_INO_DIR_ZAPPED | \ XFS_SICK_INO_SYMLINK_ZAPPED) +/* Secondary state related to (but not primary evidence of) health problems. */ +#define XFS_SICK_FS_SECONDARY (0) +#define XFS_SICK_RT_SECONDARY (0) +#define XFS_SICK_AG_SECONDARY (0) +#define XFS_SICK_INO_SECONDARY (0) + +/* Evidence of health problems elsewhere. 
*/ +#define XFS_SICK_FS_INDIRECT (0) +#define XFS_SICK_RT_INDIRECT (0) +#define XFS_SICK_AG_INDIRECT (0) +#define XFS_SICK_INO_INDIRECT (0) + +/* All health masks. */ +#define XFS_SICK_FS_ALL (XFS_SICK_FS_PRIMARY | \ + XFS_SICK_FS_SECONDARY | \ + XFS_SICK_FS_INDIRECT) + +#define XFS_SICK_RT_ALL (XFS_SICK_RT_PRIMARY | \ + XFS_SICK_RT_SECONDARY | \ + XFS_SICK_RT_INDIRECT) + +#define XFS_SICK_AG_ALL (XFS_SICK_AG_PRIMARY | \ + XFS_SICK_AG_SECONDARY | \ + XFS_SICK_AG_INDIRECT) + +#define XFS_SICK_INO_ALL (XFS_SICK_INO_PRIMARY | \ + XFS_SICK_INO_SECONDARY | \ + XFS_SICK_INO_INDIRECT | \ + XFS_SICK_INO_ZAPPED) + /* * These functions must be provided by the xfs implementation. Function * behavior with respect to the first argument should be as follows: diff --git a/fs/xfs/xfs_health.c b/fs/xfs/xfs_health.c index bbdc621e5ee5..a264547fcd94 100644 --- a/fs/xfs/xfs_health.c +++ b/fs/xfs/xfs_health.c @@ -97,7 +97,7 @@ xfs_fs_mark_sick( struct xfs_mount *mp, unsigned int mask) { - ASSERT(!(mask & ~XFS_SICK_FS_PRIMARY)); + ASSERT(!(mask & ~XFS_SICK_FS_ALL)); trace_xfs_fs_mark_sick(mp, mask); spin_lock(&mp->m_sb_lock); @@ -111,7 +111,7 @@ xfs_fs_mark_corrupt( struct xfs_mount *mp, unsigned int mask) { - ASSERT(!(mask & ~XFS_SICK_FS_PRIMARY)); + ASSERT(!(mask & ~XFS_SICK_FS_ALL)); trace_xfs_fs_mark_corrupt(mp, mask); spin_lock(&mp->m_sb_lock); @@ -126,11 +126,13 @@ xfs_fs_mark_healthy( struct xfs_mount *mp, unsigned int mask) { - ASSERT(!(mask & ~XFS_SICK_FS_PRIMARY)); + ASSERT(!(mask & ~XFS_SICK_FS_ALL)); trace_xfs_fs_mark_healthy(mp, mask); spin_lock(&mp->m_sb_lock); mp->m_fs_sick &= ~mask; + if (!(mp->m_fs_sick & XFS_SICK_FS_PRIMARY)) + mp->m_fs_sick &= ~XFS_SICK_FS_SECONDARY; mp->m_fs_checked |= mask; spin_unlock(&mp->m_sb_lock); } @@ -154,7 +156,7 @@ xfs_rt_mark_sick( struct xfs_mount *mp, unsigned int mask) { - ASSERT(!(mask & ~XFS_SICK_RT_PRIMARY)); + ASSERT(!(mask & ~XFS_SICK_RT_ALL)); trace_xfs_rt_mark_sick(mp, mask); spin_lock(&mp->m_sb_lock); @@ -168,7 +170,7 @@ xfs_rt_mark_corrupt( struct xfs_mount *mp, unsigned int mask) { - ASSERT(!(mask & ~XFS_SICK_RT_PRIMARY)); + ASSERT(!(mask & ~XFS_SICK_RT_ALL)); trace_xfs_rt_mark_corrupt(mp, mask); spin_lock(&mp->m_sb_lock); @@ -183,11 +185,13 @@ xfs_rt_mark_healthy( struct xfs_mount *mp, unsigned int mask) { - ASSERT(!(mask & ~XFS_SICK_RT_PRIMARY)); + ASSERT(!(mask & ~XFS_SICK_RT_ALL)); trace_xfs_rt_mark_healthy(mp, mask); spin_lock(&mp->m_sb_lock); mp->m_rt_sick &= ~mask; + if (!(mp->m_rt_sick & XFS_SICK_RT_PRIMARY)) + mp->m_rt_sick &= ~XFS_SICK_RT_SECONDARY; mp->m_rt_checked |= mask; spin_unlock(&mp->m_sb_lock); } @@ -228,7 +232,7 @@ xfs_ag_mark_sick( struct xfs_perag *pag, unsigned int mask) { - ASSERT(!(mask & ~XFS_SICK_AG_PRIMARY)); + ASSERT(!(mask & ~XFS_SICK_AG_ALL)); trace_xfs_ag_mark_sick(pag->pag_mount, pag->pag_agno, mask); spin_lock(&pag->pag_state_lock); @@ -242,7 +246,7 @@ xfs_ag_mark_corrupt( struct xfs_perag *pag, unsigned int mask) { - ASSERT(!(mask & ~XFS_SICK_AG_PRIMARY)); + ASSERT(!(mask & ~XFS_SICK_AG_ALL)); trace_xfs_ag_mark_corrupt(pag->pag_mount, pag->pag_agno, mask); spin_lock(&pag->pag_state_lock); @@ -257,11 +261,13 @@ xfs_ag_mark_healthy( struct xfs_perag *pag, unsigned int mask) { - ASSERT(!(mask & ~XFS_SICK_AG_PRIMARY)); + ASSERT(!(mask & ~XFS_SICK_AG_ALL)); trace_xfs_ag_mark_healthy(pag->pag_mount, pag->pag_agno, mask); spin_lock(&pag->pag_state_lock); pag->pag_sick &= ~mask; + if (!(pag->pag_sick & XFS_SICK_AG_PRIMARY)) + pag->pag_sick &= ~XFS_SICK_AG_SECONDARY; pag->pag_checked |= mask; 
spin_unlock(&pag->pag_state_lock); } @@ -285,7 +291,7 @@ xfs_inode_mark_sick( struct xfs_inode *ip, unsigned int mask) { - ASSERT(!(mask & ~(XFS_SICK_INO_PRIMARY | XFS_SICK_INO_ZAPPED))); + ASSERT(!(mask & ~XFS_SICK_INO_ALL)); trace_xfs_inode_mark_sick(ip, mask); spin_lock(&ip->i_flags_lock); @@ -308,7 +314,7 @@ xfs_inode_mark_corrupt( struct xfs_inode *ip, unsigned int mask) { - ASSERT(!(mask & ~(XFS_SICK_INO_PRIMARY | XFS_SICK_INO_ZAPPED))); + ASSERT(!(mask & ~XFS_SICK_INO_ALL)); trace_xfs_inode_mark_corrupt(ip, mask); spin_lock(&ip->i_flags_lock); @@ -332,11 +338,13 @@ xfs_inode_mark_healthy( struct xfs_inode *ip, unsigned int mask) { - ASSERT(!(mask & ~(XFS_SICK_INO_PRIMARY | XFS_SICK_INO_ZAPPED))); + ASSERT(!(mask & ~XFS_SICK_INO_ALL)); trace_xfs_inode_mark_healthy(ip, mask); spin_lock(&ip->i_flags_lock); ip->i_sick &= ~mask; + if (!(ip->i_sick & XFS_SICK_INO_PRIMARY)) + ip->i_sick &= ~XFS_SICK_INO_SECONDARY; ip->i_checked |= mask; spin_unlock(&ip->i_flags_lock); } -- cgit v1.2.3 From 0e24ec3c56fbc797b34fc94073320c336336b4f9 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 22 Feb 2024 12:33:03 -0800 Subject: xfs: remember sick inodes that get inactivated If an unhealthy inode gets inactivated, remember this fact in the per-fs health summary. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_fs.h | 1 + fs/xfs/libxfs/xfs_health.h | 8 ++++++-- fs/xfs/libxfs/xfs_inode_buf.c | 2 +- fs/xfs/scrub/health.c | 16 +++++++++++++--- fs/xfs/xfs_health.c | 1 + fs/xfs/xfs_inode.c | 35 +++++++++++++++++++++++++++++++++++ fs/xfs/xfs_trace.h | 1 + 7 files changed, 58 insertions(+), 6 deletions(-) diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index 515cd27d3b3a..b5c8da7e6aa9 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -294,6 +294,7 @@ struct xfs_ag_geometry { #define XFS_AG_GEOM_SICK_FINOBT (1 << 7) /* free inode index */ #define XFS_AG_GEOM_SICK_RMAPBT (1 << 8) /* reverse mappings */ #define XFS_AG_GEOM_SICK_REFCNTBT (1 << 9) /* reference counts */ +#define XFS_AG_GEOM_SICK_INODES (1 << 10) /* bad inodes were seen */ /* * Structures for XFS_IOC_FSGROWFSDATA, XFS_IOC_FSGROWFSLOG & XFS_IOC_FSGROWFSRT diff --git a/fs/xfs/libxfs/xfs_health.h b/fs/xfs/libxfs/xfs_health.h index 032d45fcbd51..3c64b5f9bd68 100644 --- a/fs/xfs/libxfs/xfs_health.h +++ b/fs/xfs/libxfs/xfs_health.h @@ -76,6 +76,7 @@ struct xfs_da_args; #define XFS_SICK_AG_FINOBT (1 << 7) /* free inode index */ #define XFS_SICK_AG_RMAPBT (1 << 8) /* reverse mappings */ #define XFS_SICK_AG_REFCNTBT (1 << 9) /* reference counts */ +#define XFS_SICK_AG_INODES (1 << 10) /* inactivated bad inodes */ /* Observable health issues for inode metadata. */ #define XFS_SICK_INO_CORE (1 << 0) /* inode core */ @@ -92,6 +93,9 @@ struct xfs_da_args; #define XFS_SICK_INO_DIR_ZAPPED (1 << 10) /* directory erased */ #define XFS_SICK_INO_SYMLINK_ZAPPED (1 << 11) /* symlink erased */ +/* Don't propagate sick status to ag health summary during inactivation */ +#define XFS_SICK_INO_FORGET (1 << 12) + /* Primary evidence of health problems in a given group. */ #define XFS_SICK_FS_PRIMARY (XFS_SICK_FS_COUNTERS | \ XFS_SICK_FS_UQUOTA | \ @@ -132,12 +136,12 @@ struct xfs_da_args; #define XFS_SICK_FS_SECONDARY (0) #define XFS_SICK_RT_SECONDARY (0) #define XFS_SICK_AG_SECONDARY (0) -#define XFS_SICK_INO_SECONDARY (0) +#define XFS_SICK_INO_SECONDARY (XFS_SICK_INO_FORGET) /* Evidence of health problems elsewhere. 
*/ #define XFS_SICK_FS_INDIRECT (0) #define XFS_SICK_RT_INDIRECT (0) -#define XFS_SICK_AG_INDIRECT (0) +#define XFS_SICK_AG_INDIRECT (XFS_SICK_AG_INODES) #define XFS_SICK_INO_INDIRECT (0) /* All health masks. */ diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 1280d6acd1c1..d0dcce462bf4 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -139,7 +139,7 @@ xfs_imap_to_bp( imap->im_len, XBF_UNMAPPED, bpp, &xfs_inode_buf_ops); if (xfs_metadata_is_sick(error)) xfs_agno_mark_sick(mp, xfs_daddr_to_agno(mp, imap->im_blkno), - XFS_SICK_AG_INOBT); + XFS_SICK_AG_INODES); return error; } diff --git a/fs/xfs/scrub/health.c b/fs/xfs/scrub/health.c index c682c7093353..af192b41e78c 100644 --- a/fs/xfs/scrub/health.c +++ b/fs/xfs/scrub/health.c @@ -183,9 +183,19 @@ xchk_update_health( case XHG_INO: if (!sc->ip) return; - if (bad) - xfs_inode_mark_corrupt(sc->ip, sc->sick_mask); - else + if (bad) { + unsigned int mask = sc->sick_mask; + + /* + * If we're coming in for repairs then we don't want + * sickness flags to propagate to the incore health + * status if the inode gets inactivated before we can + * fix it. + */ + if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) + mask |= XFS_SICK_INO_FORGET; + xfs_inode_mark_corrupt(sc->ip, mask); + } else xfs_inode_mark_healthy(sc->ip, sc->sick_mask); break; case XHG_FS: diff --git a/fs/xfs/xfs_health.c b/fs/xfs/xfs_health.c index a264547fcd94..855221c0dd34 100644 --- a/fs/xfs/xfs_health.c +++ b/fs/xfs/xfs_health.c @@ -431,6 +431,7 @@ static const struct ioctl_sick_map ag_map[] = { { XFS_SICK_AG_FINOBT, XFS_AG_GEOM_SICK_FINOBT }, { XFS_SICK_AG_RMAPBT, XFS_AG_GEOM_SICK_RMAPBT }, { XFS_SICK_AG_REFCNTBT, XFS_AG_GEOM_SICK_REFCNTBT }, + { XFS_SICK_AG_INODES, XFS_AG_GEOM_SICK_INODES }, { 0, 0 }, }; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 5d0ac5c733f3..ea48774f6b76 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1738,6 +1738,39 @@ xfs_inode_needs_inactive( return xfs_can_free_eofblocks(ip, true); } +/* + * Save health status somewhere, if we're dumping an inode with uncorrected + * errors and online repair isn't running. + */ +static inline void +xfs_inactive_health( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_perag *pag; + unsigned int sick; + unsigned int checked; + + xfs_inode_measure_sickness(ip, &sick, &checked); + if (!sick) + return; + + trace_xfs_inode_unfixed_corruption(ip, sick); + + if (sick & XFS_SICK_INO_FORGET) + return; + + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); + if (!pag) { + /* There had better still be a perag structure! */ + ASSERT(0); + return; + } + + xfs_ag_mark_sick(pag, XFS_SICK_AG_INODES); + xfs_perag_put(pag); +} + /* * xfs_inactive * @@ -1766,6 +1799,8 @@ xfs_inactive( mp = ip->i_mount; ASSERT(!xfs_iflags_test(ip, XFS_IRECOVERY)); + xfs_inactive_health(ip); + /* * If this is a read-only mount, don't do this (would generate I/O) * unless we're in log recovery and cleaning the iunlinked list. 
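With XFS_SICK_AG_INODES wired into ag_map above, the new indirect state is reported to userspace through the AG geometry ioctl. A minimal sketch of how an administrator might observe it, assuming the xfsprogs userspace headers; the fd, error handling, and output are illustrative only, not part of this patch:

	#include <stdio.h>
	#include <string.h>
	#include <sys/ioctl.h>
	#include <xfs/xfs.h>

	/* Query AG 0's geometry, which carries the health summary bits. */
	static void report_bad_inodes(int fd)
	{
		struct xfs_ag_geometry	ageo;

		memset(&ageo, 0, sizeof(ageo));
		ageo.ag_number = 0;
		if (ioctl(fd, XFS_IOC_AG_GEOMETRY, &ageo) == 0 &&
		    (ageo.ag_sick & XFS_AG_GEOM_SICK_INODES))
			printf("AG 0 inactivated inodes with bad metadata\n");
	}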
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 667ecae469cc..8ea6a6e712b3 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -3985,6 +3985,7 @@ DEFINE_EVENT(xfs_inode_corrupt_class, name, \ DEFINE_INODE_CORRUPT_EVENT(xfs_inode_mark_sick); DEFINE_INODE_CORRUPT_EVENT(xfs_inode_mark_corrupt); DEFINE_INODE_CORRUPT_EVENT(xfs_inode_mark_healthy); +DEFINE_INODE_CORRUPT_EVENT(xfs_inode_unfixed_corruption); TRACE_EVENT(xfs_iwalk_ag, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, -- cgit v1.2.3 From a1f3e0cca41036c3c66abb6a2ed8fedc214e9a4c Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 22 Feb 2024 12:33:04 -0800 Subject: xfs: update health status if we get a clean bill of health If scrub finds that everything is ok with the filesystem, we need a way to tell the health tracking that it can let go of indirect health flags, since indirect flags only mean that at some point in the past we lost some context. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_fs.h | 3 ++- fs/xfs/scrub/health.c | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++ fs/xfs/scrub/health.h | 1 + fs/xfs/scrub/repair.c | 1 + fs/xfs/scrub/scrub.c | 6 +++++ fs/xfs/scrub/trace.h | 4 +++- 6 files changed, 77 insertions(+), 2 deletions(-) diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index b5c8da7e6aa9..ca1b17d01437 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -714,9 +714,10 @@ struct xfs_scrub_metadata { #define XFS_SCRUB_TYPE_FSCOUNTERS 24 /* fs summary counters */ #define XFS_SCRUB_TYPE_QUOTACHECK 25 /* quota counters */ #define XFS_SCRUB_TYPE_NLINKS 26 /* inode link counts */ +#define XFS_SCRUB_TYPE_HEALTHY 27 /* everything checked out ok */ /* Number of scrub subcommands. */ -#define XFS_SCRUB_TYPE_NR 27 +#define XFS_SCRUB_TYPE_NR 28 /* i: Repair this metadata. */ #define XFS_SCRUB_IFLAG_REPAIR (1u << 0) diff --git a/fs/xfs/scrub/health.c b/fs/xfs/scrub/health.c index af192b41e78c..7878da941c12 100644 --- a/fs/xfs/scrub/health.c +++ b/fs/xfs/scrub/health.c @@ -14,6 +14,7 @@ #include "xfs_health.h" #include "scrub/scrub.h" #include "scrub/health.h" +#include "scrub/common.h" /* * Scrub and In-Core Filesystem Health Assessments @@ -149,6 +150,24 @@ xchk_file_looks_zapped( return xfs_inode_has_sickness(sc->ip, mask); } +/* + * Scrub gave the filesystem a clean bill of health, so clear all the indirect + * markers of past problems (at least for the fs and ags) so that we can be + * healthy again. + */ +STATIC void +xchk_mark_all_healthy( + struct xfs_mount *mp) +{ + struct xfs_perag *pag; + xfs_agnumber_t agno; + + xfs_fs_mark_healthy(mp, XFS_SICK_FS_INDIRECT); + xfs_rt_mark_healthy(mp, XFS_SICK_RT_INDIRECT); + for_each_perag(mp, agno, pag) + xfs_ag_mark_healthy(pag, XFS_SICK_AG_INDIRECT); +} + /* * Update filesystem health assessments based on what we found and did. * @@ -166,6 +185,18 @@ xchk_update_health( struct xfs_perag *pag; bool bad; + /* + * The HEALTHY scrub type is a request from userspace to clear all the + * indirect flags after a clean scan of the entire filesystem. As such + * there's no sick flag defined for it, so we branch here ahead of the + * mask check. 
+ */ + if (sc->sm->sm_type == XFS_SCRUB_TYPE_HEALTHY && + !(sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) { + xchk_mark_all_healthy(sc->mp); + return; + } + if (!sc->sick_mask) return; @@ -285,3 +316,36 @@ xchk_ag_btree_healthy_enough( return true; } + +/* + * Quick scan to double-check that there isn't any evidence of lingering + * primary health problems. If we're still clear, then the health update will + * take care of clearing the indirect evidence. + */ +int +xchk_health_record( + struct xfs_scrub *sc) +{ + struct xfs_mount *mp = sc->mp; + struct xfs_perag *pag; + xfs_agnumber_t agno; + + unsigned int sick; + unsigned int checked; + + xfs_fs_measure_sickness(mp, &sick, &checked); + if (sick & XFS_SICK_FS_PRIMARY) + xchk_set_corrupt(sc); + + xfs_rt_measure_sickness(mp, &sick, &checked); + if (sick & XFS_SICK_RT_PRIMARY) + xchk_set_corrupt(sc); + + for_each_perag(mp, agno, pag) { + xfs_ag_measure_sickness(pag, &sick, &checked); + if (sick & XFS_SICK_AG_PRIMARY) + xchk_set_corrupt(sc); + } + + return 0; +} diff --git a/fs/xfs/scrub/health.h b/fs/xfs/scrub/health.h index a731b2467399..06d17941776c 100644 --- a/fs/xfs/scrub/health.h +++ b/fs/xfs/scrub/health.h @@ -12,5 +12,6 @@ bool xchk_ag_btree_healthy_enough(struct xfs_scrub *sc, struct xfs_perag *pag, xfs_btnum_t btnum); void xchk_mark_healthy_if_clean(struct xfs_scrub *sc, unsigned int mask); bool xchk_file_looks_zapped(struct xfs_scrub *sc, unsigned int mask); +int xchk_health_record(struct xfs_scrub *sc); #endif /* __XFS_SCRUB_HEALTH_H__ */ diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index 7141b1778902..ab510cea96d8 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -30,6 +30,7 @@ #include "xfs_errortag.h" #include "xfs_error.h" #include "xfs_reflink.h" +#include "xfs_health.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index c0b99184bb3e..0f23b7f36d4a 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -378,6 +378,12 @@ static const struct xchk_meta_ops meta_scrub_ops[] = { .scrub = xchk_nlinks, .repair = xrep_nlinks, }, + [XFS_SCRUB_TYPE_HEALTHY] = { /* fs healthy; clean all reminders */ + .type = ST_FS, + .setup = xchk_setup_fs, + .scrub = xchk_health_record, + .repair = xrep_notsupported, + }, }; static int diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index c9b6b0e0bf11..b4e65f148e7b 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -69,6 +69,7 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_PQUOTA); TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_FSCOUNTERS); TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_QUOTACHECK); TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_NLINKS); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_HEALTHY); #define XFS_SCRUB_TYPE_STRINGS \ { XFS_SCRUB_TYPE_PROBE, "probe" }, \ @@ -97,7 +98,8 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_NLINKS); { XFS_SCRUB_TYPE_PQUOTA, "prjquota" }, \ { XFS_SCRUB_TYPE_FSCOUNTERS, "fscounters" }, \ { XFS_SCRUB_TYPE_QUOTACHECK, "quotacheck" }, \ - { XFS_SCRUB_TYPE_NLINKS, "nlinks" } + { XFS_SCRUB_TYPE_NLINKS, "nlinks" }, \ + { XFS_SCRUB_TYPE_HEALTHY, "healthy" } #define XFS_SCRUB_FLAG_STRINGS \ { XFS_SCRUB_IFLAG_REPAIR, "repair" }, \ -- cgit v1.2.3 From 4ed080cd7cb077bbb4b64f0712be1618c9d55a0d Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 22 Feb 2024 12:33:05 -0800 Subject: xfs: repair summary counters Use the same summary counter calculation infrastructure to generate new values for the in-core summary counters. 
The difference between the scrubber and the repairer is that the repairer will freeze the fs during setup, which means that the values should match exactly. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/Makefile | 1 + fs/xfs/scrub/fscounters.c | 27 +++++++-------- fs/xfs/scrub/fscounters.h | 20 +++++++++++ fs/xfs/scrub/fscounters_repair.c | 72 ++++++++++++++++++++++++++++++++++++++++ fs/xfs/scrub/repair.h | 2 ++ fs/xfs/scrub/scrub.c | 2 +- fs/xfs/scrub/trace.c | 1 + fs/xfs/scrub/trace.h | 21 +++++++++--- 8 files changed, 128 insertions(+), 18 deletions(-) create mode 100644 fs/xfs/scrub/fscounters.h create mode 100644 fs/xfs/scrub/fscounters_repair.c diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 253744092915..ba8608f469ac 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -191,6 +191,7 @@ xfs-y += $(addprefix scrub/, \ alloc_repair.o \ bmap_repair.o \ cow_repair.o \ + fscounters_repair.o \ ialloc_repair.o \ inode_repair.o \ newbt.o \ diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c index 893c5a6e3ddb..d310737c8823 100644 --- a/fs/xfs/scrub/fscounters.c +++ b/fs/xfs/scrub/fscounters.c @@ -22,6 +22,7 @@ #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" +#include "scrub/fscounters.h" /* * FS Summary Counters @@ -48,17 +49,6 @@ * our tolerance for mismatch between expected and actual counter values. */ -struct xchk_fscounters { - struct xfs_scrub *sc; - uint64_t icount; - uint64_t ifree; - uint64_t fdblocks; - uint64_t frextents; - unsigned long long icount_min; - unsigned long long icount_max; - bool frozen; -}; - /* * Since the expected value computation is lockless but only browses incore * values, the percpu counters should be fairly close to each other. However, @@ -235,8 +225,13 @@ xchk_setup_fscounters( * Pause all writer activity in the filesystem while we're scrubbing to * reduce the likelihood of background perturbations to the counters * throwing off our calculations. + * + * If we're repairing, we need to prevent any other thread from + * changing the global fs summary counters while we're repairing them. + * This requires the fs to be frozen, which will disable background + * reclaim and purge all inactive inodes. */ - if (sc->flags & XCHK_TRY_HARDER) { + if ((sc->flags & XCHK_TRY_HARDER) || xchk_could_repair(sc)) { error = xchk_fscounters_freeze(sc); if (error) return error; @@ -254,7 +249,9 @@ xchk_setup_fscounters( * set the INCOMPLETE flag even when a negative errno is returned. This care * must be taken with certain errno values (i.e. EFSBADCRC, EFSCORRUPTED, * ECANCELED) that are absorbed into a scrub state flag update by - * xchk_*_process_error. + * xchk_*_process_error. Scrub and repair share the same incore data + * structures, so the INCOMPLETE flag is critical to prevent a repair based on + * insufficient information. */ /* Count free space btree blocks manually for pre-lazysbcount filesystems. */ @@ -482,6 +479,10 @@ xchk_fscount_within_range( if (curr_value == expected) return true; + /* We require exact matches when repair is running. */ + if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) + return false; + min_value = min(old_value, curr_value); max_value = max(old_value, curr_value); diff --git a/fs/xfs/scrub/fscounters.h b/fs/xfs/scrub/fscounters.h new file mode 100644 index 000000000000..461a13d25f4b --- /dev/null +++ b/fs/xfs/scrub/fscounters.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (c) 2021-2024 Oracle. All Rights Reserved. 
+ * Author: Darrick J. Wong + */ +#ifndef __XFS_SCRUB_FSCOUNTERS_H__ +#define __XFS_SCRUB_FSCOUNTERS_H__ + +struct xchk_fscounters { + struct xfs_scrub *sc; + uint64_t icount; + uint64_t ifree; + uint64_t fdblocks; + uint64_t frextents; + unsigned long long icount_min; + unsigned long long icount_max; + bool frozen; +}; + +#endif /* __XFS_SCRUB_FSCOUNTERS_H__ */ diff --git a/fs/xfs/scrub/fscounters_repair.c b/fs/xfs/scrub/fscounters_repair.c new file mode 100644 index 000000000000..94cdb852bee4 --- /dev/null +++ b/fs/xfs/scrub/fscounters_repair.c @@ -0,0 +1,72 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2018-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_alloc.h" +#include "xfs_ialloc.h" +#include "xfs_rmap.h" +#include "xfs_health.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/fscounters.h" + +/* + * FS Summary Counters + * =================== + * + * We correct errors in the filesystem summary counters by setting them to the + * values computed during the obligatory scrub phase. However, we must be + * careful not to allow any other thread to change the counters while we're + * computing and setting new values. To achieve this, we freeze the + * filesystem for the whole operation if the REPAIR flag is set. The checking + * function is stricter when we've frozen the fs. + */ + +/* + * Reset the superblock counters. Caller is responsible for freezing the + * filesystem during the calculation and reset phases. + */ +int +xrep_fscounters( + struct xfs_scrub *sc) +{ + struct xfs_mount *mp = sc->mp; + struct xchk_fscounters *fsc = sc->buf; + + /* + * Reinitialize the in-core counters from what we computed. We froze + * the filesystem, so there shouldn't be anyone else trying to modify + * these counters. 
+ */ + if (!fsc->frozen) { + ASSERT(fsc->frozen); + return -EFSCORRUPTED; + } + + trace_xrep_reset_counters(mp, fsc); + + percpu_counter_set(&mp->m_icount, fsc->icount); + percpu_counter_set(&mp->m_ifree, fsc->ifree); + percpu_counter_set(&mp->m_fdblocks, fsc->fdblocks); + percpu_counter_set(&mp->m_frextents, fsc->frextents); + mp->m_sb.sb_frextents = fsc->frextents; + + return 0; +} diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h index 8edac0150e96..2ff2bb79c540 100644 --- a/fs/xfs/scrub/repair.h +++ b/fs/xfs/scrub/repair.h @@ -117,6 +117,7 @@ int xrep_bmap_data(struct xfs_scrub *sc); int xrep_bmap_attr(struct xfs_scrub *sc); int xrep_bmap_cow(struct xfs_scrub *sc); int xrep_nlinks(struct xfs_scrub *sc); +int xrep_fscounters(struct xfs_scrub *sc); #ifdef CONFIG_XFS_RT int xrep_rtbitmap(struct xfs_scrub *sc); @@ -198,6 +199,7 @@ xrep_setup_nothing( #define xrep_quota xrep_notsupported #define xrep_quotacheck xrep_notsupported #define xrep_nlinks xrep_notsupported +#define xrep_fscounters xrep_notsupported #endif /* CONFIG_XFS_ONLINE_REPAIR */ diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index 0f23b7f36d4a..aeac9cae4ad4 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -364,7 +364,7 @@ static const struct xchk_meta_ops meta_scrub_ops[] = { .type = ST_FS, .setup = xchk_setup_fscounters, .scrub = xchk_fscounters, - .repair = xrep_notsupported, + .repair = xrep_fscounters, }, [XFS_SCRUB_TYPE_QUOTACHECK] = { /* quota counters */ .type = ST_FS, diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c index 2d5a330afe10..b8f3795f7d9b 100644 --- a/fs/xfs/scrub/trace.c +++ b/fs/xfs/scrub/trace.c @@ -24,6 +24,7 @@ #include "scrub/quota.h" #include "scrub/iscan.h" #include "scrub/nlinks.h" +#include "scrub/fscounters.h" /* Figure out which block the btree cursor was pointing to. */ static inline xfs_fsblock_t diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index b4e65f148e7b..1e448d0c5aee 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -24,6 +24,7 @@ struct xfarray_sortinfo; struct xchk_dqiter; struct xchk_iscan; struct xchk_nlink; +struct xchk_fscounters; /* * ftrace's __print_symbolic requires that all enum values be wrapped in the @@ -1804,16 +1805,28 @@ TRACE_EVENT(xrep_calc_ag_resblks_btsize, __entry->refcbt_sz) ) TRACE_EVENT(xrep_reset_counters, - TP_PROTO(struct xfs_mount *mp), - TP_ARGS(mp), + TP_PROTO(struct xfs_mount *mp, struct xchk_fscounters *fsc), + TP_ARGS(mp, fsc), TP_STRUCT__entry( __field(dev_t, dev) + __field(uint64_t, icount) + __field(uint64_t, ifree) + __field(uint64_t, fdblocks) + __field(uint64_t, frextents) ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; + __entry->icount = fsc->icount; + __entry->ifree = fsc->ifree; + __entry->fdblocks = fsc->fdblocks; + __entry->frextents = fsc->frextents; ), - TP_printk("dev %d:%d", - MAJOR(__entry->dev), MINOR(__entry->dev)) + TP_printk("dev %d:%d icount %llu ifree %llu fdblocks %llu frextents %llu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->icount, + __entry->ifree, + __entry->fdblocks, + __entry->frextents) ) DECLARE_EVENT_CLASS(xrep_newbt_extent_class, -- cgit v1.2.3 From 78067b92b9096a70ca731a6cde1c286582ff03d7 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 22 Feb 2024 12:33:06 -0800 Subject: xfs: consolidate btree block freeing tracepoints Don't waste memory on extra per-btree block freeing tracepoints when we can do it from the generic btree code. 
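The consolidation pattern itself, as a self-contained analogue (simplified names and types; illustration only, not the kernel code): the cross-cutting trace call moves out of every per-type callback and into the single generic caller of the ops table, which already knows which btree it is walking.

#include <stdio.h>

struct btree_ops {
	int	(*free_block)(unsigned int agbno);
};

/* Per-type callbacks no longer carry their own trace calls. */
static int rmapbt_free_block(unsigned int agbno) { (void)agbno; return 0; }
static int refcountbt_free_block(unsigned int agbno) { (void)agbno; return 0; }

/* One generic wrapper emits the event for every implementation. */
static int
generic_free_block(const char *btname, const struct btree_ops *ops,
		unsigned int agbno)
{
	/* stands in for trace_xfs_btree_free_block(cur, bp) */
	printf("trace: free_block btree=%s agbno=0x%x\n", btname, agbno);
	return ops->free_block(agbno);
}

int
main(void)
{
	const struct btree_ops rmap_ops = { .free_block = rmapbt_free_block };
	const struct btree_ops refc_ops = { .free_block = refcountbt_free_block };

	generic_free_block("rmap", &rmap_ops, 0x37);
	generic_free_block("refc", &refc_ops, 0x99);
	return 0;
}

Each TRACE_EVENT costs a __tracepoints_ptrs entry, a format string and a struct tracepoint, which is where the numbers in the objdump comparison that follows come from: dropping the two per-type tracepoints shrinks those three sections by 4, 32 and 96 bytes respectively, totalling the 132 bytes cited.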
With this patch applied, two tracepoints are collapsed into one tracepoint, with the following effects on objdump -hx xfs.ko output: Before: 10 __tracepoints_ptrs 00000b3c 0000000000000000 0000000000000000 00140eb0 2**2 14 __tracepoints_strings 00005453 0000000000000000 0000000000000000 00168540 2**5 29 __tracepoints 00010d90 0000000000000000 0000000000000000 0023f5e0 2**5 After: 10 __tracepoints_ptrs 00000b38 0000000000000000 0000000000000000 001412f0 2**2 14 __tracepoints_strings 00005433 0000000000000000 0000000000000000 001689a0 2**5 29 __tracepoints 00010d30 0000000000000000 0000000000000000 0023fe00 2**5 Column 3 is the section size in bytes; removing these two tracepoints reduces the size of the ELF segments by 132 bytes. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_btree.c | 2 ++ fs/xfs/libxfs/xfs_refcount_btree.c | 2 -- fs/xfs/libxfs/xfs_rmap_btree.c | 2 -- fs/xfs/xfs_trace.h | 32 ++++++++++++++++++++++++++++++-- 4 files changed, 32 insertions(+), 6 deletions(-) diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 70d406dbb3e4..fd86996aa626 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -414,6 +414,8 @@ xfs_btree_free_block( { int error; + trace_xfs_btree_free_block(cur, bp); + error = cur->bc_ops->free_block(cur, bp); if (!error) { xfs_trans_binval(cur->bc_tp, bp); diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c index 0d80bd99147c..a346e49981ac 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.c +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -107,8 +107,6 @@ xfs_refcountbt_free_block( struct xfs_agf *agf = agbp->b_addr; xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp)); - trace_xfs_refcountbt_free_block(cur->bc_mp, cur->bc_ag.pag->pag_agno, - XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno), 1); be32_add_cpu(&agf->agf_refcount_blocks, -1); xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_REFCOUNT_BLOCKS); return xfs_free_extent_later(cur->bc_tp, fsbno, 1, diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index 6c81b20e97d2..0dc086bc528f 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -125,8 +125,6 @@ xfs_rmapbt_free_block( int error; bno = xfs_daddr_to_agbno(cur->bc_mp, xfs_buf_daddr(bp)); - trace_xfs_rmapbt_free_block(cur->bc_mp, pag->pag_agno, - bno, 1); be32_add_cpu(&agf->agf_rmap_blocks, -1); xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS); error = xfs_alloc_put_freelist(pag, cur->bc_tp, agbp, NULL, bno, 1); diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 8ea6a6e712b3..fc6190340b95 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -2496,6 +2496,36 @@ DEFINE_EVENT(xfs_btree_cur_class, name, \ DEFINE_BTREE_CUR_EVENT(xfs_btree_updkeys); DEFINE_BTREE_CUR_EVENT(xfs_btree_overlapped_query_range); +TRACE_EVENT(xfs_btree_free_block, + TP_PROTO(struct xfs_btree_cur *cur, struct xfs_buf *bp), + TP_ARGS(cur, bp), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_ino_t, ino) + __field(xfs_btnum_t, btnum) + __field(xfs_agblock_t, agbno) + ), + TP_fast_assign( + __entry->dev = cur->bc_mp->m_super->s_dev; + __entry->agno = xfs_daddr_to_agno(cur->bc_mp, + xfs_buf_daddr(bp)); + if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) + __entry->ino = cur->bc_ino.ip->i_ino; + else + __entry->ino = 0; + __entry->btnum = cur->bc_btnum; + __entry->agbno = xfs_daddr_to_agbno(cur->bc_mp, + xfs_buf_daddr(bp)); + ), + TP_printk("dev %d:%d btree %s agno 0x%x ino 0x%llx agbno 
0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS), + __entry->agno, + __entry->ino, + __entry->agbno) +); + /* deferred ops */ struct xfs_defer_pending; @@ -2859,7 +2889,6 @@ DEFINE_RMAP_DEFERRED_EVENT(xfs_rmap_defer); DEFINE_RMAP_DEFERRED_EVENT(xfs_rmap_deferred); DEFINE_BUSY_EVENT(xfs_rmapbt_alloc_block); -DEFINE_BUSY_EVENT(xfs_rmapbt_free_block); DEFINE_RMAPBT_EVENT(xfs_rmap_update); DEFINE_RMAPBT_EVENT(xfs_rmap_insert); DEFINE_RMAPBT_EVENT(xfs_rmap_delete); @@ -3218,7 +3247,6 @@ DEFINE_EVENT(xfs_refcount_triple_extent_class, name, \ /* refcount btree tracepoints */ DEFINE_BUSY_EVENT(xfs_refcountbt_alloc_block); -DEFINE_BUSY_EVENT(xfs_refcountbt_free_block); DEFINE_AG_BTREE_LOOKUP_EVENT(xfs_refcount_lookup); DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_get); DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_update); -- cgit v1.2.3 From 2ed0b2c7f33159825af1a1a83face66edb52348a Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 22 Feb 2024 12:33:07 -0800 Subject: xfs: consolidate btree block allocation tracepoints Don't waste tracepoint segment memory on per-btree block allocation tracepoints when we can do it from the generic btree code. With this patch applied, two tracepoints are collapsed into one tracepoint, with the following effects on objdump -hx xfs.ko output: Before: 10 __tracepoints_ptrs 00000b38 0000000000000000 0000000000000000 001412f0 2**2 14 __tracepoints_strings 00005433 0000000000000000 0000000000000000 001689a0 2**5 29 __tracepoints 00010d30 0000000000000000 0000000000000000 0023fe00 2**5 After: 10 __tracepoints_ptrs 00000b34 0000000000000000 0000000000000000 001417b0 2**2 14 __tracepoints_strings 00005413 0000000000000000 0000000000000000 00168e80 2**5 29 __tracepoints 00010cd0 0000000000000000 0000000000000000 00240760 2**5 Column 3 is the section size in bytes; removing these two tracepoints reduces the size of the ELF segments by 132 bytes. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_btree.c | 20 +++++++++++++--- fs/xfs/libxfs/xfs_refcount_btree.c | 2 -- fs/xfs/libxfs/xfs_rmap_btree.c | 2 -- fs/xfs/xfs_trace.h | 49 ++++++++++++++++++++++++++++++++++++-- 4 files changed, 64 insertions(+), 9 deletions(-) diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index fd86996aa626..322553f1981d 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -2693,6 +2693,20 @@ error1: return error; } +static inline int +xfs_btree_alloc_block( + struct xfs_btree_cur *cur, + const union xfs_btree_ptr *hint_block, + union xfs_btree_ptr *new_block, + int *stat) +{ + int error; + + error = cur->bc_ops->alloc_block(cur, hint_block, new_block, stat); + trace_xfs_btree_alloc_block(cur, new_block, *stat, error); + return error; +} + /* * Split cur/level block in half. * Return new block number and the key to its first @@ -2736,7 +2750,7 @@ __xfs_btree_split( xfs_btree_buf_to_ptr(cur, lbp, &lptr); /* Allocate the new block. If we can't do it, we're toast. Give up. */ - error = cur->bc_ops->alloc_block(cur, &lptr, &rptr, stat); + error = xfs_btree_alloc_block(cur, &lptr, &rptr, stat); if (error) goto error0; if (*stat == 0) @@ -3016,7 +3030,7 @@ xfs_btree_new_iroot( pp = xfs_btree_ptr_addr(cur, 1, block); /* Allocate the new block. If we can't do it, we're toast. Give up. 
*/ - error = cur->bc_ops->alloc_block(cur, pp, &nptr, stat); + error = xfs_btree_alloc_block(cur, pp, &nptr, stat); if (error) goto error0; if (*stat == 0) @@ -3116,7 +3130,7 @@ xfs_btree_new_root( cur->bc_ops->init_ptr_from_cur(cur, &rptr); /* Allocate the new block. If we can't do it, we're toast. Give up. */ - error = cur->bc_ops->alloc_block(cur, &rptr, &lptr, stat); + error = xfs_btree_alloc_block(cur, &rptr, &lptr, stat); if (error) goto error0; if (*stat == 0) diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c index a346e49981ac..f904a92d1b59 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.c +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -77,8 +77,6 @@ xfs_refcountbt_alloc_block( xfs_refc_block(args.mp))); if (error) goto out_error; - trace_xfs_refcountbt_alloc_block(cur->bc_mp, cur->bc_ag.pag->pag_agno, - args.agbno, 1); if (args.fsbno == NULLFSBLOCK) { *stat = 0; return 0; diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index 0dc086bc528f..43ff2236f623 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -94,8 +94,6 @@ xfs_rmapbt_alloc_block( &bno, 1); if (error) return error; - - trace_xfs_rmapbt_alloc_block(cur->bc_mp, pag->pag_agno, bno, 1); if (bno == NULLAGBLOCK) { *stat = 0; return 0; diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index fc6190340b95..5115037c127b 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -2496,6 +2496,53 @@ DEFINE_EVENT(xfs_btree_cur_class, name, \ DEFINE_BTREE_CUR_EVENT(xfs_btree_updkeys); DEFINE_BTREE_CUR_EVENT(xfs_btree_overlapped_query_range); +TRACE_EVENT(xfs_btree_alloc_block, + TP_PROTO(struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr, int stat, + int error), + TP_ARGS(cur, ptr, stat, error), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_ino_t, ino) + __field(xfs_btnum_t, btnum) + __field(int, error) + __field(xfs_agblock_t, agbno) + ), + TP_fast_assign( + __entry->dev = cur->bc_mp->m_super->s_dev; + if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) { + __entry->agno = 0; + __entry->ino = cur->bc_ino.ip->i_ino; + } else { + __entry->agno = cur->bc_ag.pag->pag_agno; + __entry->ino = 0; + } + __entry->btnum = cur->bc_btnum; + __entry->error = error; + if (!error && stat) { + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { + xfs_fsblock_t fsb = be64_to_cpu(ptr->l); + + __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, + fsb); + __entry->agbno = XFS_FSB_TO_AGBNO(cur->bc_mp, + fsb); + } else { + __entry->agbno = be32_to_cpu(ptr->s); + } + } else { + __entry->agbno = NULLAGBLOCK; + } + ), + TP_printk("dev %d:%d btree %s agno 0x%x ino 0x%llx agbno 0x%x error %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS), + __entry->agno, + __entry->ino, + __entry->agbno, + __entry->error) +); + TRACE_EVENT(xfs_btree_free_block, TP_PROTO(struct xfs_btree_cur *cur, struct xfs_buf *bp), TP_ARGS(cur, bp), @@ -2888,7 +2935,6 @@ DEFINE_EVENT(xfs_rmapbt_class, name, \ DEFINE_RMAP_DEFERRED_EVENT(xfs_rmap_defer); DEFINE_RMAP_DEFERRED_EVENT(xfs_rmap_deferred); -DEFINE_BUSY_EVENT(xfs_rmapbt_alloc_block); DEFINE_RMAPBT_EVENT(xfs_rmap_update); DEFINE_RMAPBT_EVENT(xfs_rmap_insert); DEFINE_RMAPBT_EVENT(xfs_rmap_delete); @@ -3246,7 +3292,6 @@ DEFINE_EVENT(xfs_refcount_triple_extent_class, name, \ TP_ARGS(mp, agno, i1, i2, i3)) /* refcount btree tracepoints */ -DEFINE_BUSY_EVENT(xfs_refcountbt_alloc_block); DEFINE_AG_BTREE_LOOKUP_EVENT(xfs_refcount_lookup); 
DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_get); DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_update); -- cgit v1.2.3 From 056d22c87132cf4968f5e702116439bea9795930 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 22 Feb 2024 12:33:18 -0800 Subject: xfs: set the btree cursor bc_ops in xfs_btree_alloc_cursor This is a precursor to putting more static data in the btree ops structure. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_alloc_btree.c | 11 +++++------ fs/xfs/libxfs/xfs_bmap_btree.c | 3 +-- fs/xfs/libxfs/xfs_btree.h | 2 ++ fs/xfs/libxfs/xfs_ialloc_btree.c | 10 ++++++---- fs/xfs/libxfs/xfs_refcount_btree.c | 4 ++-- fs/xfs/libxfs/xfs_rmap_btree.c | 3 +-- 6 files changed, 17 insertions(+), 16 deletions(-) diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index a7032bf0cd37..cdcb2358351c 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -512,18 +512,17 @@ xfs_allocbt_init_common( ASSERT(btnum == XFS_BTNUM_BNO || btnum == XFS_BTNUM_CNT); - cur = xfs_btree_alloc_cursor(mp, tp, btnum, mp->m_alloc_maxlevels, - xfs_allocbt_cur_cache); - cur->bc_ag.abt.active = false; - if (btnum == XFS_BTNUM_CNT) { - cur->bc_ops = &xfs_cntbt_ops; + cur = xfs_btree_alloc_cursor(mp, tp, btnum, &xfs_cntbt_ops, + mp->m_alloc_maxlevels, xfs_allocbt_cur_cache); cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtc_2); cur->bc_flags = XFS_BTREE_LASTREC_UPDATE; } else { - cur->bc_ops = &xfs_bnobt_ops; + cur = xfs_btree_alloc_cursor(mp, tp, btnum, &xfs_bnobt_ops, + mp->m_alloc_maxlevels, xfs_allocbt_cur_cache); cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtb_2); } + cur->bc_ag.abt.active = false; cur->bc_ag.pag = xfs_perag_hold(pag); diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index 71f2d50f7823..19414c711886 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -549,11 +549,10 @@ xfs_bmbt_init_common( ASSERT(whichfork != XFS_COW_FORK); - cur = xfs_btree_alloc_cursor(mp, tp, XFS_BTNUM_BMAP, + cur = xfs_btree_alloc_cursor(mp, tp, XFS_BTNUM_BMAP, &xfs_bmbt_ops, mp->m_bm_maxlevels[whichfork], xfs_bmbt_cur_cache); cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_bmbt_2); - cur->bc_ops = &xfs_bmbt_ops; cur->bc_flags = XFS_BTREE_LONG_PTRS | XFS_BTREE_ROOT_IN_INODE; if (xfs_has_crc(mp)) cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 75a0e2c8e115..c053fb934dc7 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -720,6 +720,7 @@ xfs_btree_alloc_cursor( struct xfs_mount *mp, struct xfs_trans *tp, xfs_btnum_t btnum, + const struct xfs_btree_ops *ops, uint8_t maxlevels, struct kmem_cache *cache) { @@ -728,6 +729,7 @@ xfs_btree_alloc_cursor( /* BMBT allocations can come through from non-transactional context. 
*/ cur = kmem_cache_zalloc(cache, GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL); + cur->bc_ops = ops; cur->bc_tp = tp; cur->bc_mp = mp; cur->bc_btnum = btnum; diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index 42a5e1f227a0..8b705cc62d3d 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -454,14 +454,16 @@ xfs_inobt_init_common( struct xfs_mount *mp = pag->pag_mount; struct xfs_btree_cur *cur; - cur = xfs_btree_alloc_cursor(mp, tp, btnum, - M_IGEO(mp)->inobt_maxlevels, xfs_inobt_cur_cache); if (btnum == XFS_BTNUM_INO) { + cur = xfs_btree_alloc_cursor(mp, tp, btnum, &xfs_inobt_ops, + M_IGEO(mp)->inobt_maxlevels, + xfs_inobt_cur_cache); cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_ibt_2); - cur->bc_ops = &xfs_inobt_ops; } else { + cur = xfs_btree_alloc_cursor(mp, tp, btnum, &xfs_finobt_ops, + M_IGEO(mp)->inobt_maxlevels, + xfs_inobt_cur_cache); cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_fibt_2); - cur->bc_ops = &xfs_finobt_ops; } if (xfs_has_crc(mp)) diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c index f904a92d1b59..1eb164816825 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.c +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -353,7 +353,8 @@ xfs_refcountbt_init_common( ASSERT(pag->pag_agno < mp->m_sb.sb_agcount); cur = xfs_btree_alloc_cursor(mp, tp, XFS_BTNUM_REFC, - mp->m_refc_maxlevels, xfs_refcountbt_cur_cache); + &xfs_refcountbt_ops, mp->m_refc_maxlevels, + xfs_refcountbt_cur_cache); cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_refcbt_2); cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; @@ -361,7 +362,6 @@ xfs_refcountbt_init_common( cur->bc_ag.pag = xfs_perag_hold(pag); cur->bc_ag.refc.nr_ops = 0; cur->bc_ag.refc.shape_changes = 0; - cur->bc_ops = &xfs_refcountbt_ops; return cur; } diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index 43ff2236f623..370c36921f65 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -503,11 +503,10 @@ xfs_rmapbt_init_common( struct xfs_btree_cur *cur; /* Overlapping btree; 2 keys per pointer. */ - cur = xfs_btree_alloc_cursor(mp, tp, XFS_BTNUM_RMAP, + cur = xfs_btree_alloc_cursor(mp, tp, XFS_BTNUM_RMAP, &xfs_rmapbt_ops, mp->m_rmap_maxlevels, xfs_rmapbt_cur_cache); cur->bc_flags = XFS_BTREE_CRC_BLOCKS | XFS_BTREE_OVERLAPPING; cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_rmap_2); - cur->bc_ops = &xfs_rmapbt_ops; cur->bc_ag.pag = xfs_perag_hold(pag); return cur; -- cgit v1.2.3 From f9e325bf61d1fb3ef5f705268a22de95809db9fa Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 22 Feb 2024 12:34:12 -0800 Subject: xfs: drop XFS_BTREE_CRC_BLOCKS All existing btree types set XFS_BTREE_CRC_BLOCKS when running against a V5 filesystem. All currently proposed btree types are V5 only and use the richer XFS_BTREE_CRC_BLOCKS format. Therefore, we can drop this flag and change the conditional to xfs_has_crc. Signed-off-by: Darrick J. 
Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_alloc_btree.c | 3 --- fs/xfs/libxfs/xfs_bmap_btree.c | 2 -- fs/xfs/libxfs/xfs_btree.c | 8 ++++---- fs/xfs/libxfs/xfs_btree.h | 1 - fs/xfs/libxfs/xfs_ialloc_btree.c | 3 --- fs/xfs/libxfs/xfs_refcount_btree.c | 2 -- fs/xfs/libxfs/xfs_rmap_btree.c | 2 +- 7 files changed, 5 insertions(+), 16 deletions(-) diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index cdcb2358351c..93a6be0d6cde 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -526,9 +526,6 @@ xfs_allocbt_init_common( cur->bc_ag.pag = xfs_perag_hold(pag); - if (xfs_has_crc(mp)) - cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; - return cur; } diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index 19414c711886..0bc64c07fa1c 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -554,8 +554,6 @@ xfs_bmbt_init_common( cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_bmbt_2); cur->bc_flags = XFS_BTREE_LONG_PTRS | XFS_BTREE_ROOT_IN_INODE; - if (xfs_has_crc(mp)) - cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; cur->bc_ino.ip = ip; cur->bc_ino.allocated = 0; diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 322553f1981d..45b5cfffa8dd 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -598,11 +598,11 @@ xfs_btree_dup_cursor( static inline size_t xfs_btree_block_len(struct xfs_btree_cur *cur) { if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { - if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) + if (xfs_has_crc(cur->bc_mp)) return XFS_BTREE_LBLOCK_CRC_LEN; return XFS_BTREE_LBLOCK_LEN; } - if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) + if (xfs_has_crc(cur->bc_mp)) return XFS_BTREE_SBLOCK_CRC_LEN; return XFS_BTREE_SBLOCK_LEN; } @@ -1576,7 +1576,7 @@ xfs_btree_log_block( if (bp) { int nbits; - if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) { + if (xfs_has_crc(cur->bc_mp)) { /* * We don't log the CRC when updating a btree * block but instead recreate it during log @@ -3048,7 +3048,7 @@ xfs_btree_new_iroot( * In that case have to also ensure the blkno remains correct */ memcpy(cblock, block, xfs_btree_block_len(cur)); - if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) { + if (xfs_has_crc(cur->bc_mp)) { __be64 bno = cpu_to_be64(xfs_buf_daddr(cbp)); if (cur->bc_flags & XFS_BTREE_LONG_PTRS) cblock->bb_u.l.bb_blkno = bno; diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index c053fb934dc7..36fd07b32daf 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -308,7 +308,6 @@ xfs_btree_cur_sizeof(unsigned int nlevels) #define XFS_BTREE_LONG_PTRS (1<<0) /* pointers are 64bits long */ #define XFS_BTREE_ROOT_IN_INODE (1<<1) /* root may be variable size */ #define XFS_BTREE_LASTREC_UPDATE (1<<2) /* track last rec externally */ -#define XFS_BTREE_CRC_BLOCKS (1<<3) /* uses extended btree blocks */ #define XFS_BTREE_OVERLAPPING (1<<4) /* overlapping intervals */ /* * The root of this btree is a fakeroot structure so that we can stage a btree diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index 8b705cc62d3d..3e65f028f3ee 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -466,9 +466,6 @@ xfs_inobt_init_common( cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_fibt_2); } - if (xfs_has_crc(mp)) - cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; - cur->bc_ag.pag = xfs_perag_hold(pag); return cur; } diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c index 
1eb164816825..6a3a827dd366 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.c +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -357,8 +357,6 @@ xfs_refcountbt_init_common( xfs_refcountbt_cur_cache); cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_refcbt_2); - cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; - cur->bc_ag.pag = xfs_perag_hold(pag); cur->bc_ag.refc.nr_ops = 0; cur->bc_ag.refc.shape_changes = 0; diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index 370c36921f65..058305fb070b 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -505,7 +505,7 @@ xfs_rmapbt_init_common( /* Overlapping btree; 2 keys per pointer. */ cur = xfs_btree_alloc_cursor(mp, tp, XFS_BTNUM_RMAP, &xfs_rmapbt_ops, mp->m_rmap_maxlevels, xfs_rmapbt_cur_cache); - cur->bc_flags = XFS_BTREE_CRC_BLOCKS | XFS_BTREE_OVERLAPPING; + cur->bc_flags = XFS_BTREE_OVERLAPPING; cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_rmap_2); cur->bc_ag.pag = xfs_perag_hold(pag); -- cgit v1.2.3 From c0afba9a8363f17d4efed22a8764df33389aebe8 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 22 Feb 2024 12:34:13 -0800 Subject: xfs: fix imprecise logic in xchk_btree_check_block_owner A reviewer was confused by the init_sa logic in this function. Upon checking the logic, I discovered that the code is imprecise. What we want to do here is check that there is an ownership record in the rmap btree for the AG that contains a btree block. For an inode-rooted btree (e.g. the bmbt) the per-AG btree cursors have not been initialized because inode btrees can span multiple AGs. Therefore, we must initialize the per-AG btree cursors in sc->sa before proceeding. That is what init_sa controls, and hence the logic should be gated on XFS_BTREE_ROOT_IN_INODE, not XFS_BTREE_LONG_PTRS. In practice, ROOT_IN_INODE and LONG_PTRS are coincident so this hasn't mattered. However, we're about to refactor both of those flags into separate btree_ops fields so we want this the logic to make sense afterwards. Fixes: 858333dcf021a ("xfs: check btree block ownership with bnobt/rmapbt when scrubbing btree") Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/btree.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c index 1935b9ce1885..c3a9f33e5a8d 100644 --- a/fs/xfs/scrub/btree.c +++ b/fs/xfs/scrub/btree.c @@ -385,7 +385,12 @@ xchk_btree_check_block_owner( agno = xfs_daddr_to_agno(bs->cur->bc_mp, daddr); agbno = xfs_daddr_to_agbno(bs->cur->bc_mp, daddr); - init_sa = bs->cur->bc_flags & XFS_BTREE_LONG_PTRS; + /* + * If the btree being examined is not itself a per-AG btree, initialize + * sc->sa so that we can check for the presence of an ownership record + * in the rmap btree for the AG containing the block. + */ + init_sa = bs->cur->bc_flags & XFS_BTREE_ROOT_IN_INODE; if (init_sa) { error = xchk_ag_init_existing(bs->sc, agno, &bs->sc->sa); if (!xchk_btree_xref_process_error(bs->sc, bs->cur, -- cgit v1.2.3 From fd9c7f7722d815527269b80d9990aecffa06957c Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 22 Feb 2024 12:34:29 -0800 Subject: xfs: encode the btree geometry flags in the btree ops structure Certain btree flags never change for the life of a btree cursor because they describe the geometry of the btree itself. Encode these in the btree ops structure and reduce the amount of code required in each btree type's init_cursor functions. This also frees up most of the bits in bc_flags. 
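As a self-contained analogue of the end state (simplified; BTGEO_* mirrors the patch's XFS_BTGEO_* values, and the structs are cut down to the relevant members): immutable geometry lives in one shared const ops table, while the cursor keeps only genuinely mutable state such as XFS_BTREE_STAGING.

#include <stdio.h>

#define BTGEO_LONG_PTRS		(1U << 0)	/* mirrors XFS_BTGEO_LONG_PTRS */
#define BTGEO_OVERLAPPING	(1U << 3)	/* mirrors XFS_BTGEO_OVERLAPPING */

struct btree_ops {
	unsigned int	geom_flags;	/* fixed for the life of the btree type */
};

struct btree_cur {
	const struct btree_ops	*ops;
	unsigned int		flags;	/* mutable per-cursor state only */
};

/* One static const table serves every cursor of this btree type. */
static const struct btree_ops rmapbt_ops = {
	.geom_flags = BTGEO_OVERLAPPING,
};

int
main(void)
{
	struct btree_cur cur = { .ops = &rmapbt_ops, .flags = 0 };

	/* Geometry tests now read the ops table, not per-cursor flags. */
	if (cur.ops->geom_flags & BTGEO_OVERLAPPING)
		puts("overlapping btree: two keys per pointer");
	if (!(cur.ops->geom_flags & BTGEO_LONG_PTRS))
		puts("short pointers: AG-relative addressing");
	return 0;
}

Because the table is const and shared, the geometry can never drift between cursors of the same type, and the vacated bc_flags bits become available for real run-time state.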
A previous version of this patch also converted the open-coded flags logic to helpers. This was removed due to the pending refactoring (that follows this patch) to eliminate most of the state flags. Conversion script: sed \ -e 's/XFS_BTREE_LONG_PTRS/XFS_BTGEO_LONG_PTRS/g' \ -e 's/XFS_BTREE_ROOT_IN_INODE/XFS_BTGEO_ROOT_IN_INODE/g' \ -e 's/XFS_BTREE_LASTREC_UPDATE/XFS_BTGEO_LASTREC_UPDATE/g' \ -e 's/XFS_BTREE_OVERLAPPING/XFS_BTGEO_OVERLAPPING/g' \ -e 's/cur->bc_flags & XFS_BTGEO_/cur->bc_ops->geom_flags \& XFS_BTGEO_/g' \ -i $(git ls-files fs/xfs/*.[ch] fs/xfs/libxfs/*.[ch] fs/xfs/scrub/*.[ch]) Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_alloc_btree.c | 4 +- fs/xfs/libxfs/xfs_bmap.c | 4 +- fs/xfs/libxfs/xfs_bmap_btree.c | 6 +-- fs/xfs/libxfs/xfs_btree.c | 110 +++++++++++++++++++------------------- fs/xfs/libxfs/xfs_btree.h | 23 ++++---- fs/xfs/libxfs/xfs_btree_staging.c | 14 ++--- fs/xfs/libxfs/xfs_btree_staging.h | 2 +- fs/xfs/libxfs/xfs_rmap_btree.c | 3 +- fs/xfs/scrub/btree.c | 24 ++++----- fs/xfs/scrub/newbt.c | 2 +- fs/xfs/scrub/trace.c | 2 +- fs/xfs/xfs_trace.h | 8 +-- 12 files changed, 104 insertions(+), 98 deletions(-) diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index 93a6be0d6cde..0ed1477187bc 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -478,6 +478,8 @@ static const struct xfs_btree_ops xfs_bnobt_ops = { }; static const struct xfs_btree_ops xfs_cntbt_ops = { + .geom_flags = XFS_BTGEO_LASTREC_UPDATE, + .rec_len = sizeof(xfs_alloc_rec_t), .key_len = sizeof(xfs_alloc_key_t), @@ -516,7 +518,6 @@ xfs_allocbt_init_common( cur = xfs_btree_alloc_cursor(mp, tp, btnum, &xfs_cntbt_ops, mp->m_alloc_maxlevels, xfs_allocbt_cur_cache); cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtc_2); - cur->bc_flags = XFS_BTREE_LASTREC_UPDATE; } else { cur = xfs_btree_alloc_cursor(mp, tp, btnum, &xfs_bnobt_ops, mp->m_alloc_maxlevels, xfs_allocbt_cur_cache); @@ -591,7 +592,6 @@ xfs_allocbt_commit_staged_btree( if (cur->bc_btnum == XFS_BTNUM_BNO) { xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_bnobt_ops); } else { - cur->bc_flags |= XFS_BTREE_LASTREC_UPDATE; xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_cntbt_ops); } } diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 314a2144622a..8d71c643db7d 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -646,7 +646,7 @@ xfs_bmap_extents_to_btree( block = ifp->if_broot; xfs_btree_init_block_int(mp, block, XFS_BUF_DADDR_NULL, XFS_BTNUM_BMAP, 1, 1, ip->i_ino, - XFS_BTREE_LONG_PTRS); + XFS_BTGEO_LONG_PTRS); /* * Need a cursor. Can't allocate until bb_level is filled in. 
*/ @@ -693,7 +693,7 @@ xfs_bmap_extents_to_btree( ablock = XFS_BUF_TO_BLOCK(abp); xfs_btree_init_block_int(mp, ablock, xfs_buf_daddr(abp), XFS_BTNUM_BMAP, 0, 0, ip->i_ino, - XFS_BTREE_LONG_PTRS); + XFS_BTGEO_LONG_PTRS); for_each_xfs_iext(ifp, &icur, &rec) { if (isnullstartblock(rec.br_startblock)) diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index 0bc64c07fa1c..69056fe8a51e 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -46,7 +46,7 @@ xfs_bmdr_to_bmbt( xfs_btree_init_block_int(mp, rblock, XFS_BUF_DADDR_NULL, XFS_BTNUM_BMAP, 0, 0, ip->i_ino, - XFS_BTREE_LONG_PTRS); + XFS_BTGEO_LONG_PTRS); rblock->bb_level = dblock->bb_level; ASSERT(be16_to_cpu(rblock->bb_level) > 0); rblock->bb_numrecs = dblock->bb_numrecs; @@ -516,6 +516,8 @@ xfs_bmbt_keys_contiguous( } static const struct xfs_btree_ops xfs_bmbt_ops = { + .geom_flags = XFS_BTGEO_LONG_PTRS | XFS_BTGEO_ROOT_IN_INODE, + .rec_len = sizeof(xfs_bmbt_rec_t), .key_len = sizeof(xfs_bmbt_key_t), @@ -553,8 +555,6 @@ xfs_bmbt_init_common( mp->m_bm_maxlevels[whichfork], xfs_bmbt_cur_cache); cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_bmbt_2); - cur->bc_flags = XFS_BTREE_LONG_PTRS | XFS_BTREE_ROOT_IN_INODE; - cur->bc_ino.ip = ip; cur->bc_ino.allocated = 0; cur->bc_ino.flags = 0; diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 45b5cfffa8dd..a80f8a7a6e72 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -261,7 +261,7 @@ xfs_btree_check_block( int level, /* level of the btree block */ struct xfs_buf *bp) /* buffer containing block, if any */ { - if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + if (cur->bc_ops->geom_flags & XFS_BTGEO_LONG_PTRS) return xfs_btree_check_lblock(cur, block, level, bp); else return xfs_btree_check_sblock(cur, block, level, bp); @@ -302,7 +302,7 @@ xfs_btree_check_ptr( int index, int level) { - if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { + if (cur->bc_ops->geom_flags & XFS_BTGEO_LONG_PTRS) { if (xfs_btree_check_lptr(cur, be64_to_cpu((&ptr->l)[index]), level)) return 0; @@ -458,7 +458,7 @@ xfs_btree_del_cursor( xfs_is_shutdown(cur->bc_mp) || error != 0); if (unlikely(cur->bc_flags & XFS_BTREE_STAGING)) kfree(cur->bc_ops); - if (!(cur->bc_flags & XFS_BTREE_LONG_PTRS) && cur->bc_ag.pag) + if (!(cur->bc_ops->geom_flags & XFS_BTGEO_LONG_PTRS) && cur->bc_ag.pag) xfs_perag_put(cur->bc_ag.pag); kmem_cache_free(cur->bc_cache, cur); } @@ -547,7 +547,7 @@ xfs_btree_dup_cursor( * record, key or pointer (xfs_btree_*_addr). Note that all addressing * inside the btree block is done using indices starting at one, not zero! * - * If XFS_BTREE_OVERLAPPING is set, then this btree supports keys containing + * If XFS_BTGEO_OVERLAPPING is set, then this btree supports keys containing * overlapping intervals. In such a tree, records are still sorted lowest to * highest and indexed by the smallest key value that refers to the record. * However, nodes are different: each pointer has two associated keys -- one @@ -597,7 +597,7 @@ xfs_btree_dup_cursor( */ static inline size_t xfs_btree_block_len(struct xfs_btree_cur *cur) { - if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { + if (cur->bc_ops->geom_flags & XFS_BTGEO_LONG_PTRS) { if (xfs_has_crc(cur->bc_mp)) return XFS_BTREE_LBLOCK_CRC_LEN; return XFS_BTREE_LBLOCK_LEN; @@ -612,7 +612,7 @@ static inline size_t xfs_btree_block_len(struct xfs_btree_cur *cur) */ static inline size_t xfs_btree_ptr_len(struct xfs_btree_cur *cur) { - return (cur->bc_flags & XFS_BTREE_LONG_PTRS) ? 
+ return (cur->bc_ops->geom_flags & XFS_BTGEO_LONG_PTRS) ? sizeof(__be64) : sizeof(__be32); } @@ -726,7 +726,7 @@ struct xfs_ifork * xfs_btree_ifork_ptr( struct xfs_btree_cur *cur) { - ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE); + ASSERT(cur->bc_ops->geom_flags & XFS_BTGEO_ROOT_IN_INODE); if (cur->bc_flags & XFS_BTREE_STAGING) return cur->bc_ino.ifake->if_fork; @@ -758,7 +758,7 @@ xfs_btree_get_block( int level, /* level in btree */ struct xfs_buf **bpp) /* buffer containing the block */ { - if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && + if ((cur->bc_ops->geom_flags & XFS_BTGEO_ROOT_IN_INODE) && (level == cur->bc_nlevels - 1)) { *bpp = NULL; return xfs_btree_get_iroot(cur); @@ -1001,7 +1001,7 @@ xfs_btree_readahead( * No readahead needed if we are at the root level and the * btree root is stored in the inode. */ - if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && + if ((cur->bc_ops->geom_flags & XFS_BTGEO_ROOT_IN_INODE) && (lev == cur->bc_nlevels - 1)) return 0; @@ -1011,7 +1011,7 @@ xfs_btree_readahead( cur->bc_levels[lev].ra |= lr; block = XFS_BUF_TO_BLOCK(cur->bc_levels[lev].bp); - if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + if (cur->bc_ops->geom_flags & XFS_BTGEO_LONG_PTRS) return xfs_btree_readahead_lblock(cur, lr, block); return xfs_btree_readahead_sblock(cur, lr, block); } @@ -1030,7 +1030,7 @@ xfs_btree_ptr_to_daddr( if (error) return error; - if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { + if (cur->bc_ops->geom_flags & XFS_BTGEO_LONG_PTRS) { fsbno = be64_to_cpu(ptr->l); *daddr = XFS_FSB_TO_DADDR(cur->bc_mp, fsbno); } else { @@ -1080,7 +1080,7 @@ xfs_btree_setbuf( cur->bc_levels[lev].ra = 0; b = XFS_BUF_TO_BLOCK(bp); - if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { + if (cur->bc_ops->geom_flags & XFS_BTGEO_LONG_PTRS) { if (b->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK)) cur->bc_levels[lev].ra |= XFS_BTCUR_LEFTRA; if (b->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK)) @@ -1098,7 +1098,7 @@ xfs_btree_ptr_is_null( struct xfs_btree_cur *cur, const union xfs_btree_ptr *ptr) { - if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + if (cur->bc_ops->geom_flags & XFS_BTGEO_LONG_PTRS) return ptr->l == cpu_to_be64(NULLFSBLOCK); else return ptr->s == cpu_to_be32(NULLAGBLOCK); @@ -1109,7 +1109,7 @@ xfs_btree_set_ptr_null( struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr) { - if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + if (cur->bc_ops->geom_flags & XFS_BTGEO_LONG_PTRS) ptr->l = cpu_to_be64(NULLFSBLOCK); else ptr->s = cpu_to_be32(NULLAGBLOCK); @@ -1127,7 +1127,7 @@ xfs_btree_get_sibling( { ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB); - if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { + if (cur->bc_ops->geom_flags & XFS_BTGEO_LONG_PTRS) { if (lr == XFS_BB_RIGHTSIB) ptr->l = block->bb_u.l.bb_rightsib; else @@ -1149,7 +1149,7 @@ xfs_btree_set_sibling( { ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB); - if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { + if (cur->bc_ops->geom_flags & XFS_BTGEO_LONG_PTRS) { if (lr == XFS_BB_RIGHTSIB) block->bb_u.l.bb_rightsib = ptr->l; else @@ -1171,16 +1171,16 @@ xfs_btree_init_block_int( __u16 level, __u16 numrecs, __u64 owner, - unsigned int flags) + unsigned int geom_flags) { - int crc = xfs_has_crc(mp); + bool crc = xfs_has_crc(mp); __u32 magic = xfs_btree_magic(crc, btnum); buf->bb_magic = cpu_to_be32(magic); buf->bb_level = cpu_to_be16(level); buf->bb_numrecs = cpu_to_be16(numrecs); - if (flags & XFS_BTREE_LONG_PTRS) { + if (geom_flags & XFS_BTGEO_LONG_PTRS) { buf->bb_u.l.bb_leftsib = cpu_to_be64(NULLFSBLOCK); buf->bb_u.l.bb_rightsib = cpu_to_be64(NULLFSBLOCK); 
if (crc) { @@ -1233,14 +1233,14 @@ xfs_btree_init_block_cur( * change in future, but is safe for current users of the generic btree * code. */ - if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + if (cur->bc_ops->geom_flags & XFS_BTGEO_LONG_PTRS) owner = cur->bc_ino.ip->i_ino; else owner = cur->bc_ag.pag->pag_agno; xfs_btree_init_block_int(cur->bc_mp, XFS_BUF_TO_BLOCK(bp), xfs_buf_daddr(bp), cur->bc_btnum, level, - numrecs, owner, cur->bc_flags); + numrecs, owner, cur->bc_ops->geom_flags); } /* @@ -1258,7 +1258,7 @@ xfs_btree_is_lastrec( if (level > 0) return 0; - if (!(cur->bc_flags & XFS_BTREE_LASTREC_UPDATE)) + if (!(cur->bc_ops->geom_flags & XFS_BTGEO_LASTREC_UPDATE)) return 0; xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB); @@ -1273,7 +1273,7 @@ xfs_btree_buf_to_ptr( struct xfs_buf *bp, union xfs_btree_ptr *ptr) { - if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + if (cur->bc_ops->geom_flags & XFS_BTGEO_LONG_PTRS) ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp))); else { @@ -1591,7 +1591,7 @@ xfs_btree_log_block( nbits = XFS_BB_NUM_BITS; } xfs_btree_offsets(fields, - (cur->bc_flags & XFS_BTREE_LONG_PTRS) ? + (cur->bc_ops->geom_flags & XFS_BTGEO_LONG_PTRS) ? loffsets : soffsets, nbits, &first, &last); xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF); @@ -1668,7 +1668,7 @@ xfs_btree_increment( * confused or have the tree root in an inode. */ if (lev == cur->bc_nlevels) { - if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) + if (cur->bc_ops->geom_flags & XFS_BTGEO_ROOT_IN_INODE) goto out0; ASSERT(0); xfs_btree_mark_sick(cur); @@ -1762,7 +1762,7 @@ xfs_btree_decrement( * or the root of the tree is in an inode. */ if (lev == cur->bc_nlevels) { - if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) + if (cur->bc_ops->geom_flags & XFS_BTGEO_ROOT_IN_INODE) goto out0; ASSERT(0); xfs_btree_mark_sick(cur); @@ -1810,7 +1810,7 @@ xfs_btree_lookup_get_block( int error = 0; /* special case the root block if in an inode */ - if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && + if ((cur->bc_ops->geom_flags & XFS_BTGEO_ROOT_IN_INODE) && (level == cur->bc_nlevels - 1)) { *blkp = xfs_btree_get_iroot(cur); return 0; @@ -1838,7 +1838,7 @@ xfs_btree_lookup_get_block( /* Check the inode owner since the verifiers don't. 
*/ if (xfs_has_crc(cur->bc_mp) && !(cur->bc_ino.flags & XFS_BTCUR_BMBT_INVALID_OWNER) && - (cur->bc_flags & XFS_BTREE_LONG_PTRS) && + (cur->bc_ops->geom_flags & XFS_BTGEO_LONG_PTRS) && be64_to_cpu((*blkp)->bb_u.l.bb_owner) != cur->bc_ino.ip->i_ino) goto out_bad; @@ -2058,7 +2058,7 @@ xfs_btree_high_key_from_key( struct xfs_btree_cur *cur, union xfs_btree_key *key) { - ASSERT(cur->bc_flags & XFS_BTREE_OVERLAPPING); + ASSERT(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING); return (union xfs_btree_key *)((char *)key + (cur->bc_ops->key_len / 2)); } @@ -2079,7 +2079,7 @@ xfs_btree_get_leaf_keys( rec = xfs_btree_rec_addr(cur, 1, block); cur->bc_ops->init_key_from_rec(key, rec); - if (cur->bc_flags & XFS_BTREE_OVERLAPPING) { + if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) { cur->bc_ops->init_high_key_from_rec(&max_hkey, rec); for (n = 2; n <= xfs_btree_get_numrecs(block); n++) { @@ -2106,7 +2106,7 @@ xfs_btree_get_node_keys( union xfs_btree_key *high; int n; - if (cur->bc_flags & XFS_BTREE_OVERLAPPING) { + if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) { memcpy(key, xfs_btree_key_addr(cur, 1, block), cur->bc_ops->key_len / 2); @@ -2150,7 +2150,7 @@ xfs_btree_needs_key_update( struct xfs_btree_cur *cur, int ptr) { - return (cur->bc_flags & XFS_BTREE_OVERLAPPING) || ptr == 1; + return (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) || ptr == 1; } /* @@ -2174,7 +2174,7 @@ __xfs_btree_updkeys( struct xfs_buf *bp; int ptr; - ASSERT(cur->bc_flags & XFS_BTREE_OVERLAPPING); + ASSERT(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING); /* Exit if there aren't any parent levels to update. */ if (level + 1 >= cur->bc_nlevels) @@ -2243,7 +2243,7 @@ xfs_btree_update_keys( ASSERT(level >= 0); block = xfs_btree_get_block(cur, level, &bp); - if (cur->bc_flags & XFS_BTREE_OVERLAPPING) + if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) return __xfs_btree_updkeys(cur, level, block, bp, false); /* @@ -2350,7 +2350,7 @@ xfs_btree_lshift( int error; /* error return value */ int i; - if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && + if ((cur->bc_ops->geom_flags & XFS_BTGEO_ROOT_IN_INODE) && level == cur->bc_nlevels - 1) goto out0; @@ -2478,7 +2478,7 @@ xfs_btree_lshift( * Using a temporary cursor, update the parent key values of the * block on the left. */ - if (cur->bc_flags & XFS_BTREE_OVERLAPPING) { + if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) { error = xfs_btree_dup_cursor(cur, &tcur); if (error) goto error0; @@ -2546,7 +2546,7 @@ xfs_btree_rshift( int error; /* error return value */ int i; /* loop counter */ - if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && + if ((cur->bc_ops->geom_flags & XFS_BTGEO_ROOT_IN_INODE) && (level == cur->bc_nlevels - 1)) goto out0; @@ -2665,7 +2665,7 @@ xfs_btree_rshift( goto error1; /* Update the parent high keys of the left block, if needed. */ - if (cur->bc_flags & XFS_BTREE_OVERLAPPING) { + if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) { error = xfs_btree_update_keys(cur, level); if (error) goto error1; @@ -2857,7 +2857,7 @@ __xfs_btree_split( } /* Update the parent high keys of the left block, if needed. 
*/ - if (cur->bc_flags & XFS_BTREE_OVERLAPPING) { + if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) { error = xfs_btree_update_keys(cur, level); if (error) goto error0; @@ -3022,7 +3022,7 @@ xfs_btree_new_iroot( XFS_BTREE_STATS_INC(cur, newroot); - ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE); + ASSERT(cur->bc_ops->geom_flags & XFS_BTGEO_ROOT_IN_INODE); level = cur->bc_nlevels - 1; @@ -3050,7 +3050,7 @@ xfs_btree_new_iroot( memcpy(cblock, block, xfs_btree_block_len(cur)); if (xfs_has_crc(cur->bc_mp)) { __be64 bno = cpu_to_be64(xfs_buf_daddr(cbp)); - if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + if (cur->bc_ops->geom_flags & XFS_BTGEO_LONG_PTRS) cblock->bb_u.l.bb_blkno = bno; else cblock->bb_u.s.bb_blkno = bno; @@ -3247,7 +3247,7 @@ xfs_btree_make_block_unfull( { int error = 0; - if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && + if ((cur->bc_ops->geom_flags & XFS_BTGEO_ROOT_IN_INODE) && level == cur->bc_nlevels - 1) { struct xfs_inode *ip = cur->bc_ino.ip; @@ -3333,7 +3333,7 @@ xfs_btree_insrec( * If we have an external root pointer, and we've made it to the * root level, allocate a new root block and we're done. */ - if (!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && + if (!(cur->bc_ops->geom_flags & XFS_BTGEO_ROOT_IN_INODE) && (level >= cur->bc_nlevels)) { error = xfs_btree_new_root(cur, stat); xfs_btree_set_ptr_null(cur, ptrp); @@ -3621,7 +3621,7 @@ xfs_btree_kill_iroot( #endif int i; - ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE); + ASSERT(cur->bc_ops->geom_flags & XFS_BTGEO_ROOT_IN_INODE); ASSERT(cur->bc_nlevels > 1); /* @@ -3858,7 +3858,7 @@ xfs_btree_delrec( * nothing left to do. */ if (level == cur->bc_nlevels - 1) { - if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) { + if (cur->bc_ops->geom_flags & XFS_BTGEO_ROOT_IN_INODE) { xfs_iroot_realloc(cur->bc_ino.ip, -1, cur->bc_ino.whichfork); @@ -3926,7 +3926,7 @@ xfs_btree_delrec( xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB); xfs_btree_get_sibling(cur, block, &lptr, XFS_BB_LEFTSIB); - if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) { + if (cur->bc_ops->geom_flags & XFS_BTGEO_ROOT_IN_INODE) { /* * One child of root, need to get a chance to copy its contents * into the root and delete it. Can't go up to next level, @@ -4243,7 +4243,7 @@ xfs_btree_delrec( * If we joined with the right neighbor and there's a level above * us, increment the cursor at that level. */ - else if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) || + else if ((cur->bc_ops->geom_flags & XFS_BTGEO_ROOT_IN_INODE) || (level + 1 < cur->bc_nlevels)) { error = xfs_btree_increment(cur, level + 1, &i); if (error) @@ -4312,7 +4312,7 @@ xfs_btree_delete( * If we combined blocks as part of deleting the record, delrec won't * have updated the parent high keys so we have to do that here. */ - if (joined && (cur->bc_flags & XFS_BTREE_OVERLAPPING)) { + if (joined && (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING)) { error = xfs_btree_updkeys_force(cur, 0); if (error) goto error0; @@ -4409,7 +4409,7 @@ xfs_btree_visit_block( * return the same block without checking if the right sibling points * back to us and creates a cyclic reference in the btree. 
*/ - if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { + if (cur->bc_ops->geom_flags & XFS_BTGEO_LONG_PTRS) { if (be64_to_cpu(rptr.l) == XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp))) { xfs_btree_mark_sick(cur); @@ -4517,7 +4517,7 @@ xfs_btree_block_change_owner( /* modify the owner */ block = xfs_btree_get_block(cur, level, &bp); - if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { + if (cur->bc_ops->geom_flags & XFS_BTGEO_LONG_PTRS) { if (block->bb_u.l.bb_owner == cpu_to_be64(bbcoi->new_owner)) return 0; block->bb_u.l.bb_owner = cpu_to_be64(bbcoi->new_owner); @@ -4535,7 +4535,7 @@ xfs_btree_block_change_owner( * though, so everything is consistent in memory. */ if (!bp) { - ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE); + ASSERT(cur->bc_ops->geom_flags & XFS_BTGEO_ROOT_IN_INODE); ASSERT(level == cur->bc_nlevels - 1); return 0; } @@ -5012,7 +5012,7 @@ xfs_btree_query_range( if (!xfs_btree_keycmp_le(cur, &low_key, &high_key)) return -EINVAL; - if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING)) + if (!(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING)) return xfs_btree_simple_query_range(cur, &low_key, &high_key, fn, priv); return xfs_btree_overlapped_query_range(cur, &low_key, &high_key, @@ -5066,7 +5066,7 @@ xfs_btree_diff_two_ptrs( const union xfs_btree_ptr *a, const union xfs_btree_ptr *b) { - if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + if (cur->bc_ops->geom_flags & XFS_BTGEO_LONG_PTRS) return (int64_t)be64_to_cpu(a->l) - be64_to_cpu(b->l); return (int64_t)be32_to_cpu(a->s) - be32_to_cpu(b->s); } @@ -5120,7 +5120,7 @@ xfs_btree_has_records_helper( key_contig = cur->bc_ops->keys_contiguous(cur, &info->high_key, &rec_key, info->key_mask); if (key_contig == XBTREE_KEY_OVERLAP && - !(cur->bc_flags & XFS_BTREE_OVERLAPPING)) + !(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING)) return -EFSCORRUPTED; if (key_contig == XBTREE_KEY_GAP) return -ECANCELED; @@ -5214,7 +5214,7 @@ xfs_btree_has_more_records( return true; /* There are more record blocks. */ - if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + if (cur->bc_ops->geom_flags & XFS_BTGEO_LONG_PTRS) return block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK); else return block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK); diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 36fd07b32daf..5a292d7a7096 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -112,6 +112,9 @@ static inline enum xbtree_key_contig xbtree_key_contig(uint64_t x, uint64_t y) } struct xfs_btree_ops { + /* XFS_BTGEO_* flags that determine the geometry of the btree */ + unsigned int geom_flags; + /* size of the key and record structures */ size_t key_len; size_t rec_len; @@ -199,6 +202,12 @@ struct xfs_btree_ops { const union xfs_btree_key *mask); }; +/* btree geometry flags */ +#define XFS_BTGEO_LONG_PTRS (1U << 0) /* pointers are 64bits long */ +#define XFS_BTGEO_ROOT_IN_INODE (1U << 1) /* root may be variable size */ +#define XFS_BTGEO_LASTREC_UPDATE (1U << 2) /* track last rec externally */ +#define XFS_BTGEO_OVERLAPPING (1U << 3) /* overlapping intervals */ + /* * Reasons for the update_lastrec method to be called. */ @@ -281,7 +290,7 @@ struct xfs_btree_cur /* * Short btree pointers need an agno to be able to turn the pointers * into physical addresses for IO, so the btree cursor switches between - * bc_ino and bc_ag based on whether XFS_BTREE_LONG_PTRS is set for the + * bc_ino and bc_ag based on whether XFS_BTGEO_LONG_PTRS is set for the * cursor. 
*/ union { @@ -304,17 +313,13 @@ xfs_btree_cur_sizeof(unsigned int nlevels) return struct_size_t(struct xfs_btree_cur, bc_levels, nlevels); } -/* cursor flags */ -#define XFS_BTREE_LONG_PTRS (1<<0) /* pointers are 64bits long */ -#define XFS_BTREE_ROOT_IN_INODE (1<<1) /* root may be variable size */ -#define XFS_BTREE_LASTREC_UPDATE (1<<2) /* track last rec externally */ -#define XFS_BTREE_OVERLAPPING (1<<4) /* overlapping intervals */ +/* cursor state flags */ /* * The root of this btree is a fakeroot structure so that we can stage a btree * rebuild without leaving it accessible via primary metadata. The ops struct * is dynamically allocated and must be freed when the cursor is deleted. */ -#define XFS_BTREE_STAGING (1<<5) +#define XFS_BTREE_STAGING (1U << 0) #define XFS_BTREE_NOERROR 0 #define XFS_BTREE_ERROR 1 @@ -447,7 +452,7 @@ xfs_btree_init_block_int( __u16 level, __u16 numrecs, __u64 owner, - unsigned int flags); + unsigned int geom_flags); /* * Common btree core entry points. @@ -689,7 +694,7 @@ xfs_btree_islastblock( block = xfs_btree_get_block(cur, level, &bp); - if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + if (cur->bc_ops->geom_flags & XFS_BTGEO_LONG_PTRS) return block->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK); return block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK); } diff --git a/fs/xfs/libxfs/xfs_btree_staging.c b/fs/xfs/libxfs/xfs_btree_staging.c index f0c69f9bb169..5785e87f7880 100644 --- a/fs/xfs/libxfs/xfs_btree_staging.c +++ b/fs/xfs/libxfs/xfs_btree_staging.c @@ -136,7 +136,7 @@ xfs_btree_stage_afakeroot( struct xfs_btree_ops *nops; ASSERT(!(cur->bc_flags & XFS_BTREE_STAGING)); - ASSERT(!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)); + ASSERT(!(cur->bc_ops->geom_flags & XFS_BTGEO_ROOT_IN_INODE)); ASSERT(cur->bc_tp == NULL); nops = kmalloc(sizeof(struct xfs_btree_ops), GFP_KERNEL | __GFP_NOFAIL); @@ -217,7 +217,7 @@ xfs_btree_stage_ifakeroot( struct xfs_btree_ops *nops; ASSERT(!(cur->bc_flags & XFS_BTREE_STAGING)); - ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE); + ASSERT(cur->bc_ops->geom_flags & XFS_BTGEO_ROOT_IN_INODE); ASSERT(cur->bc_tp == NULL); nops = kmalloc(sizeof(struct xfs_btree_ops), GFP_KERNEL | __GFP_NOFAIL); @@ -397,7 +397,7 @@ xfs_btree_bload_prep_block( struct xfs_btree_block *new_block; int ret; - if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && + if ((cur->bc_ops->geom_flags & XFS_BTGEO_ROOT_IN_INODE) && level == cur->bc_nlevels - 1) { struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur); size_t new_size; @@ -413,7 +413,7 @@ xfs_btree_bload_prep_block( xfs_btree_init_block_int(cur->bc_mp, ifp->if_broot, XFS_BUF_DADDR_NULL, cur->bc_btnum, level, nr_this_block, cur->bc_ino.ip->i_ino, - cur->bc_flags); + cur->bc_ops->geom_flags); *bpp = NULL; *blockp = ifp->if_broot; @@ -704,7 +704,7 @@ xfs_btree_bload_compute_geometry( xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level, &avg_per_block, &level_blocks, &dontcare64); - if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) { + if (cur->bc_ops->geom_flags & XFS_BTGEO_ROOT_IN_INODE) { /* * If all the items we want to store at this level * would fit in the inode root block, then we have our @@ -763,7 +763,7 @@ xfs_btree_bload_compute_geometry( return -EOVERFLOW; bbl->btree_height = cur->bc_nlevels; - if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) + if (cur->bc_ops->geom_flags & XFS_BTGEO_ROOT_IN_INODE) bbl->nr_blocks = nr_blocks - 1; else bbl->nr_blocks = nr_blocks; @@ -890,7 +890,7 @@ xfs_btree_bload( } /* Initialize the new root. 
*/ - if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) { + if (cur->bc_ops->geom_flags & XFS_BTGEO_ROOT_IN_INODE) { ASSERT(xfs_btree_ptr_is_null(cur, &ptr)); cur->bc_ino.ifake->if_levels = cur->bc_nlevels; cur->bc_ino.ifake->if_blocks = total_blocks - 1; diff --git a/fs/xfs/libxfs/xfs_btree_staging.h b/fs/xfs/libxfs/xfs_btree_staging.h index 055ea43b1e18..9624ae06c83c 100644 --- a/fs/xfs/libxfs/xfs_btree_staging.h +++ b/fs/xfs/libxfs/xfs_btree_staging.h @@ -76,7 +76,7 @@ struct xfs_btree_bload { /* * This function should return the size of the in-core btree root - * block. It is only necessary for XFS_BTREE_ROOT_IN_INODE btree + * block. It is only necessary for XFS_BTGEO_ROOT_IN_INODE btree * types. */ xfs_btree_bload_iroot_size_fn iroot_size; diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index 058305fb070b..776ab6b2be31 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -473,6 +473,8 @@ xfs_rmapbt_keys_contiguous( } static const struct xfs_btree_ops xfs_rmapbt_ops = { + .geom_flags = XFS_BTGEO_OVERLAPPING, + .rec_len = sizeof(struct xfs_rmap_rec), .key_len = 2 * sizeof(struct xfs_rmap_key), @@ -505,7 +507,6 @@ xfs_rmapbt_init_common( /* Overlapping btree; 2 keys per pointer. */ cur = xfs_btree_alloc_cursor(mp, tp, XFS_BTNUM_RMAP, &xfs_rmapbt_ops, mp->m_rmap_maxlevels, xfs_rmapbt_cur_cache); - cur->bc_flags = XFS_BTREE_OVERLAPPING; cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_rmap_2); cur->bc_ag.pag = xfs_perag_hold(pag); diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c index c3a9f33e5a8d..e882919996c4 100644 --- a/fs/xfs/scrub/btree.c +++ b/fs/xfs/scrub/btree.c @@ -47,7 +47,7 @@ __xchk_btree_process_error( *error = 0; fallthrough; default: - if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) + if (cur->bc_ops->geom_flags & XFS_BTGEO_ROOT_IN_INODE) trace_xchk_ifork_btree_op_error(sc, cur, level, *error, ret_ip); else @@ -91,7 +91,7 @@ __xchk_btree_set_corrupt( { sc->sm->sm_flags |= errflag; - if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) + if (cur->bc_ops->geom_flags & XFS_BTGEO_ROOT_IN_INODE) trace_xchk_ifork_btree_error(sc, cur, level, ret_ip); else @@ -168,7 +168,7 @@ xchk_btree_rec( if (xfs_btree_keycmp_lt(cur, &key, keyp)) xchk_btree_set_corrupt(bs->sc, cur, 1); - if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING)) + if (!(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING)) return; /* Is high_key(rec) no larger than the parent high key? */ @@ -215,7 +215,7 @@ xchk_btree_key( if (xfs_btree_keycmp_lt(cur, key, keyp)) xchk_btree_set_corrupt(bs->sc, cur, level); - if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING)) + if (!(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING)) return; /* Is this block's high key no larger than the parent high key? */ @@ -239,12 +239,12 @@ xchk_btree_ptr_ok( bool res; /* A btree rooted in an inode has no block pointer to the root. */ - if ((bs->cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && + if ((bs->cur->bc_ops->geom_flags & XFS_BTGEO_ROOT_IN_INODE) && level == bs->cur->bc_nlevels) return true; /* Otherwise, check the pointers. */ - if (bs->cur->bc_flags & XFS_BTREE_LONG_PTRS) + if (bs->cur->bc_ops->geom_flags & XFS_BTGEO_LONG_PTRS) res = xfs_btree_check_lptr(bs->cur, be64_to_cpu(ptr->l), level); else res = xfs_btree_check_sptr(bs->cur, be32_to_cpu(ptr->s), level); @@ -390,7 +390,7 @@ xchk_btree_check_block_owner( * sc->sa so that we can check for the presence of an ownership record * in the rmap btree for the AG containing the block. 
*/ - init_sa = bs->cur->bc_flags & XFS_BTREE_ROOT_IN_INODE; + init_sa = bs->cur->bc_ops->geom_flags & XFS_BTGEO_ROOT_IN_INODE; if (init_sa) { error = xchk_ag_init_existing(bs->sc, agno, &bs->sc->sa); if (!xchk_btree_xref_process_error(bs->sc, bs->cur, @@ -434,7 +434,7 @@ xchk_btree_check_owner( * up. */ if (bp == NULL) { - if (!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)) + if (!(cur->bc_ops->geom_flags & XFS_BTGEO_ROOT_IN_INODE)) xchk_btree_set_corrupt(bs->sc, bs->cur, level); return 0; } @@ -513,7 +513,7 @@ xchk_btree_check_minrecs( * child block might be less than the standard minrecs, but that's ok * provided that there's only one direct child of the root. */ - if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && + if ((cur->bc_ops->geom_flags & XFS_BTGEO_ROOT_IN_INODE) && level == cur->bc_nlevels - 2) { struct xfs_btree_block *root_block; struct xfs_buf *root_bp; @@ -567,7 +567,7 @@ xchk_btree_block_check_keys( return; } - if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING)) + if (!(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING)) return; /* Make sure the high key of this block matches the parent. */ @@ -602,7 +602,7 @@ xchk_btree_get_block( return error; xfs_btree_get_block(bs->cur, level, pbp); - if (bs->cur->bc_flags & XFS_BTREE_LONG_PTRS) + if (bs->cur->bc_ops->geom_flags & XFS_BTGEO_LONG_PTRS) failed_at = __xfs_btree_check_lblock(bs->cur, *pblock, level, *pbp); else @@ -669,7 +669,7 @@ xchk_btree_block_keys( if (xfs_btree_keycmp_ne(cur, &block_keys, parent_keys)) xchk_btree_set_corrupt(bs->sc, cur, 1); - if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING)) + if (!(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING)) return; /* Get high keys */ diff --git a/fs/xfs/scrub/newbt.c b/fs/xfs/scrub/newbt.c index bb6d980b4fcd..84267be79dc1 100644 --- a/fs/xfs/scrub/newbt.c +++ b/fs/xfs/scrub/newbt.c @@ -535,7 +535,7 @@ xrep_newbt_claim_block( trace_xrep_newbt_claim_block(mp, resv->pag->pag_agno, agbno, 1, xnr->oinfo.oi_owner); - if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + if (cur->bc_ops->geom_flags & XFS_BTGEO_LONG_PTRS) ptr->l = cpu_to_be64(XFS_AGB_TO_FSB(mp, resv->pag->pag_agno, agbno)); else diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c index b8f3795f7d9b..a7b0d95b8d5d 100644 --- a/fs/xfs/scrub/trace.c +++ b/fs/xfs/scrub/trace.c @@ -37,7 +37,7 @@ xchk_btree_cur_fsbno( xfs_buf_daddr(cur->bc_levels[level].bp)); if (level == cur->bc_nlevels - 1 && - (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)) + (cur->bc_ops->geom_flags & XFS_BTGEO_ROOT_IN_INODE)) return XFS_INO_TO_FSB(cur->bc_mp, cur->bc_ino.ip->i_ino); return NULLFSBLOCK; diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 5115037c127b..921f4b6eee89 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -2510,7 +2510,7 @@ TRACE_EVENT(xfs_btree_alloc_block, ), TP_fast_assign( __entry->dev = cur->bc_mp->m_super->s_dev; - if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) { + if (cur->bc_ops->geom_flags & XFS_BTGEO_ROOT_IN_INODE) { __entry->agno = 0; __entry->ino = cur->bc_ino.ip->i_ino; } else { @@ -2520,7 +2520,7 @@ TRACE_EVENT(xfs_btree_alloc_block, __entry->btnum = cur->bc_btnum; __entry->error = error; if (!error && stat) { - if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { + if (cur->bc_ops->geom_flags & XFS_BTGEO_LONG_PTRS) { xfs_fsblock_t fsb = be64_to_cpu(ptr->l); __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, @@ -2557,7 +2557,7 @@ TRACE_EVENT(xfs_btree_free_block, __entry->dev = cur->bc_mp->m_super->s_dev; __entry->agno = xfs_daddr_to_agno(cur->bc_mp, xfs_buf_daddr(bp)); - if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) + if 
(cur->bc_ops->geom_flags & XFS_BTGEO_ROOT_IN_INODE) __entry->ino = cur->bc_ino.ip->i_ino; else __entry->ino = 0; @@ -4262,7 +4262,7 @@ TRACE_EVENT(xfs_btree_bload_block, __entry->level = level; __entry->block_idx = block_idx; __entry->nr_blocks = nr_blocks; - if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { + if (cur->bc_ops->geom_flags & XFS_BTGEO_LONG_PTRS) { xfs_fsblock_t fsb = be64_to_cpu(ptr->l); __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsb); -- cgit v1.2.3 From e9e66df8bfa4132d905543a6b099ec8a3380b732 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 22 Feb 2024 12:35:13 -0800 Subject: xfs: remove bc_ino.flags Just move the two flags into bc_flags where there is plenty of space. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_bmap.c | 27 +++++++++------------------ fs/xfs/libxfs/xfs_bmap_btree.c | 14 ++++---------- fs/xfs/libxfs/xfs_btree.c | 2 +- fs/xfs/libxfs/xfs_btree.h | 12 ++++++------ 4 files changed, 20 insertions(+), 35 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 8d71c643db7d..2b10198c8c03 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -651,7 +651,8 @@ xfs_bmap_extents_to_btree( * Need a cursor. Can't allocate until bb_level is filled in. */ cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); - cur->bc_ino.flags = wasdel ? XFS_BTCUR_BMBT_WASDEL : 0; + if (wasdel) + cur->bc_flags |= XFS_BTREE_BMBT_WASDEL; /* * Convert to a btree with two levels, one record in root. */ @@ -1449,8 +1450,7 @@ xfs_bmap_add_extent_delay_real( ASSERT(whichfork != XFS_ATTR_FORK); ASSERT(!isnullstartblock(new->br_startblock)); - ASSERT(!bma->cur || - (bma->cur->bc_ino.flags & XFS_BTCUR_BMBT_WASDEL)); + ASSERT(!bma->cur || (bma->cur->bc_flags & XFS_BTREE_BMBT_WASDEL)); XFS_STATS_INC(mp, xs_add_exlist); @@ -2709,7 +2709,7 @@ xfs_bmap_add_extent_hole_real( struct xfs_bmbt_irec old; ASSERT(!isnullstartblock(new->br_startblock)); - ASSERT(!cur || !(cur->bc_ino.flags & XFS_BTCUR_BMBT_WASDEL)); + ASSERT(!cur || !(cur->bc_flags & XFS_BTREE_BMBT_WASDEL)); XFS_STATS_INC(mp, xs_add_exlist); @@ -4229,9 +4229,8 @@ xfs_bmapi_allocate( */ bma->nallocs++; - if (bma->cur) - bma->cur->bc_ino.flags = - bma->wasdel ? 
XFS_BTCUR_BMBT_WASDEL : 0; + if (bma->cur && bma->wasdel) + bma->cur->bc_flags |= XFS_BTREE_BMBT_WASDEL; bma->got.br_startoff = bma->offset; bma->got.br_startblock = bma->blkno; @@ -4766,10 +4765,8 @@ xfs_bmapi_remap( ip->i_nblocks += len; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - if (ifp->if_format == XFS_DINODE_FMT_BTREE) { + if (ifp->if_format == XFS_DINODE_FMT_BTREE) cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); - cur->bc_ino.flags = 0; - } got.br_startoff = bno; got.br_startblock = startblock; @@ -5400,7 +5397,6 @@ __xfs_bunmapi( if (ifp->if_format == XFS_DINODE_FMT_BTREE) { ASSERT(ifp->if_format == XFS_DINODE_FMT_BTREE); cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); - cur->bc_ino.flags = 0; } else cur = NULL; @@ -5861,10 +5857,8 @@ xfs_bmap_collapse_extents( if (error) return error; - if (ifp->if_format == XFS_DINODE_FMT_BTREE) { + if (ifp->if_format == XFS_DINODE_FMT_BTREE) cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); - cur->bc_ino.flags = 0; - } if (!xfs_iext_lookup_extent(ip, ifp, *next_fsb, &icur, &got)) { *done = true; @@ -5978,10 +5972,8 @@ xfs_bmap_insert_extents( if (error) return error; - if (ifp->if_format == XFS_DINODE_FMT_BTREE) { + if (ifp->if_format == XFS_DINODE_FMT_BTREE) cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); - cur->bc_ino.flags = 0; - } if (*next_fsb == NULLFSBLOCK) { xfs_iext_last(ifp, &icur); @@ -6098,7 +6090,6 @@ xfs_bmap_split_extent( if (ifp->if_format == XFS_DINODE_FMT_BTREE) { cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); - cur->bc_ino.flags = 0; error = xfs_bmbt_lookup_eq(cur, &got, &i); if (error) goto del_cursor; diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index 69056fe8a51e..48617627bc83 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -171,13 +171,8 @@ xfs_bmbt_dup_cursor( new = xfs_bmbt_init_cursor(cur->bc_mp, cur->bc_tp, cur->bc_ino.ip, cur->bc_ino.whichfork); - - /* - * Copy the firstblock, dfops, and flags values, - * since init cursor doesn't get them. - */ - new->bc_ino.flags = cur->bc_ino.flags; - + new->bc_flags |= (cur->bc_flags & + (XFS_BTREE_BMBT_INVALID_OWNER | XFS_BTREE_BMBT_WASDEL)); return new; } @@ -211,7 +206,7 @@ xfs_bmbt_alloc_block( xfs_rmap_ino_bmbt_owner(&args.oinfo, cur->bc_ino.ip->i_ino, cur->bc_ino.whichfork); args.minlen = args.maxlen = args.prod = 1; - args.wasdel = cur->bc_ino.flags & XFS_BTCUR_BMBT_WASDEL; + args.wasdel = cur->bc_flags & XFS_BTREE_BMBT_WASDEL; if (!args.wasdel && args.tp->t_blk_res == 0) return -ENOSPC; @@ -557,7 +552,6 @@ xfs_bmbt_init_common( cur->bc_ino.ip = ip; cur->bc_ino.allocated = 0; - cur->bc_ino.flags = 0; return cur; } @@ -748,7 +742,7 @@ xfs_bmbt_change_owner( ASSERT(xfs_ifork_ptr(ip, whichfork)->if_format == XFS_DINODE_FMT_BTREE); cur = xfs_bmbt_init_cursor(ip->i_mount, tp, ip, whichfork); - cur->bc_ino.flags |= XFS_BTCUR_BMBT_INVALID_OWNER; + cur->bc_flags |= XFS_BTREE_BMBT_INVALID_OWNER; error = xfs_btree_change_owner(cur, new_owner, buffer_list); xfs_btree_del_cursor(cur, error); diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index a80f8a7a6e72..2ba597c9ac52 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -1837,7 +1837,7 @@ xfs_btree_lookup_get_block( /* Check the inode owner since the verifiers don't. 
*/ if (xfs_has_crc(cur->bc_mp) && - !(cur->bc_ino.flags & XFS_BTCUR_BMBT_INVALID_OWNER) && + !(cur->bc_flags & XFS_BTREE_BMBT_INVALID_OWNER) && (cur->bc_ops->geom_flags & XFS_BTGEO_LONG_PTRS) && be64_to_cpu((*blkp)->bb_u.l.bb_owner) != cur->bc_ino.ip->i_ino) diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 5a292d7a7096..17a0324a304e 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -249,12 +249,6 @@ struct xfs_btree_cur_ino { int allocated; short forksize; char whichfork; - char flags; -/* We are converting a delalloc reservation */ -#define XFS_BTCUR_BMBT_WASDEL (1 << 0) - -/* For extent swap, ignore owner check in verifier */ -#define XFS_BTCUR_BMBT_INVALID_OWNER (1 << 1) }; struct xfs_btree_level { @@ -321,6 +315,12 @@ xfs_btree_cur_sizeof(unsigned int nlevels) */ #define XFS_BTREE_STAGING (1U << 0) +/* We are converting a delalloc reservation (only for bmbt btrees) */ +#define XFS_BTREE_BMBT_WASDEL (1U << 1) + +/* For extent swap, ignore owner check in verifier (only for bmbt btrees) */ +#define XFS_BTREE_BMBT_INVALID_OWNER (1U << 2) + #define XFS_BTREE_NOERROR 0 #define XFS_BTREE_ERROR 1 -- cgit v1.2.3 From 73a8fd93c421c4a6ac2c581c4d3478d3d68a0def Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 22 Feb 2024 12:35:14 -0800 Subject: xfs: consolidate the xfs_alloc_lookup_* helpers Add a single xfs_alloc_lookup helper to sort out the argument passing and setting of the active flag instead of duplicating the logic three times. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_alloc.c | 43 ++++++++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index ac31b62e7017..a40a5e43e94c 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -151,23 +151,35 @@ xfs_alloc_ag_max_usable( return mp->m_sb.sb_agblocks - blocks; } + +static int +xfs_alloc_lookup( + struct xfs_btree_cur *cur, + xfs_lookup_t dir, + xfs_agblock_t bno, + xfs_extlen_t len, + int *stat) +{ + int error; + + cur->bc_rec.a.ar_startblock = bno; + cur->bc_rec.a.ar_blockcount = len; + error = xfs_btree_lookup(cur, dir, stat); + cur->bc_ag.abt.active = (*stat == 1); + return error; +} + /* * Lookup the record equal to [bno, len] in the btree given by cur. 
*/ -STATIC int /* error */ +static inline int /* error */ xfs_alloc_lookup_eq( struct xfs_btree_cur *cur, /* btree cursor */ xfs_agblock_t bno, /* starting block of extent */ xfs_extlen_t len, /* length of extent */ int *stat) /* success/failure */ { - int error; - - cur->bc_rec.a.ar_startblock = bno; - cur->bc_rec.a.ar_blockcount = len; - error = xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat); - cur->bc_ag.abt.active = (*stat == 1); - return error; + return xfs_alloc_lookup(cur, XFS_LOOKUP_EQ, bno, len, stat); } /* @@ -181,13 +193,7 @@ xfs_alloc_lookup_ge( xfs_extlen_t len, /* length of extent */ int *stat) /* success/failure */ { - int error; - - cur->bc_rec.a.ar_startblock = bno; - cur->bc_rec.a.ar_blockcount = len; - error = xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat); - cur->bc_ag.abt.active = (*stat == 1); - return error; + return xfs_alloc_lookup(cur, XFS_LOOKUP_GE, bno, len, stat); } /* @@ -201,12 +207,7 @@ xfs_alloc_lookup_le( xfs_extlen_t len, /* length of extent */ int *stat) /* success/failure */ { - int error; - cur->bc_rec.a.ar_startblock = bno; - cur->bc_rec.a.ar_blockcount = len; - error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat); - cur->bc_ag.abt.active = (*stat == 1); - return error; + return xfs_alloc_lookup(cur, XFS_LOOKUP_LE, bno, len, stat); } static inline bool -- cgit v1.2.3 From b20775ed644af0cbaee9632ad63ae6ec5ee502cc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 22 Feb 2024 12:35:15 -0800 Subject: xfs: turn the allocbt cursor active field into a btree flag Add a new XFS_BTREE_ALLOCBT_ACTIVE flag to replace the active field. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_alloc.c | 13 ++++++++----- fs/xfs/libxfs/xfs_alloc_btree.c | 1 - fs/xfs/libxfs/xfs_btree.h | 6 +++--- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index a40a5e43e94c..91553a54dc30 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -165,7 +165,10 @@ xfs_alloc_lookup( cur->bc_rec.a.ar_startblock = bno; cur->bc_rec.a.ar_blockcount = len; error = xfs_btree_lookup(cur, dir, stat); - cur->bc_ag.abt.active = (*stat == 1); + if (*stat == 1) + cur->bc_flags |= XFS_BTREE_ALLOCBT_ACTIVE; + else + cur->bc_flags &= ~XFS_BTREE_ALLOCBT_ACTIVE; return error; } @@ -214,7 +217,7 @@ static inline bool xfs_alloc_cur_active( struct xfs_btree_cur *cur) { - return cur && cur->bc_ag.abt.active; + return cur && (cur->bc_flags & XFS_BTREE_ALLOCBT_ACTIVE); } /* @@ -992,7 +995,7 @@ xfs_alloc_cur_check( deactivate = true; out: if (deactivate) - cur->bc_ag.abt.active = false; + cur->bc_flags &= ~XFS_BTREE_ALLOCBT_ACTIVE; trace_xfs_alloc_cur_check(args->mp, cur->bc_btnum, bno, len, diff, *new); return 0; @@ -1367,7 +1370,7 @@ xfs_alloc_walk_iter( if (error) return error; if (i == 0) - cur->bc_ag.abt.active = false; + cur->bc_flags &= ~XFS_BTREE_ALLOCBT_ACTIVE; if (count > 0) count--; @@ -1481,7 +1484,7 @@ xfs_alloc_ag_vextent_locality( if (error) return error; if (i) { - acur->cnt->bc_ag.abt.active = true; + acur->cnt->bc_flags |= XFS_BTREE_ALLOCBT_ACTIVE; fbcur = acur->cnt; fbinc = false; } diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index 0ed1477187bc..91485b642471 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -523,7 +523,6 @@ xfs_allocbt_init_common( mp->m_alloc_maxlevels, xfs_allocbt_cur_cache); cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtb_2); } - cur->bc_ag.abt.active = false; 
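/*
 * A minimal sketch of the pattern the patch above introduces: the allocbt
 * cursor's "active" state is now a bit in bc_flags rather than a bool in
 * bc_ag.abt, so set/clear becomes plain bit manipulation.  The helper below
 * is hypothetical and shown only to make the idiom concrete; only bc_flags
 * and XFS_BTREE_ALLOCBT_ACTIVE come from the patch itself.
 */
static inline void
xfs_alloc_cur_set_active(
	struct xfs_btree_cur	*cur,
	bool			active)
{
	if (active)
		cur->bc_flags |= XFS_BTREE_ALLOCBT_ACTIVE;
	else
		cur->bc_flags &= ~XFS_BTREE_ALLOCBT_ACTIVE;
}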
cur->bc_ag.pag = xfs_perag_hold(pag); diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 17a0324a304e..b36530e56df9 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -236,9 +236,6 @@ struct xfs_btree_cur_ag { unsigned int nr_ops; /* # record updates */ unsigned int shape_changes; /* # of extent splits */ } refc; - struct { - bool active; /* allocation cursor state */ - } abt; }; }; @@ -321,6 +318,9 @@ xfs_btree_cur_sizeof(unsigned int nlevels) /* For extent swap, ignore owner check in verifier (only for bmbt btrees) */ #define XFS_BTREE_BMBT_INVALID_OWNER (1U << 2) +/* Cursor is active (only for allocbt btrees) */ +#define XFS_BTREE_ALLOCBT_ACTIVE (1U << 3) + #define XFS_BTREE_NOERROR 0 #define XFS_BTREE_ERROR 1 -- cgit v1.2.3 From d8d6df4253adcdb5862a9410d962e9168b973c88 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 22 Feb 2024 12:35:15 -0800 Subject: xfs: extern some btree ops structures Expose these static btree ops structures so that we can reference them in the AG initialization code in the next patch. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_alloc_btree.c | 4 ++-- fs/xfs/libxfs/xfs_bmap_btree.c | 2 +- fs/xfs/libxfs/xfs_ialloc_btree.c | 4 ++-- fs/xfs/libxfs/xfs_refcount_btree.c | 2 +- fs/xfs/libxfs/xfs_rmap_btree.c | 2 +- fs/xfs/libxfs/xfs_shared.h | 9 +++++++++ 6 files changed, 16 insertions(+), 7 deletions(-) diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index 91485b642471..a983c4fe8390 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -454,7 +454,7 @@ xfs_allocbt_keys_contiguous( be32_to_cpu(key2->alloc.ar_startblock)); } -static const struct xfs_btree_ops xfs_bnobt_ops = { +const struct xfs_btree_ops xfs_bnobt_ops = { .rec_len = sizeof(xfs_alloc_rec_t), .key_len = sizeof(xfs_alloc_key_t), @@ -477,7 +477,7 @@ static const struct xfs_btree_ops xfs_bnobt_ops = { .keys_contiguous = xfs_allocbt_keys_contiguous, }; -static const struct xfs_btree_ops xfs_cntbt_ops = { +const struct xfs_btree_ops xfs_cntbt_ops = { .geom_flags = XFS_BTGEO_LASTREC_UPDATE, .rec_len = sizeof(xfs_alloc_rec_t), diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index 48617627bc83..1f229e59ec25 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -510,7 +510,7 @@ xfs_bmbt_keys_contiguous( be64_to_cpu(key2->bmbt.br_startoff)); } -static const struct xfs_btree_ops xfs_bmbt_ops = { +const struct xfs_btree_ops xfs_bmbt_ops = { .geom_flags = XFS_BTGEO_LONG_PTRS | XFS_BTGEO_ROOT_IN_INODE, .rec_len = sizeof(xfs_bmbt_rec_t), diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index 3e65f028f3ee..69086fdc3be6 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -398,7 +398,7 @@ xfs_inobt_keys_contiguous( be32_to_cpu(key2->inobt.ir_startino)); } -static const struct xfs_btree_ops xfs_inobt_ops = { +const struct xfs_btree_ops xfs_inobt_ops = { .rec_len = sizeof(xfs_inobt_rec_t), .key_len = sizeof(xfs_inobt_key_t), @@ -420,7 +420,7 @@ static const struct xfs_btree_ops xfs_inobt_ops = { .keys_contiguous = xfs_inobt_keys_contiguous, }; -static const struct xfs_btree_ops xfs_finobt_ops = { +const struct xfs_btree_ops xfs_finobt_ops = { .rec_len = sizeof(xfs_inobt_rec_t), .key_len = sizeof(xfs_inobt_key_t), diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c index 6a3a827dd366..36e7b26d5e3b 100644 --- 
a/fs/xfs/libxfs/xfs_refcount_btree.c +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -317,7 +317,7 @@ xfs_refcountbt_keys_contiguous( be32_to_cpu(key2->refc.rc_startblock)); } -static const struct xfs_btree_ops xfs_refcountbt_ops = { +const struct xfs_btree_ops xfs_refcountbt_ops = { .rec_len = sizeof(struct xfs_refcount_rec), .key_len = sizeof(struct xfs_refcount_key), diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index 776ab6b2be31..086576626f5b 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -472,7 +472,7 @@ xfs_rmapbt_keys_contiguous( be32_to_cpu(key2->rmap.rm_startblock)); } -static const struct xfs_btree_ops xfs_rmapbt_ops = { +const struct xfs_btree_ops xfs_rmapbt_ops = { .geom_flags = XFS_BTGEO_OVERLAPPING, .rec_len = sizeof(struct xfs_rmap_rec), diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h index 4220d3584c1b..518ea9456eba 100644 --- a/fs/xfs/libxfs/xfs_shared.h +++ b/fs/xfs/libxfs/xfs_shared.h @@ -43,6 +43,15 @@ extern const struct xfs_buf_ops xfs_sb_buf_ops; extern const struct xfs_buf_ops xfs_sb_quiet_buf_ops; extern const struct xfs_buf_ops xfs_symlink_buf_ops; +/* btree ops */ +extern const struct xfs_btree_ops xfs_bnobt_ops; +extern const struct xfs_btree_ops xfs_cntbt_ops; +extern const struct xfs_btree_ops xfs_inobt_ops; +extern const struct xfs_btree_ops xfs_finobt_ops; +extern const struct xfs_btree_ops xfs_bmbt_ops; +extern const struct xfs_btree_ops xfs_refcountbt_ops; +extern const struct xfs_btree_ops xfs_rmapbt_ops; + /* log size calculation functions */ int xfs_log_calc_unit_res(struct xfs_mount *mp, int unit_bytes); int xfs_log_calc_minimum_size(struct xfs_mount *); -- cgit v1.2.3 From c87e3bf7802477cb4500dfafe0ab039313aa2dda Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 22 Feb 2024 12:35:16 -0800 Subject: xfs: initialize btree blocks using btree_ops structure Notice now that the btree ops structure encodes btree geometry flags and the magic number through the buffer ops. Refactor the btree block initialization functions to use the btree ops so that we no longer have to open code all that. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_ag.c | 33 +++++++++-------------- fs/xfs/libxfs/xfs_ag.h | 2 +- fs/xfs/libxfs/xfs_bmap.c | 8 ++---- fs/xfs/libxfs/xfs_bmap_btree.c | 20 +++++++++++--- fs/xfs/libxfs/xfs_bmap_btree.h | 3 +++ fs/xfs/libxfs/xfs_btree.c | 57 ++++++++++++++++----------------------- fs/xfs/libxfs/xfs_btree.h | 28 ++++++------------- fs/xfs/libxfs/xfs_btree_staging.c | 5 ++-- 8 files changed, 69 insertions(+), 87 deletions(-) diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index fadd00011237..b7366832cb0e 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -492,7 +492,7 @@ xfs_btroot_init( struct xfs_buf *bp, struct aghdr_init_data *id) { - xfs_btree_init_block(mp, bp, id->type, 0, 0, id->agno); + xfs_btree_init_block(mp, bp, id->bc_ops, 0, 0, id->agno); } /* Finish initializing a free space btree. 
*/ @@ -550,7 +550,7 @@ xfs_freesp_init_recs( } /* - * Alloc btree root block init functions + * bnobt/cntbt btree root block init functions */ static void xfs_bnoroot_init( @@ -558,17 +558,7 @@ xfs_bnoroot_init( struct xfs_buf *bp, struct aghdr_init_data *id) { - xfs_btree_init_block(mp, bp, XFS_BTNUM_BNO, 0, 0, id->agno); - xfs_freesp_init_recs(mp, bp, id); -} - -static void -xfs_cntroot_init( - struct xfs_mount *mp, - struct xfs_buf *bp, - struct aghdr_init_data *id) -{ - xfs_btree_init_block(mp, bp, XFS_BTNUM_CNT, 0, 0, id->agno); + xfs_btree_init_block(mp, bp, id->bc_ops, 0, 0, id->agno); xfs_freesp_init_recs(mp, bp, id); } @@ -584,7 +574,7 @@ xfs_rmaproot_init( struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); struct xfs_rmap_rec *rrec; - xfs_btree_init_block(mp, bp, XFS_BTNUM_RMAP, 0, 4, id->agno); + xfs_btree_init_block(mp, bp, id->bc_ops, 0, 4, id->agno); /* * mark the AG header regions as static metadata The BNO @@ -797,7 +787,7 @@ struct xfs_aghdr_grow_data { size_t numblks; const struct xfs_buf_ops *ops; aghdr_init_work_f work; - xfs_btnum_t type; + const struct xfs_btree_ops *bc_ops; bool need_init; }; @@ -851,13 +841,15 @@ xfs_ag_init_headers( .numblks = BTOBB(mp->m_sb.sb_blocksize), .ops = &xfs_bnobt_buf_ops, .work = &xfs_bnoroot_init, + .bc_ops = &xfs_bnobt_ops, .need_init = true }, { /* CNT root block */ .daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_CNT_BLOCK(mp)), .numblks = BTOBB(mp->m_sb.sb_blocksize), .ops = &xfs_cntbt_buf_ops, - .work = &xfs_cntroot_init, + .work = &xfs_bnoroot_init, + .bc_ops = &xfs_cntbt_ops, .need_init = true }, { /* INO root block */ @@ -865,7 +857,7 @@ xfs_ag_init_headers( .numblks = BTOBB(mp->m_sb.sb_blocksize), .ops = &xfs_inobt_buf_ops, .work = &xfs_btroot_init, - .type = XFS_BTNUM_INO, + .bc_ops = &xfs_inobt_ops, .need_init = true }, { /* FINO root block */ @@ -873,7 +865,7 @@ xfs_ag_init_headers( .numblks = BTOBB(mp->m_sb.sb_blocksize), .ops = &xfs_finobt_buf_ops, .work = &xfs_btroot_init, - .type = XFS_BTNUM_FINO, + .bc_ops = &xfs_finobt_ops, .need_init = xfs_has_finobt(mp) }, { /* RMAP root block */ @@ -881,6 +873,7 @@ xfs_ag_init_headers( .numblks = BTOBB(mp->m_sb.sb_blocksize), .ops = &xfs_rmapbt_buf_ops, .work = &xfs_rmaproot_init, + .bc_ops = &xfs_rmapbt_ops, .need_init = xfs_has_rmapbt(mp) }, { /* REFC root block */ @@ -888,7 +881,7 @@ xfs_ag_init_headers( .numblks = BTOBB(mp->m_sb.sb_blocksize), .ops = &xfs_refcountbt_buf_ops, .work = &xfs_btroot_init, - .type = XFS_BTNUM_REFC, + .bc_ops = &xfs_refcountbt_ops, .need_init = xfs_has_reflink(mp) }, { /* NULL terminating block */ @@ -906,7 +899,7 @@ xfs_ag_init_headers( id->daddr = dp->daddr; id->numblks = dp->numblks; - id->type = dp->type; + id->bc_ops = dp->bc_ops; error = xfs_ag_init_hdr(mp, id, dp->work, dp->ops); if (error) break; diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h index 4b343c4fac28..77c0fa2bb510 100644 --- a/fs/xfs/libxfs/xfs_ag.h +++ b/fs/xfs/libxfs/xfs_ag.h @@ -331,7 +331,7 @@ struct aghdr_init_data { /* per header data */ xfs_daddr_t daddr; /* header location */ size_t numblks; /* size of header */ - xfs_btnum_t type; /* type of btree root block */ + const struct xfs_btree_ops *bc_ops; /* btree ops */ }; int xfs_ag_init_headers(struct xfs_mount *mp, struct aghdr_init_data *id); diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 2b10198c8c03..995b9ea55cd4 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -644,9 +644,7 @@ xfs_bmap_extents_to_btree( * Fill in the root. 
*/ block = ifp->if_broot; - xfs_btree_init_block_int(mp, block, XFS_BUF_DADDR_NULL, - XFS_BTNUM_BMAP, 1, 1, ip->i_ino, - XFS_BTGEO_LONG_PTRS); + xfs_bmbt_init_block(ip, block, NULL, 1, 1); /* * Need a cursor. Can't allocate until bb_level is filled in. */ @@ -692,9 +690,7 @@ xfs_bmap_extents_to_btree( */ abp->b_ops = &xfs_bmbt_buf_ops; ablock = XFS_BUF_TO_BLOCK(abp); - xfs_btree_init_block_int(mp, ablock, xfs_buf_daddr(abp), - XFS_BTNUM_BMAP, 0, 0, ip->i_ino, - XFS_BTGEO_LONG_PTRS); + xfs_bmbt_init_block(ip, ablock, abp, 0, 0); for_each_xfs_iext(ifp, &icur, &rec) { if (isnullstartblock(rec.br_startblock)) diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index 1f229e59ec25..f9de93499bce 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -26,6 +26,22 @@ static struct kmem_cache *xfs_bmbt_cur_cache; +void +xfs_bmbt_init_block( + struct xfs_inode *ip, + struct xfs_btree_block *buf, + struct xfs_buf *bp, + __u16 level, + __u16 numrecs) +{ + if (bp) + xfs_btree_init_block(ip->i_mount, bp, &xfs_bmbt_ops, level, + numrecs, ip->i_ino); + else + xfs_btree_init_block_int(ip->i_mount, buf, &xfs_bmbt_ops, + XFS_BUF_DADDR_NULL, level, numrecs, ip->i_ino); +} + /* * Convert on-disk form of btree root to in-memory form. */ @@ -44,9 +60,7 @@ xfs_bmdr_to_bmbt( xfs_bmbt_key_t *tkp; __be64 *tpp; - xfs_btree_init_block_int(mp, rblock, XFS_BUF_DADDR_NULL, - XFS_BTNUM_BMAP, 0, 0, ip->i_ino, - XFS_BTGEO_LONG_PTRS); + xfs_bmbt_init_block(ip, rblock, NULL, 0, 0); rblock->bb_level = dblock->bb_level; ASSERT(be16_to_cpu(rblock->bb_level) > 0); rblock->bb_numrecs = dblock->bb_numrecs; diff --git a/fs/xfs/libxfs/xfs_bmap_btree.h b/fs/xfs/libxfs/xfs_bmap_btree.h index 151b8491f60e..e93aa42e2bf5 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.h +++ b/fs/xfs/libxfs/xfs_bmap_btree.h @@ -120,4 +120,7 @@ unsigned int xfs_bmbt_maxlevels_ondisk(void); int __init xfs_bmbt_init_cur_cache(void); void xfs_bmbt_destroy_cur_cache(void); +void xfs_bmbt_init_block(struct xfs_inode *ip, struct xfs_btree_block *buf, + struct xfs_buf *bp, __u16 level, __u16 numrecs); + #endif /* __XFS_BMAP_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 2ba597c9ac52..ed5d485b609b 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -32,24 +32,17 @@ /* * Btree magic numbers. */ -static const uint32_t xfs_magics[2][XFS_BTNUM_MAX] = { - { XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, 0, XFS_BMAP_MAGIC, XFS_IBT_MAGIC, - XFS_FIBT_MAGIC, 0 }, - { XFS_ABTB_CRC_MAGIC, XFS_ABTC_CRC_MAGIC, XFS_RMAP_CRC_MAGIC, - XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC, XFS_FIBT_CRC_MAGIC, - XFS_REFC_CRC_MAGIC } -}; - uint32_t xfs_btree_magic( - int crc, - xfs_btnum_t btnum) + struct xfs_mount *mp, + const struct xfs_btree_ops *ops) { - uint32_t magic = xfs_magics[crc][btnum]; + int idx = xfs_has_crc(mp) ? 1 : 0; + __be32 magic = ops->buf_ops->magic[idx]; /* Ensure we asked for crc for crc-only magics. 
*/ ASSERT(magic != 0); - return magic; + return be32_to_cpu(magic); } /* @@ -128,8 +121,7 @@ __xfs_btree_check_lblock( struct xfs_buf *bp) { struct xfs_mount *mp = cur->bc_mp; - xfs_btnum_t btnum = cur->bc_btnum; - int crc = xfs_has_crc(mp); + bool crc = xfs_has_crc(mp); xfs_failaddr_t fa; xfs_fsblock_t fsb = NULLFSBLOCK; @@ -143,7 +135,7 @@ __xfs_btree_check_lblock( return __this_address; } - if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(crc, btnum)) + if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(mp, cur->bc_ops)) return __this_address; if (be16_to_cpu(block->bb_level) != level) return __this_address; @@ -197,8 +189,7 @@ __xfs_btree_check_sblock( { struct xfs_mount *mp = cur->bc_mp; struct xfs_perag *pag = cur->bc_ag.pag; - xfs_btnum_t btnum = cur->bc_btnum; - int crc = xfs_has_crc(mp); + bool crc = xfs_has_crc(mp); xfs_failaddr_t fa; xfs_agblock_t agbno = NULLAGBLOCK; @@ -210,7 +201,7 @@ __xfs_btree_check_sblock( return __this_address; } - if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(crc, btnum)) + if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(mp, cur->bc_ops)) return __this_address; if (be16_to_cpu(block->bb_level) != level) return __this_address; @@ -1166,21 +1157,20 @@ void xfs_btree_init_block_int( struct xfs_mount *mp, struct xfs_btree_block *buf, + const struct xfs_btree_ops *ops, xfs_daddr_t blkno, - xfs_btnum_t btnum, __u16 level, __u16 numrecs, - __u64 owner, - unsigned int geom_flags) + __u64 owner) { bool crc = xfs_has_crc(mp); - __u32 magic = xfs_btree_magic(crc, btnum); + __u32 magic = xfs_btree_magic(mp, ops); buf->bb_magic = cpu_to_be32(magic); buf->bb_level = cpu_to_be16(level); buf->bb_numrecs = cpu_to_be16(numrecs); - if (geom_flags & XFS_BTGEO_LONG_PTRS) { + if (ops->geom_flags & XFS_BTGEO_LONG_PTRS) { buf->bb_u.l.bb_leftsib = cpu_to_be64(NULLFSBLOCK); buf->bb_u.l.bb_rightsib = cpu_to_be64(NULLFSBLOCK); if (crc) { @@ -1207,15 +1197,15 @@ xfs_btree_init_block_int( void xfs_btree_init_block( - struct xfs_mount *mp, - struct xfs_buf *bp, - xfs_btnum_t btnum, - __u16 level, - __u16 numrecs, - __u64 owner) + struct xfs_mount *mp, + struct xfs_buf *bp, + const struct xfs_btree_ops *ops, + __u16 level, + __u16 numrecs, + __u64 owner) { - xfs_btree_init_block_int(mp, XFS_BUF_TO_BLOCK(bp), xfs_buf_daddr(bp), - btnum, level, numrecs, owner, 0); + xfs_btree_init_block_int(mp, XFS_BUF_TO_BLOCK(bp), ops, + xfs_buf_daddr(bp), level, numrecs, owner); } void @@ -1238,9 +1228,8 @@ xfs_btree_init_block_cur( else owner = cur->bc_ag.pag->pag_agno; - xfs_btree_init_block_int(cur->bc_mp, XFS_BUF_TO_BLOCK(bp), - xfs_buf_daddr(bp), cur->bc_btnum, level, - numrecs, owner, cur->bc_ops->geom_flags); + xfs_btree_init_block_int(cur->bc_mp, XFS_BUF_TO_BLOCK(bp), cur->bc_ops, + xfs_buf_daddr(bp), level, numrecs, owner); } /* diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index b36530e56df9..923f884fe526 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -63,7 +63,8 @@ union xfs_btree_rec { #define XFS_BTNUM_RMAP ((xfs_btnum_t)XFS_BTNUM_RMAPi) #define XFS_BTNUM_REFC ((xfs_btnum_t)XFS_BTNUM_REFCi) -uint32_t xfs_btree_magic(int crc, xfs_btnum_t btnum); +struct xfs_btree_ops; +uint32_t xfs_btree_magic(struct xfs_mount *mp, const struct xfs_btree_ops *ops); /* * For logging record fields. 
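To make the new magic lookup concrete, here is a hedged caller-side sketch built only from pieces shown above; the surrounding verifier context (block, mp, cur) is illustrative, not part of this hunk:

	/* Validate an on-disk block's magic against its btree ops. */
	if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(mp, cur->bc_ops))
		return __this_address;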
@@ -434,25 +435,12 @@ xfs_btree_reada_bufs( /* * Initialise a new btree block header */ -void -xfs_btree_init_block( - struct xfs_mount *mp, - struct xfs_buf *bp, - xfs_btnum_t btnum, - __u16 level, - __u16 numrecs, - __u64 owner); - -void -xfs_btree_init_block_int( - struct xfs_mount *mp, - struct xfs_btree_block *buf, - xfs_daddr_t blkno, - xfs_btnum_t btnum, - __u16 level, - __u16 numrecs, - __u64 owner, - unsigned int geom_flags); +void xfs_btree_init_block(struct xfs_mount *mp, struct xfs_buf *bp, + const struct xfs_btree_ops *ops, __u16 level, __u16 numrecs, + __u64 owner); +void xfs_btree_init_block_int(struct xfs_mount *mp, + struct xfs_btree_block *buf, const struct xfs_btree_ops *ops, + xfs_daddr_t blkno, __u16 level, __u16 numrecs, __u64 owner); /* * Common btree core entry points. diff --git a/fs/xfs/libxfs/xfs_btree_staging.c b/fs/xfs/libxfs/xfs_btree_staging.c index 5785e87f7880..1ebed1c3a4e4 100644 --- a/fs/xfs/libxfs/xfs_btree_staging.c +++ b/fs/xfs/libxfs/xfs_btree_staging.c @@ -411,9 +411,8 @@ xfs_btree_bload_prep_block( /* Initialize it and send it out. */ xfs_btree_init_block_int(cur->bc_mp, ifp->if_broot, - XFS_BUF_DADDR_NULL, cur->bc_btnum, level, - nr_this_block, cur->bc_ino.ip->i_ino, - cur->bc_ops->geom_flags); + cur->bc_ops, XFS_BUF_DADDR_NULL, level, + nr_this_block, cur->bc_ino.ip->i_ino); *bpp = NULL; *blockp = ifp->if_broot; -- cgit v1.2.3 From 3c68858b264fac292f74733eeaf558595978a5e5 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 22 Feb 2024 12:35:17 -0800 Subject: xfs: rename btree block/buffer init functions Rename xfs_btree_init_block_int to xfs_btree_init_block, and xfs_btree_init_block to xfs_btree_init_buf so that the name suggests the type that callers are supposed to pass in. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_ag.c | 6 +++--- fs/xfs/libxfs/xfs_bmap_btree.c | 4 ++-- fs/xfs/libxfs/xfs_btree.c | 8 ++++---- fs/xfs/libxfs/xfs_btree.h | 4 ++-- fs/xfs/libxfs/xfs_btree_staging.c | 2 +- 5 files changed, 12 insertions(+), 12 deletions(-) diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index b7366832cb0e..446733893764 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -492,7 +492,7 @@ xfs_btroot_init( struct xfs_buf *bp, struct aghdr_init_data *id) { - xfs_btree_init_block(mp, bp, id->bc_ops, 0, 0, id->agno); + xfs_btree_init_buf(mp, bp, id->bc_ops, 0, 0, id->agno); } /* Finish initializing a free space btree. 
*/ @@ -558,7 +558,7 @@ xfs_bnoroot_init( struct xfs_buf *bp, struct aghdr_init_data *id) { - xfs_btree_init_block(mp, bp, id->bc_ops, 0, 0, id->agno); + xfs_btree_init_buf(mp, bp, id->bc_ops, 0, 0, id->agno); xfs_freesp_init_recs(mp, bp, id); } @@ -574,7 +574,7 @@ xfs_rmaproot_init( struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); struct xfs_rmap_rec *rrec; - xfs_btree_init_block(mp, bp, id->bc_ops, 0, 4, id->agno); + xfs_btree_init_buf(mp, bp, id->bc_ops, 0, 4, id->agno); /* * mark the AG header regions as static metadata The BNO diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index f9de93499bce..ddaf40e0f309 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -35,10 +35,10 @@ xfs_bmbt_init_block( __u16 numrecs) { if (bp) - xfs_btree_init_block(ip->i_mount, bp, &xfs_bmbt_ops, level, + xfs_btree_init_buf(ip->i_mount, bp, &xfs_bmbt_ops, level, numrecs, ip->i_ino); else - xfs_btree_init_block_int(ip->i_mount, buf, &xfs_bmbt_ops, + xfs_btree_init_block(ip->i_mount, buf, &xfs_bmbt_ops, XFS_BUF_DADDR_NULL, level, numrecs, ip->i_ino); } diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index ed5d485b609b..29abac7b3e03 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -1154,7 +1154,7 @@ xfs_btree_set_sibling( } void -xfs_btree_init_block_int( +xfs_btree_init_block( struct xfs_mount *mp, struct xfs_btree_block *buf, const struct xfs_btree_ops *ops, @@ -1196,7 +1196,7 @@ xfs_btree_init_block_int( } void -xfs_btree_init_block( +xfs_btree_init_buf( struct xfs_mount *mp, struct xfs_buf *bp, const struct xfs_btree_ops *ops, @@ -1204,7 +1204,7 @@ xfs_btree_init_block( __u16 numrecs, __u64 owner) { - xfs_btree_init_block_int(mp, XFS_BUF_TO_BLOCK(bp), ops, + xfs_btree_init_block(mp, XFS_BUF_TO_BLOCK(bp), ops, xfs_buf_daddr(bp), level, numrecs, owner); } @@ -1228,7 +1228,7 @@ xfs_btree_init_block_cur( else owner = cur->bc_ag.pag->pag_agno; - xfs_btree_init_block_int(cur->bc_mp, XFS_BUF_TO_BLOCK(bp), cur->bc_ops, + xfs_btree_init_block(cur->bc_mp, XFS_BUF_TO_BLOCK(bp), cur->bc_ops, xfs_buf_daddr(bp), level, numrecs, owner); } diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 923f884fe526..56901d2591ed 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -435,10 +435,10 @@ xfs_btree_reada_bufs( /* * Initialise a new btree block header */ -void xfs_btree_init_block(struct xfs_mount *mp, struct xfs_buf *bp, +void xfs_btree_init_buf(struct xfs_mount *mp, struct xfs_buf *bp, const struct xfs_btree_ops *ops, __u16 level, __u16 numrecs, __u64 owner); -void xfs_btree_init_block_int(struct xfs_mount *mp, +void xfs_btree_init_block(struct xfs_mount *mp, struct xfs_btree_block *buf, const struct xfs_btree_ops *ops, xfs_daddr_t blkno, __u16 level, __u16 numrecs, __u64 owner); diff --git a/fs/xfs/libxfs/xfs_btree_staging.c b/fs/xfs/libxfs/xfs_btree_staging.c index 1ebed1c3a4e4..b590f4580a6d 100644 --- a/fs/xfs/libxfs/xfs_btree_staging.c +++ b/fs/xfs/libxfs/xfs_btree_staging.c @@ -410,7 +410,7 @@ xfs_btree_bload_prep_block( ifp->if_broot_bytes = (int)new_size; /* Initialize it and send it out. */ - xfs_btree_init_block_int(cur->bc_mp, ifp->if_broot, + xfs_btree_init_block(cur->bc_mp, ifp->if_broot, cur->bc_ops, XFS_BUF_DADDR_NULL, level, nr_this_block, cur->bc_ino.ip->i_ino); -- cgit v1.2.3 From 7771f7030007e3faa6906864d01b504b590e1ca2 Mon Sep 17 00:00:00 2001 From: "Darrick J. 
Wong" Date: Thu, 22 Feb 2024 12:35:18 -0800 Subject: xfs: btree convert xfs_btree_init_block to xfs_btree_init_buf calls Convert any place we call xfs_btree_init_block with a buffer to use the _init_buf function. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_btree.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 29abac7b3e03..f1db5118dbaa 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -1228,8 +1228,7 @@ xfs_btree_init_block_cur( else owner = cur->bc_ag.pag->pag_agno; - xfs_btree_init_block(cur->bc_mp, XFS_BUF_TO_BLOCK(bp), cur->bc_ops, - xfs_buf_daddr(bp), level, numrecs, owner); + xfs_btree_init_buf(cur->bc_mp, bp, cur->bc_ops, level, numrecs, owner); } /* -- cgit v1.2.3 From 11388f6581f40e7d5a69ce5f8b13264eca7c2c5c Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 22 Feb 2024 12:35:19 -0800 Subject: xfs: remove the unnecessary daddr paramter to _init_block Now that all of the callers pass XFS_BUF_DADDR_NULL as the daddr parameter, we can elide that too. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_bmap_btree.c | 4 ++-- fs/xfs/libxfs/xfs_btree.c | 19 ++++++++++++++++--- fs/xfs/libxfs/xfs_btree.h | 2 +- fs/xfs/libxfs/xfs_btree_staging.c | 5 ++--- 4 files changed, 21 insertions(+), 9 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index ddaf40e0f309..fa116c2d13ae 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -38,8 +38,8 @@ xfs_bmbt_init_block( xfs_btree_init_buf(ip->i_mount, bp, &xfs_bmbt_ops, level, numrecs, ip->i_ino); else - xfs_btree_init_block(ip->i_mount, buf, &xfs_bmbt_ops, - XFS_BUF_DADDR_NULL, level, numrecs, ip->i_ino); + xfs_btree_init_block(ip->i_mount, buf, &xfs_bmbt_ops, level, + numrecs, ip->i_ino); } /* diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index f1db5118dbaa..b5b9b021c67a 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -1153,8 +1153,8 @@ xfs_btree_set_sibling( } } -void -xfs_btree_init_block( +static void +__xfs_btree_init_block( struct xfs_mount *mp, struct xfs_btree_block *buf, const struct xfs_btree_ops *ops, @@ -1195,6 +1195,19 @@ xfs_btree_init_block( } } +void +xfs_btree_init_block( + struct xfs_mount *mp, + struct xfs_btree_block *block, + const struct xfs_btree_ops *ops, + __u16 level, + __u16 numrecs, + __u64 owner) +{ + __xfs_btree_init_block(mp, block, ops, XFS_BUF_DADDR_NULL, level, + numrecs, owner); +} + void xfs_btree_init_buf( struct xfs_mount *mp, @@ -1204,7 +1217,7 @@ xfs_btree_init_buf( __u16 numrecs, __u64 owner) { - xfs_btree_init_block(mp, XFS_BUF_TO_BLOCK(bp), ops, + __xfs_btree_init_block(mp, XFS_BUF_TO_BLOCK(bp), ops, xfs_buf_daddr(bp), level, numrecs, owner); } diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 56901d2591ed..80be40ca8954 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -440,7 +440,7 @@ void xfs_btree_init_buf(struct xfs_mount *mp, struct xfs_buf *bp, __u64 owner); void xfs_btree_init_block(struct xfs_mount *mp, struct xfs_btree_block *buf, const struct xfs_btree_ops *ops, - xfs_daddr_t blkno, __u16 level, __u16 numrecs, __u64 owner); + __u16 level, __u16 numrecs, __u64 owner); /* * Common btree core entry points. 
diff --git a/fs/xfs/libxfs/xfs_btree_staging.c b/fs/xfs/libxfs/xfs_btree_staging.c index b590f4580a6d..e31e4bd21718 100644 --- a/fs/xfs/libxfs/xfs_btree_staging.c +++ b/fs/xfs/libxfs/xfs_btree_staging.c @@ -410,9 +410,8 @@ xfs_btree_bload_prep_block( ifp->if_broot_bytes = (int)new_size; /* Initialize it and send it out. */ - xfs_btree_init_block(cur->bc_mp, ifp->if_broot, - cur->bc_ops, XFS_BUF_DADDR_NULL, level, - nr_this_block, cur->bc_ino.ip->i_ino); + xfs_btree_init_block(cur->bc_mp, ifp->if_broot, cur->bc_ops, + level, nr_this_block, cur->bc_ino.ip->i_ino); *bpp = NULL; *blockp = ifp->if_broot; -- cgit v1.2.3 From ad065ef0d2fcd787225bd8887b6b75c6eb4da9a1 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 22 Feb 2024 12:35:19 -0800 Subject: xfs: set btree block buffer ops in _init_buf Set the btree block buffer ops in xfs_btree_init_buf since we already have access to that information through the btree ops. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_bmap.c | 1 - fs/xfs/libxfs/xfs_btree.c | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 995b9ea55cd4..c196fd0a3d7e 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -688,7 +688,6 @@ xfs_bmap_extents_to_btree( /* * Fill in the child block. */ - abp->b_ops = &xfs_bmbt_buf_ops; ablock = XFS_BUF_TO_BLOCK(abp); xfs_bmbt_init_block(ip, ablock, abp, 0, 0); diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index b5b9b021c67a..e945620b2203 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -1219,6 +1219,7 @@ xfs_btree_init_buf( { __xfs_btree_init_block(mp, XFS_BUF_TO_BLOCK(bp), ops, xfs_buf_daddr(bp), level, numrecs, owner); + bp->b_ops = ops->buf_ops; } void -- cgit v1.2.3 From 90cfae818dac5227e94e21d0f5250e098432723e Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 22 Feb 2024 12:35:20 -0800 Subject: xfs: move lru refs to the btree ops structure Move the btree buffer LRU refcount to the btree ops structure so that we can eliminate the last bc_btnum switch in the generic btree code. We're about to create repair-specific btree types, and we don't want that stuff cluttering up libxfs. Signed-off-by: Darrick J. 
Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_alloc_btree.c | 4 ++++ fs/xfs/libxfs/xfs_bmap_btree.c | 2 ++ fs/xfs/libxfs/xfs_btree.c | 24 ++---------------------- fs/xfs/libxfs/xfs_btree.h | 3 +++ fs/xfs/libxfs/xfs_ialloc_btree.c | 4 ++++ fs/xfs/libxfs/xfs_refcount_btree.c | 2 ++ fs/xfs/libxfs/xfs_rmap_btree.c | 2 ++ 7 files changed, 19 insertions(+), 22 deletions(-) diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index a983c4fe8390..045c7954ef1b 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -458,6 +458,8 @@ const struct xfs_btree_ops xfs_bnobt_ops = { .rec_len = sizeof(xfs_alloc_rec_t), .key_len = sizeof(xfs_alloc_key_t), + .lru_refs = XFS_ALLOC_BTREE_REF, + .dup_cursor = xfs_allocbt_dup_cursor, .set_root = xfs_allocbt_set_root, .alloc_block = xfs_allocbt_alloc_block, @@ -483,6 +485,8 @@ const struct xfs_btree_ops xfs_cntbt_ops = { .rec_len = sizeof(xfs_alloc_rec_t), .key_len = sizeof(xfs_alloc_key_t), + .lru_refs = XFS_ALLOC_BTREE_REF, + .dup_cursor = xfs_allocbt_dup_cursor, .set_root = xfs_allocbt_set_root, .alloc_block = xfs_allocbt_alloc_block, diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index fa116c2d13ae..43dc9fa6b26f 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -530,6 +530,8 @@ const struct xfs_btree_ops xfs_bmbt_ops = { .rec_len = sizeof(xfs_bmbt_rec_t), .key_len = sizeof(xfs_bmbt_key_t), + .lru_refs = XFS_BMAP_BTREE_REF, + .dup_cursor = xfs_bmbt_dup_cursor, .update_cursor = xfs_bmbt_update_cursor, .alloc_block = xfs_bmbt_alloc_block, diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index e945620b2203..e1e96f4f4052 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -1284,32 +1284,12 @@ xfs_btree_buf_to_ptr( } } -STATIC void +static inline void xfs_btree_set_refs( struct xfs_btree_cur *cur, struct xfs_buf *bp) { - switch (cur->bc_btnum) { - case XFS_BTNUM_BNO: - case XFS_BTNUM_CNT: - xfs_buf_set_ref(bp, XFS_ALLOC_BTREE_REF); - break; - case XFS_BTNUM_INO: - case XFS_BTNUM_FINO: - xfs_buf_set_ref(bp, XFS_INO_BTREE_REF); - break; - case XFS_BTNUM_BMAP: - xfs_buf_set_ref(bp, XFS_BMAP_BTREE_REF); - break; - case XFS_BTNUM_RMAP: - xfs_buf_set_ref(bp, XFS_RMAP_BTREE_REF); - break; - case XFS_BTNUM_REFC: - xfs_buf_set_ref(bp, XFS_REFC_BTREE_REF); - break; - default: - ASSERT(0); - } + xfs_buf_set_ref(bp, cur->bc_ops->lru_refs); } int diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 80be40ca8954..39df108a32ef 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -120,6 +120,9 @@ struct xfs_btree_ops { size_t key_len; size_t rec_len; + /* LRU refcount to set on each btree buffer created */ + unsigned int lru_refs; + /* cursor operations */ struct xfs_btree_cur *(*dup_cursor)(struct xfs_btree_cur *); void (*update_cursor)(struct xfs_btree_cur *src, diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index 69086fdc3be6..a2a8c15a6c9e 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -402,6 +402,8 @@ const struct xfs_btree_ops xfs_inobt_ops = { .rec_len = sizeof(xfs_inobt_rec_t), .key_len = sizeof(xfs_inobt_key_t), + .lru_refs = XFS_INO_BTREE_REF, + .dup_cursor = xfs_inobt_dup_cursor, .set_root = xfs_inobt_set_root, .alloc_block = xfs_inobt_alloc_block, @@ -424,6 +426,8 @@ const struct xfs_btree_ops xfs_finobt_ops = { .rec_len = sizeof(xfs_inobt_rec_t), .key_len = sizeof(xfs_inobt_key_t), + 
.lru_refs = XFS_INO_BTREE_REF, + .dup_cursor = xfs_inobt_dup_cursor, .set_root = xfs_finobt_set_root, .alloc_block = xfs_finobt_alloc_block, diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c index 36e7b26d5e3b..eaed9517e795 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.c +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -321,6 +321,8 @@ const struct xfs_btree_ops xfs_refcountbt_ops = { .rec_len = sizeof(struct xfs_refcount_rec), .key_len = sizeof(struct xfs_refcount_key), + .lru_refs = XFS_REFC_BTREE_REF, + .dup_cursor = xfs_refcountbt_dup_cursor, .set_root = xfs_refcountbt_set_root, .alloc_block = xfs_refcountbt_alloc_block, diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index 086576626f5b..8abe90c25f71 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -478,6 +478,8 @@ const struct xfs_btree_ops xfs_rmapbt_ops = { .rec_len = sizeof(struct xfs_rmap_rec), .key_len = 2 * sizeof(struct xfs_rmap_key), + .lru_refs = XFS_RMAP_BTREE_REF, + .dup_cursor = xfs_rmapbt_dup_cursor, .set_root = xfs_rmapbt_set_root, .alloc_block = xfs_rmapbt_alloc_block, -- cgit v1.2.3 From 07b7f2e3172b97da2a7ac273ecbaf173cc09a9f4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 22 Feb 2024 12:35:21 -0800 Subject: xfs: move the btree stats offset into struct btree_ops The statistics offset is completely static, move it into the btree_ops structure instead of the cursor. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_alloc_btree.c | 17 +++++++---------- fs/xfs/libxfs/xfs_bmap_btree.c | 2 +- fs/xfs/libxfs/xfs_btree.h | 10 +++++++--- fs/xfs/libxfs/xfs_ialloc_btree.c | 20 +++++++++----------- fs/xfs/libxfs/xfs_refcount_btree.c | 3 +-- fs/xfs/libxfs/xfs_rmap_btree.c | 3 +-- 6 files changed, 26 insertions(+), 29 deletions(-) diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index 045c7954ef1b..85ac0ba3c1f1 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -459,6 +459,7 @@ const struct xfs_btree_ops xfs_bnobt_ops = { .key_len = sizeof(xfs_alloc_key_t), .lru_refs = XFS_ALLOC_BTREE_REF, + .statoff = XFS_STATS_CALC_INDEX(xs_abtb_2), .dup_cursor = xfs_allocbt_dup_cursor, .set_root = xfs_allocbt_set_root, @@ -486,6 +487,7 @@ const struct xfs_btree_ops xfs_cntbt_ops = { .key_len = sizeof(xfs_alloc_key_t), .lru_refs = XFS_ALLOC_BTREE_REF, + .statoff = XFS_STATS_CALC_INDEX(xs_abtc_2), .dup_cursor = xfs_allocbt_dup_cursor, .set_root = xfs_allocbt_set_root, @@ -514,22 +516,17 @@ xfs_allocbt_init_common( struct xfs_perag *pag, xfs_btnum_t btnum) { + const struct xfs_btree_ops *ops = &xfs_bnobt_ops; struct xfs_btree_cur *cur; ASSERT(btnum == XFS_BTNUM_BNO || btnum == XFS_BTNUM_CNT); - if (btnum == XFS_BTNUM_CNT) { - cur = xfs_btree_alloc_cursor(mp, tp, btnum, &xfs_cntbt_ops, - mp->m_alloc_maxlevels, xfs_allocbt_cur_cache); - cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtc_2); - } else { - cur = xfs_btree_alloc_cursor(mp, tp, btnum, &xfs_bnobt_ops, - mp->m_alloc_maxlevels, xfs_allocbt_cur_cache); - cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtb_2); - } + if (btnum == XFS_BTNUM_CNT) + ops = &xfs_cntbt_ops; + cur = xfs_btree_alloc_cursor(mp, tp, btnum, ops, mp->m_alloc_maxlevels, + xfs_allocbt_cur_cache); cur->bc_ag.pag = xfs_perag_hold(pag); - return cur; } diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index 43dc9fa6b26f..2d7a8a852596 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c 
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -531,6 +531,7 @@ const struct xfs_btree_ops xfs_bmbt_ops = { .key_len = sizeof(xfs_bmbt_key_t), .lru_refs = XFS_BMAP_BTREE_REF, + .statoff = XFS_STATS_CALC_INDEX(xs_bmbt_2), .dup_cursor = xfs_bmbt_dup_cursor, .update_cursor = xfs_bmbt_update_cursor, @@ -564,7 +565,6 @@ xfs_bmbt_init_common( cur = xfs_btree_alloc_cursor(mp, tp, XFS_BTNUM_BMAP, &xfs_bmbt_ops, mp->m_bm_maxlevels[whichfork], xfs_bmbt_cur_cache); - cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_bmbt_2); cur->bc_ino.ip = ip; cur->bc_ino.allocated = 0; diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 39df108a32ef..2a1f30a849f5 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -87,9 +87,11 @@ uint32_t xfs_btree_magic(struct xfs_mount *mp, const struct xfs_btree_ops *ops); * Generic stats interface */ #define XFS_BTREE_STATS_INC(cur, stat) \ - XFS_STATS_INC_OFF((cur)->bc_mp, (cur)->bc_statoff + __XBTS_ ## stat) + XFS_STATS_INC_OFF((cur)->bc_mp, \ + (cur)->bc_ops->statoff + __XBTS_ ## stat) #define XFS_BTREE_STATS_ADD(cur, stat, val) \ - XFS_STATS_ADD_OFF((cur)->bc_mp, (cur)->bc_statoff + __XBTS_ ## stat, val) + XFS_STATS_ADD_OFF((cur)->bc_mp, \ + (cur)->bc_ops->statoff + __XBTS_ ## stat, val) enum xbtree_key_contig { XBTREE_KEY_GAP = 0, @@ -123,6 +125,9 @@ struct xfs_btree_ops { /* LRU refcount to set on each btree buffer created */ unsigned int lru_refs; + /* offset of btree stats array */ + unsigned int statoff; + /* cursor operations */ struct xfs_btree_cur *(*dup_cursor)(struct xfs_btree_cur *); void (*update_cursor)(struct xfs_btree_cur *src, @@ -280,7 +285,6 @@ struct xfs_btree_cur union xfs_btree_irec bc_rec; /* current insert/search record value */ uint8_t bc_nlevels; /* number of levels in the tree */ uint8_t bc_maxlevels; /* maximum levels for this btree type */ - int bc_statoff; /* offset of btree stats array */ /* * Short btree pointers need an agno to be able to turn the pointers diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index a2a8c15a6c9e..8ac789650106 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -403,6 +403,7 @@ const struct xfs_btree_ops xfs_inobt_ops = { .key_len = sizeof(xfs_inobt_key_t), .lru_refs = XFS_INO_BTREE_REF, + .statoff = XFS_STATS_CALC_INDEX(xs_ibt_2), .dup_cursor = xfs_inobt_dup_cursor, .set_root = xfs_inobt_set_root, @@ -427,6 +428,7 @@ const struct xfs_btree_ops xfs_finobt_ops = { .key_len = sizeof(xfs_inobt_key_t), .lru_refs = XFS_INO_BTREE_REF, + .statoff = XFS_STATS_CALC_INDEX(xs_fibt_2), .dup_cursor = xfs_inobt_dup_cursor, .set_root = xfs_finobt_set_root, @@ -456,20 +458,16 @@ xfs_inobt_init_common( xfs_btnum_t btnum) /* ialloc or free ino btree */ { struct xfs_mount *mp = pag->pag_mount; + const struct xfs_btree_ops *ops = &xfs_inobt_ops; struct xfs_btree_cur *cur; - if (btnum == XFS_BTNUM_INO) { - cur = xfs_btree_alloc_cursor(mp, tp, btnum, &xfs_inobt_ops, - M_IGEO(mp)->inobt_maxlevels, - xfs_inobt_cur_cache); - cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_ibt_2); - } else { - cur = xfs_btree_alloc_cursor(mp, tp, btnum, &xfs_finobt_ops, - M_IGEO(mp)->inobt_maxlevels, - xfs_inobt_cur_cache); - cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_fibt_2); - } + ASSERT(btnum == XFS_BTNUM_INO || btnum == XFS_BTNUM_FINO); + + if (btnum == XFS_BTNUM_FINO) + ops = &xfs_finobt_ops; + cur = xfs_btree_alloc_cursor(mp, tp, btnum, ops, + M_IGEO(mp)->inobt_maxlevels, xfs_inobt_cur_cache); cur->bc_ag.pag = xfs_perag_hold(pag); return cur; } diff --git 
a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c index eaed9517e795..b12b1ccd1f27 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.c +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -322,6 +322,7 @@ const struct xfs_btree_ops xfs_refcountbt_ops = { .key_len = sizeof(struct xfs_refcount_key), .lru_refs = XFS_REFC_BTREE_REF, + .statoff = XFS_STATS_CALC_INDEX(xs_refcbt_2), .dup_cursor = xfs_refcountbt_dup_cursor, .set_root = xfs_refcountbt_set_root, @@ -357,8 +358,6 @@ xfs_refcountbt_init_common( cur = xfs_btree_alloc_cursor(mp, tp, XFS_BTNUM_REFC, &xfs_refcountbt_ops, mp->m_refc_maxlevels, xfs_refcountbt_cur_cache); - cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_refcbt_2); - cur->bc_ag.pag = xfs_perag_hold(pag); cur->bc_ag.refc.nr_ops = 0; cur->bc_ag.refc.shape_changes = 0; diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index 8abe90c25f71..e0ae6da94fc3 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -479,6 +479,7 @@ const struct xfs_btree_ops xfs_rmapbt_ops = { .key_len = 2 * sizeof(struct xfs_rmap_key), .lru_refs = XFS_RMAP_BTREE_REF, + .statoff = XFS_STATS_CALC_INDEX(xs_rmap_2), .dup_cursor = xfs_rmapbt_dup_cursor, .set_root = xfs_rmapbt_set_root, @@ -509,8 +510,6 @@ xfs_rmapbt_init_common( /* Overlapping btree; 2 keys per pointer. */ cur = xfs_btree_alloc_cursor(mp, tp, XFS_BTNUM_RMAP, &xfs_rmapbt_ops, mp->m_rmap_maxlevels, xfs_rmapbt_cur_cache); - cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_rmap_2); - cur->bc_ag.pag = xfs_perag_hold(pag); return cur; } -- cgit v1.2.3 From 2054cf051698d30cc9479678c2b807a364248f38 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 22 Feb 2024 12:35:22 -0800 Subject: xfs: factor out a xfs_btree_owner helper Split out a helper to calculate the owner for a given btree instead of duplicating the logic in two places. While we're at it, make the bc_ag/bc_ino switch logic depend on the correct geometry flag. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong [djwong: break this up into two patches for the owner check] Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_btree.c | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index e1e96f4f4052..b869d6eb6a0b 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -1222,6 +1222,15 @@ xfs_btree_init_buf( bp->b_ops = ops->buf_ops; } +static inline __u64 +xfs_btree_owner( + struct xfs_btree_cur *cur) +{ + if (cur->bc_ops->geom_flags & XFS_BTGEO_ROOT_IN_INODE) + return cur->bc_ino.ip->i_ino; + return cur->bc_ag.pag->pag_agno; +} + void xfs_btree_init_block_cur( struct xfs_btree_cur *cur, @@ -1229,20 +1238,8 @@ xfs_btree_init_block_cur( int level, int numrecs) { - __u64 owner; - - /* - * we can pull the owner from the cursor right now as the different - * owners align directly with the pointer size of the btree. This may - * change in future, but is safe for current users of the generic btree - * code. - */ - if (cur->bc_ops->geom_flags & XFS_BTGEO_LONG_PTRS) - owner = cur->bc_ino.ip->i_ino; - else - owner = cur->bc_ag.pag->pag_agno; - - xfs_btree_init_buf(cur->bc_mp, bp, cur->bc_ops, level, numrecs, owner); + xfs_btree_init_buf(cur->bc_mp, bp, cur->bc_ops, level, numrecs, + xfs_btree_owner(cur)); } /* -- cgit v1.2.3 From 186f20c003199824eb3eb3b78e4eb7c2535a8ffc Mon Sep 17 00:00:00 2001 From: "Darrick J. 
Wong" Date: Thu, 22 Feb 2024 12:35:23 -0800 Subject: xfs: factor out a btree block owner check Hoist the btree block owner check into a separate helper so that we don't have an ugly multiline if statement. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_btree.c | 33 ++++++++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 5 deletions(-) diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index b869d6eb6a0b..6d6683d8663f 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -1777,6 +1777,33 @@ error0: return error; } +/* + * Check the btree block owner now that we have the context to know who the + * real owner is. + */ +static inline xfs_failaddr_t +xfs_btree_check_block_owner( + struct xfs_btree_cur *cur, + struct xfs_btree_block *block) +{ + __u64 owner; + + if (!xfs_has_crc(cur->bc_mp) || + (cur->bc_flags & XFS_BTREE_BMBT_INVALID_OWNER)) + return NULL; + + owner = xfs_btree_owner(cur); + if (cur->bc_ops->geom_flags & XFS_BTGEO_LONG_PTRS) { + if (be64_to_cpu(block->bb_u.l.bb_owner) != owner) + return __this_address; + } else { + if (be32_to_cpu(block->bb_u.s.bb_owner) != owner) + return __this_address; + } + + return NULL; +} + int xfs_btree_lookup_get_block( struct xfs_btree_cur *cur, /* btree cursor */ @@ -1815,11 +1842,7 @@ xfs_btree_lookup_get_block( return error; /* Check the inode owner since the verifiers don't. */ - if (xfs_has_crc(cur->bc_mp) && - !(cur->bc_flags & XFS_BTREE_BMBT_INVALID_OWNER) && - (cur->bc_ops->geom_flags & XFS_BTGEO_LONG_PTRS) && - be64_to_cpu((*blkp)->bb_u.l.bb_owner) != - cur->bc_ino.ip->i_ino) + if (xfs_btree_check_block_owner(cur, *blkp) != NULL) goto out_bad; /* Did we get the level we were looking for? */ -- cgit v1.2.3 From 1a9d26291c68fbb8f8d24f9f694b32223a072745 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 22 Feb 2024 12:35:36 -0800 Subject: xfs: store the btree pointer length in struct xfs_btree_ops Make the pointer length an explicit field in the btree operations structure so that the next patch (which introduces an explicit btree type enum) doesn't have to play a bunch of awkward games with inferring the pointer length from the enumeration. Signed-off-by: Darrick J. 
Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_alloc_btree.c | 2 ++ fs/xfs/libxfs/xfs_bmap_btree.c | 3 +- fs/xfs/libxfs/xfs_btree.c | 57 ++++++++++++++++---------------------- fs/xfs/libxfs/xfs_btree.h | 26 ++++++++++------- fs/xfs/libxfs/xfs_ialloc_btree.c | 2 ++ fs/xfs/libxfs/xfs_refcount_btree.c | 1 + fs/xfs/libxfs/xfs_rmap_btree.c | 1 + fs/xfs/scrub/btree.c | 4 +-- fs/xfs/scrub/newbt.c | 2 +- fs/xfs/xfs_trace.h | 4 +-- 10 files changed, 53 insertions(+), 49 deletions(-) diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index 85ac0ba3c1f1..9b81b4407444 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -457,6 +457,7 @@ xfs_allocbt_keys_contiguous( const struct xfs_btree_ops xfs_bnobt_ops = { .rec_len = sizeof(xfs_alloc_rec_t), .key_len = sizeof(xfs_alloc_key_t), + .ptr_len = XFS_BTREE_SHORT_PTR_LEN, .lru_refs = XFS_ALLOC_BTREE_REF, .statoff = XFS_STATS_CALC_INDEX(xs_abtb_2), @@ -485,6 +486,7 @@ const struct xfs_btree_ops xfs_cntbt_ops = { .rec_len = sizeof(xfs_alloc_rec_t), .key_len = sizeof(xfs_alloc_key_t), + .ptr_len = XFS_BTREE_SHORT_PTR_LEN, .lru_refs = XFS_ALLOC_BTREE_REF, .statoff = XFS_STATS_CALC_INDEX(xs_abtc_2), diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index 2d7a8a852596..59a5f75a45e1 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -525,10 +525,11 @@ xfs_bmbt_keys_contiguous( } const struct xfs_btree_ops xfs_bmbt_ops = { - .geom_flags = XFS_BTGEO_LONG_PTRS | XFS_BTGEO_ROOT_IN_INODE, + .geom_flags = XFS_BTGEO_ROOT_IN_INODE, .rec_len = sizeof(xfs_bmbt_rec_t), .key_len = sizeof(xfs_bmbt_key_t), + .ptr_len = XFS_BTREE_LONG_PTR_LEN, .lru_refs = XFS_BMAP_BTREE_REF, .statoff = XFS_STATS_CALC_INDEX(xs_bmbt_2), diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 6d6683d8663f..7907711322d4 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -252,7 +252,7 @@ xfs_btree_check_block( int level, /* level of the btree block */ struct xfs_buf *bp) /* buffer containing block, if any */ { - if (cur->bc_ops->geom_flags & XFS_BTGEO_LONG_PTRS) + if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) return xfs_btree_check_lblock(cur, block, level, bp); else return xfs_btree_check_sblock(cur, block, level, bp); @@ -293,7 +293,7 @@ xfs_btree_check_ptr( int index, int level) { - if (cur->bc_ops->geom_flags & XFS_BTGEO_LONG_PTRS) { + if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) { if (xfs_btree_check_lptr(cur, be64_to_cpu((&ptr->l)[index]), level)) return 0; @@ -449,7 +449,7 @@ xfs_btree_del_cursor( xfs_is_shutdown(cur->bc_mp) || error != 0); if (unlikely(cur->bc_flags & XFS_BTREE_STAGING)) kfree(cur->bc_ops); - if (!(cur->bc_ops->geom_flags & XFS_BTGEO_LONG_PTRS) && cur->bc_ag.pag) + if (!(cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) && cur->bc_ag.pag) xfs_perag_put(cur->bc_ag.pag); kmem_cache_free(cur->bc_cache, cur); } @@ -588,7 +588,7 @@ xfs_btree_dup_cursor( */ static inline size_t xfs_btree_block_len(struct xfs_btree_cur *cur) { - if (cur->bc_ops->geom_flags & XFS_BTGEO_LONG_PTRS) { + if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) { if (xfs_has_crc(cur->bc_mp)) return XFS_BTREE_LBLOCK_CRC_LEN; return XFS_BTREE_LBLOCK_LEN; @@ -598,15 +598,6 @@ static inline size_t xfs_btree_block_len(struct xfs_btree_cur *cur) return XFS_BTREE_SBLOCK_LEN; } -/* - * Return size of btree block pointers for this btree instance. 
- */ -static inline size_t xfs_btree_ptr_len(struct xfs_btree_cur *cur) -{ - return (cur->bc_ops->geom_flags & XFS_BTGEO_LONG_PTRS) ? - sizeof(__be64) : sizeof(__be32); -} - /* * Calculate offset of the n-th record in a btree block. */ @@ -654,7 +645,7 @@ xfs_btree_ptr_offset( { return xfs_btree_block_len(cur) + cur->bc_ops->get_maxrecs(cur, level) * cur->bc_ops->key_len + - (n - 1) * xfs_btree_ptr_len(cur); + (n - 1) * cur->bc_ops->ptr_len; } /* @@ -1002,7 +993,7 @@ xfs_btree_readahead( cur->bc_levels[lev].ra |= lr; block = XFS_BUF_TO_BLOCK(cur->bc_levels[lev].bp); - if (cur->bc_ops->geom_flags & XFS_BTGEO_LONG_PTRS) + if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) return xfs_btree_readahead_lblock(cur, lr, block); return xfs_btree_readahead_sblock(cur, lr, block); } @@ -1021,7 +1012,7 @@ xfs_btree_ptr_to_daddr( if (error) return error; - if (cur->bc_ops->geom_flags & XFS_BTGEO_LONG_PTRS) { + if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) { fsbno = be64_to_cpu(ptr->l); *daddr = XFS_FSB_TO_DADDR(cur->bc_mp, fsbno); } else { @@ -1071,7 +1062,7 @@ xfs_btree_setbuf( cur->bc_levels[lev].ra = 0; b = XFS_BUF_TO_BLOCK(bp); - if (cur->bc_ops->geom_flags & XFS_BTGEO_LONG_PTRS) { + if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) { if (b->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK)) cur->bc_levels[lev].ra |= XFS_BTCUR_LEFTRA; if (b->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK)) @@ -1089,7 +1080,7 @@ xfs_btree_ptr_is_null( struct xfs_btree_cur *cur, const union xfs_btree_ptr *ptr) { - if (cur->bc_ops->geom_flags & XFS_BTGEO_LONG_PTRS) + if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) return ptr->l == cpu_to_be64(NULLFSBLOCK); else return ptr->s == cpu_to_be32(NULLAGBLOCK); @@ -1100,7 +1091,7 @@ xfs_btree_set_ptr_null( struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr) { - if (cur->bc_ops->geom_flags & XFS_BTGEO_LONG_PTRS) + if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) ptr->l = cpu_to_be64(NULLFSBLOCK); else ptr->s = cpu_to_be32(NULLAGBLOCK); @@ -1118,7 +1109,7 @@ xfs_btree_get_sibling( { ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB); - if (cur->bc_ops->geom_flags & XFS_BTGEO_LONG_PTRS) { + if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) { if (lr == XFS_BB_RIGHTSIB) ptr->l = block->bb_u.l.bb_rightsib; else @@ -1140,7 +1131,7 @@ xfs_btree_set_sibling( { ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB); - if (cur->bc_ops->geom_flags & XFS_BTGEO_LONG_PTRS) { + if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) { if (lr == XFS_BB_RIGHTSIB) block->bb_u.l.bb_rightsib = ptr->l; else @@ -1170,7 +1161,7 @@ __xfs_btree_init_block( buf->bb_level = cpu_to_be16(level); buf->bb_numrecs = cpu_to_be16(numrecs); - if (ops->geom_flags & XFS_BTGEO_LONG_PTRS) { + if (ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) { buf->bb_u.l.bb_leftsib = cpu_to_be64(NULLFSBLOCK); buf->bb_u.l.bb_rightsib = cpu_to_be64(NULLFSBLOCK); if (crc) { @@ -1272,7 +1263,7 @@ xfs_btree_buf_to_ptr( struct xfs_buf *bp, union xfs_btree_ptr *ptr) { - if (cur->bc_ops->geom_flags & XFS_BTGEO_LONG_PTRS) + if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp))); else { @@ -1387,7 +1378,7 @@ xfs_btree_copy_ptrs( int numptrs) { ASSERT(numptrs >= 0); - memcpy(dst_ptr, src_ptr, numptrs * xfs_btree_ptr_len(cur)); + memcpy(dst_ptr, src_ptr, numptrs * cur->bc_ops->ptr_len); } /* @@ -1443,8 +1434,8 @@ xfs_btree_shift_ptrs( ASSERT(numptrs >= 0); ASSERT(dir == 1 || dir == -1); - dst_ptr = (char *)ptr + (dir * xfs_btree_ptr_len(cur)); - memmove(dst_ptr, ptr, 
numptrs * xfs_btree_ptr_len(cur)); + dst_ptr = (char *)ptr + (dir * cur->bc_ops->ptr_len); + memmove(dst_ptr, ptr, numptrs * cur->bc_ops->ptr_len); } /* @@ -1570,7 +1561,7 @@ xfs_btree_log_block( nbits = XFS_BB_NUM_BITS; } xfs_btree_offsets(fields, - (cur->bc_ops->geom_flags & XFS_BTGEO_LONG_PTRS) ? + (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) ? loffsets : soffsets, nbits, &first, &last); xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF); @@ -1793,7 +1784,7 @@ xfs_btree_check_block_owner( return NULL; owner = xfs_btree_owner(cur); - if (cur->bc_ops->geom_flags & XFS_BTGEO_LONG_PTRS) { + if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) { if (be64_to_cpu(block->bb_u.l.bb_owner) != owner) return __this_address; } else { @@ -3052,7 +3043,7 @@ xfs_btree_new_iroot( memcpy(cblock, block, xfs_btree_block_len(cur)); if (xfs_has_crc(cur->bc_mp)) { __be64 bno = cpu_to_be64(xfs_buf_daddr(cbp)); - if (cur->bc_ops->geom_flags & XFS_BTGEO_LONG_PTRS) + if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) cblock->bb_u.l.bb_blkno = bno; else cblock->bb_u.s.bb_blkno = bno; @@ -4411,7 +4402,7 @@ xfs_btree_visit_block( * return the same block without checking if the right sibling points * back to us and creates a cyclic reference in the btree. */ - if (cur->bc_ops->geom_flags & XFS_BTGEO_LONG_PTRS) { + if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) { if (be64_to_cpu(rptr.l) == XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp))) { xfs_btree_mark_sick(cur); @@ -4519,7 +4510,7 @@ xfs_btree_block_change_owner( /* modify the owner */ block = xfs_btree_get_block(cur, level, &bp); - if (cur->bc_ops->geom_flags & XFS_BTGEO_LONG_PTRS) { + if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) { if (block->bb_u.l.bb_owner == cpu_to_be64(bbcoi->new_owner)) return 0; block->bb_u.l.bb_owner = cpu_to_be64(bbcoi->new_owner); @@ -5068,7 +5059,7 @@ xfs_btree_diff_two_ptrs( const union xfs_btree_ptr *a, const union xfs_btree_ptr *b) { - if (cur->bc_ops->geom_flags & XFS_BTGEO_LONG_PTRS) + if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) return (int64_t)be64_to_cpu(a->l) - be64_to_cpu(b->l); return (int64_t)be32_to_cpu(a->s) - be32_to_cpu(b->s); } @@ -5216,7 +5207,7 @@ xfs_btree_has_more_records( return true; /* There are more record blocks. 
*/ - if (cur->bc_ops->geom_flags & XFS_BTGEO_LONG_PTRS) + if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) return block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK); else return block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK); diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 2a1f30a849f5..559066e3ac12 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -114,13 +114,17 @@ static inline enum xbtree_key_contig xbtree_key_contig(uint64_t x, uint64_t y) return XBTREE_KEY_OVERLAP; } +#define XFS_BTREE_LONG_PTR_LEN (sizeof(__be64)) +#define XFS_BTREE_SHORT_PTR_LEN (sizeof(__be32)) + struct xfs_btree_ops { /* XFS_BTGEO_* flags that determine the geometry of the btree */ unsigned int geom_flags; - /* size of the key and record structures */ - size_t key_len; - size_t rec_len; + /* size of the key, pointer, and record structures */ + size_t key_len; + size_t ptr_len; + size_t rec_len; /* LRU refcount to set on each btree buffer created */ unsigned int lru_refs; @@ -212,10 +216,9 @@ struct xfs_btree_ops { }; /* btree geometry flags */ -#define XFS_BTGEO_LONG_PTRS (1U << 0) /* pointers are 64bits long */ -#define XFS_BTGEO_ROOT_IN_INODE (1U << 1) /* root may be variable size */ -#define XFS_BTGEO_LASTREC_UPDATE (1U << 2) /* track last rec externally */ -#define XFS_BTGEO_OVERLAPPING (1U << 3) /* overlapping intervals */ +#define XFS_BTGEO_ROOT_IN_INODE (1U << 0) /* root may be variable size */ +#define XFS_BTGEO_LASTREC_UPDATE (1U << 1) /* track last rec externally */ +#define XFS_BTGEO_OVERLAPPING (1U << 2) /* overlapping intervals */ /* * Reasons for the update_lastrec method to be called. @@ -289,8 +292,8 @@ struct xfs_btree_cur /* * Short btree pointers need an agno to be able to turn the pointers * into physical addresses for IO, so the btree cursor switches between - * bc_ino and bc_ag based on whether XFS_BTGEO_LONG_PTRS is set for the - * cursor. + * bc_ino and bc_ag based on whether XFS_BTGEO_ROOT_IN_INODE is set for + * the cursor. */ union { struct xfs_btree_cur_ag bc_ag; @@ -689,7 +692,7 @@ xfs_btree_islastblock( block = xfs_btree_get_block(cur, level, &bp); - if (cur->bc_ops->geom_flags & XFS_BTGEO_LONG_PTRS) + if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) return block->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK); return block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK); } @@ -725,6 +728,9 @@ xfs_btree_alloc_cursor( { struct xfs_btree_cur *cur; + ASSERT(ops->ptr_len == XFS_BTREE_LONG_PTR_LEN || + ops->ptr_len == XFS_BTREE_SHORT_PTR_LEN); + /* BMBT allocations can come through from non-transactional context. 
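To illustrate what the new ptr_len field buys, here is a minimal userspace sketch (invented names, not the kernel code) of computing a block-pointer offset directly from the ops table instead of testing a geometry flag:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define BTREE_LONG_PTR_LEN	sizeof(uint64_t)	/* inode-rooted trees */
#define BTREE_SHORT_PTR_LEN	sizeof(uint32_t)	/* AG-rooted trees */

struct btree_ops {
	size_t	key_len;
	size_t	ptr_len;	/* constant per tree type, set in the ops table */
};

/* Offset of the n-th pointer: block header, then maxrecs keys, then pointers. */
static size_t
ptr_offset(const struct btree_ops *ops, size_t hdr_len, unsigned int maxrecs,
	   unsigned int n)
{
	return hdr_len + maxrecs * ops->key_len + (n - 1) * ops->ptr_len;
}

int
main(void)
{
	struct btree_ops short_ops = { .key_len = 8, .ptr_len = BTREE_SHORT_PTR_LEN };

	/* third pointer in a block with a 56-byte header and 16 keys */
	printf("%zu\n", ptr_offset(&short_ops, 56, 16, 3));
	return 0;
}

The arithmetic is the same as in the xfs_btree_ptr_offset() hunk above; the point is that the pointer size is now plain data, so no branch is needed to pick it.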
*/ cur = kmem_cache_zalloc(cache, GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL); diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index 8ac789650106..964d05f9c5cf 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -401,6 +401,7 @@ xfs_inobt_keys_contiguous( const struct xfs_btree_ops xfs_inobt_ops = { .rec_len = sizeof(xfs_inobt_rec_t), .key_len = sizeof(xfs_inobt_key_t), + .ptr_len = XFS_BTREE_SHORT_PTR_LEN, .lru_refs = XFS_INO_BTREE_REF, .statoff = XFS_STATS_CALC_INDEX(xs_ibt_2), @@ -426,6 +427,7 @@ const struct xfs_btree_ops xfs_inobt_ops = { const struct xfs_btree_ops xfs_finobt_ops = { .rec_len = sizeof(xfs_inobt_rec_t), .key_len = sizeof(xfs_inobt_key_t), + .ptr_len = XFS_BTREE_SHORT_PTR_LEN, .lru_refs = XFS_INO_BTREE_REF, .statoff = XFS_STATS_CALC_INDEX(xs_fibt_2), diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c index b12b1ccd1f27..fb05bc2adc25 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.c +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -320,6 +320,7 @@ xfs_refcountbt_keys_contiguous( const struct xfs_btree_ops xfs_refcountbt_ops = { .rec_len = sizeof(struct xfs_refcount_rec), .key_len = sizeof(struct xfs_refcount_key), + .ptr_len = XFS_BTREE_SHORT_PTR_LEN, .lru_refs = XFS_REFC_BTREE_REF, .statoff = XFS_STATS_CALC_INDEX(xs_refcbt_2), diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index e0ae6da94fc3..71df1d7d0e01 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -477,6 +477,7 @@ const struct xfs_btree_ops xfs_rmapbt_ops = { .rec_len = sizeof(struct xfs_rmap_rec), .key_len = 2 * sizeof(struct xfs_rmap_key), + .ptr_len = XFS_BTREE_SHORT_PTR_LEN, .lru_refs = XFS_RMAP_BTREE_REF, .statoff = XFS_STATS_CALC_INDEX(xs_rmap_2), diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c index e882919996c4..3d3f82007ac6 100644 --- a/fs/xfs/scrub/btree.c +++ b/fs/xfs/scrub/btree.c @@ -244,7 +244,7 @@ xchk_btree_ptr_ok( return true; /* Otherwise, check the pointers. 
*/ - if (bs->cur->bc_ops->geom_flags & XFS_BTGEO_LONG_PTRS) + if (bs->cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) res = xfs_btree_check_lptr(bs->cur, be64_to_cpu(ptr->l), level); else res = xfs_btree_check_sptr(bs->cur, be32_to_cpu(ptr->s), level); @@ -602,7 +602,7 @@ xchk_btree_get_block( return error; xfs_btree_get_block(bs->cur, level, pbp); - if (bs->cur->bc_ops->geom_flags & XFS_BTGEO_LONG_PTRS) + if (bs->cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) failed_at = __xfs_btree_check_lblock(bs->cur, *pblock, level, *pbp); else diff --git a/fs/xfs/scrub/newbt.c b/fs/xfs/scrub/newbt.c index 84267be79dc1..608d7ab01d89 100644 --- a/fs/xfs/scrub/newbt.c +++ b/fs/xfs/scrub/newbt.c @@ -535,7 +535,7 @@ xrep_newbt_claim_block( trace_xrep_newbt_claim_block(mp, resv->pag->pag_agno, agbno, 1, xnr->oinfo.oi_owner); - if (cur->bc_ops->geom_flags & XFS_BTGEO_LONG_PTRS) + if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) ptr->l = cpu_to_be64(XFS_AGB_TO_FSB(mp, resv->pag->pag_agno, agbno)); else diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 921f4b6eee89..0e7620216976 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -2520,7 +2520,7 @@ TRACE_EVENT(xfs_btree_alloc_block, __entry->btnum = cur->bc_btnum; __entry->error = error; if (!error && stat) { - if (cur->bc_ops->geom_flags & XFS_BTGEO_LONG_PTRS) { + if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) { xfs_fsblock_t fsb = be64_to_cpu(ptr->l); __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, @@ -4262,7 +4262,7 @@ TRACE_EVENT(xfs_btree_bload_block, __entry->level = level; __entry->block_idx = block_idx; __entry->nr_blocks = nr_blocks; - if (cur->bc_ops->geom_flags & XFS_BTGEO_LONG_PTRS) { + if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) { xfs_fsblock_t fsb = be64_to_cpu(ptr->l); __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsb); -- cgit v1.2.3 From 4f0cd5a555072e21fb589975607b70798e073f8f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 22 Feb 2024 12:36:17 -0800 Subject: xfs: split out a btree type from the btree ops geometry flags Two of the btree cursor flags are always used together and encode the fundamental btree type. There currently are two such types: 1) an on-disk AG-rooted btree with 32-bit pointers 2) an on-disk inode-rooted btree with 64-bit pointers and we're about to add: 3) an in-memory btree with 64-bit pointers Introduce a new enum and a new type field in struct xfs_btree_ops to encode this type directly instead of using flags and change most code to switch on this enum. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong [djwong: make the pointer lengths explicit] Signed-off-by: Darrick J.
Wong --- fs/xfs/libxfs/xfs_alloc_btree.c | 3 ++ fs/xfs/libxfs/xfs_bmap_btree.c | 2 +- fs/xfs/libxfs/xfs_btree.c | 66 +++++++++++++++++++++----------------- fs/xfs/libxfs/xfs_btree.h | 15 ++++++--- fs/xfs/libxfs/xfs_btree_staging.c | 12 +++---- fs/xfs/libxfs/xfs_btree_staging.h | 3 +- fs/xfs/libxfs/xfs_ialloc_btree.c | 4 +++ fs/xfs/libxfs/xfs_refcount_btree.c | 2 ++ fs/xfs/libxfs/xfs_rmap_btree.c | 1 + fs/xfs/scrub/btree.c | 12 +++---- fs/xfs/scrub/trace.c | 2 +- fs/xfs/xfs_trace.h | 4 +-- 12 files changed, 74 insertions(+), 52 deletions(-) diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index 9b81b4407444..7d9798535dba 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -455,6 +455,8 @@ xfs_allocbt_keys_contiguous( } const struct xfs_btree_ops xfs_bnobt_ops = { + .type = XFS_BTREE_TYPE_AG, + .rec_len = sizeof(xfs_alloc_rec_t), .key_len = sizeof(xfs_alloc_key_t), .ptr_len = XFS_BTREE_SHORT_PTR_LEN, @@ -482,6 +484,7 @@ const struct xfs_btree_ops xfs_bnobt_ops = { }; const struct xfs_btree_ops xfs_cntbt_ops = { + .type = XFS_BTREE_TYPE_AG, .geom_flags = XFS_BTGEO_LASTREC_UPDATE, .rec_len = sizeof(xfs_alloc_rec_t), diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index 59a5f75a45e1..90ab644b98b4 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -525,7 +525,7 @@ xfs_bmbt_keys_contiguous( } const struct xfs_btree_ops xfs_bmbt_ops = { - .geom_flags = XFS_BTGEO_ROOT_IN_INODE, + .type = XFS_BTREE_TYPE_INODE, .rec_len = sizeof(xfs_bmbt_rec_t), .key_len = sizeof(xfs_bmbt_key_t), diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 7907711322d4..e5d4b435a60a 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -447,10 +447,19 @@ xfs_btree_del_cursor( */ ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP || cur->bc_ino.allocated == 0 || xfs_is_shutdown(cur->bc_mp) || error != 0); + + switch (cur->bc_ops->type) { + case XFS_BTREE_TYPE_AG: + if (cur->bc_ag.pag) + xfs_perag_put(cur->bc_ag.pag); + break; + case XFS_BTREE_TYPE_INODE: + /* nothing to do */ + break; + } + if (unlikely(cur->bc_flags & XFS_BTREE_STAGING)) kfree(cur->bc_ops); - if (!(cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) && cur->bc_ag.pag) - xfs_perag_put(cur->bc_ag.pag); kmem_cache_free(cur->bc_cache, cur); } @@ -708,7 +717,7 @@ struct xfs_ifork * xfs_btree_ifork_ptr( struct xfs_btree_cur *cur) { - ASSERT(cur->bc_ops->geom_flags & XFS_BTGEO_ROOT_IN_INODE); + ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE); if (cur->bc_flags & XFS_BTREE_STAGING) return cur->bc_ino.ifake->if_fork; @@ -740,8 +749,8 @@ xfs_btree_get_block( int level, /* level in btree */ struct xfs_buf **bpp) /* buffer containing the block */ { - if ((cur->bc_ops->geom_flags & XFS_BTGEO_ROOT_IN_INODE) && - (level == cur->bc_nlevels - 1)) { + if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE && + level == cur->bc_nlevels - 1) { *bpp = NULL; return xfs_btree_get_iroot(cur); } @@ -983,8 +992,8 @@ xfs_btree_readahead( * No readahead needed if we are at the root level and the * btree root is stored in the inode. 
*/ - if ((cur->bc_ops->geom_flags & XFS_BTGEO_ROOT_IN_INODE) && - (lev == cur->bc_nlevels - 1)) + if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE && + lev == cur->bc_nlevels - 1) return 0; if ((cur->bc_levels[lev].ra | lr) == cur->bc_levels[lev].ra) @@ -1172,14 +1181,12 @@ __xfs_btree_init_block( buf->bb_u.l.bb_lsn = 0; } } else { - /* owner is a 32 bit value on short blocks */ - __u32 __owner = (__u32)owner; - buf->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK); buf->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK); if (crc) { buf->bb_u.s.bb_blkno = cpu_to_be64(blkno); - buf->bb_u.s.bb_owner = cpu_to_be32(__owner); + /* owner is a 32 bit value on short blocks */ + buf->bb_u.s.bb_owner = cpu_to_be32((__u32)owner); uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid); buf->bb_u.s.bb_lsn = 0; } @@ -1217,7 +1224,7 @@ static inline __u64 xfs_btree_owner( struct xfs_btree_cur *cur) { - if (cur->bc_ops->geom_flags & XFS_BTGEO_ROOT_IN_INODE) + if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) return cur->bc_ino.ip->i_ino; return cur->bc_ag.pag->pag_agno; } @@ -1638,7 +1645,7 @@ xfs_btree_increment( * confused or have the tree root in an inode. */ if (lev == cur->bc_nlevels) { - if (cur->bc_ops->geom_flags & XFS_BTGEO_ROOT_IN_INODE) + if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) goto out0; ASSERT(0); xfs_btree_mark_sick(cur); @@ -1732,7 +1739,7 @@ xfs_btree_decrement( * or the root of the tree is in an inode. */ if (lev == cur->bc_nlevels) { - if (cur->bc_ops->geom_flags & XFS_BTGEO_ROOT_IN_INODE) + if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) goto out0; ASSERT(0); xfs_btree_mark_sick(cur); @@ -1807,8 +1814,8 @@ xfs_btree_lookup_get_block( int error = 0; /* special case the root block if in an inode */ - if ((cur->bc_ops->geom_flags & XFS_BTGEO_ROOT_IN_INODE) && - (level == cur->bc_nlevels - 1)) { + if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE && + level == cur->bc_nlevels - 1) { *blkp = xfs_btree_get_iroot(cur); return 0; } @@ -2343,7 +2350,7 @@ xfs_btree_lshift( int error; /* error return value */ int i; - if ((cur->bc_ops->geom_flags & XFS_BTGEO_ROOT_IN_INODE) && + if ((cur->bc_ops->type == XFS_BTREE_TYPE_INODE) && level == cur->bc_nlevels - 1) goto out0; @@ -2539,8 +2546,8 @@ xfs_btree_rshift( int error; /* error return value */ int i; /* loop counter */ - if ((cur->bc_ops->geom_flags & XFS_BTGEO_ROOT_IN_INODE) && - (level == cur->bc_nlevels - 1)) + if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE && + level == cur->bc_nlevels - 1) goto out0; /* Set up variables for this block as "left". */ @@ -2990,7 +2997,6 @@ xfs_btree_split( #define xfs_btree_split __xfs_btree_split #endif /* __KERNEL__ */ - /* * Copy the old inode root contents into a real block and make the * broot point to it. @@ -3015,7 +3021,7 @@ xfs_btree_new_iroot( XFS_BTREE_STATS_INC(cur, newroot); - ASSERT(cur->bc_ops->geom_flags & XFS_BTGEO_ROOT_IN_INODE); + ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE); level = cur->bc_nlevels - 1; @@ -3240,7 +3246,7 @@ xfs_btree_make_block_unfull( { int error = 0; - if ((cur->bc_ops->geom_flags & XFS_BTGEO_ROOT_IN_INODE) && + if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE && level == cur->bc_nlevels - 1) { struct xfs_inode *ip = cur->bc_ino.ip; @@ -3326,8 +3332,8 @@ xfs_btree_insrec( * If we have an external root pointer, and we've made it to the * root level, allocate a new root block and we're done. 
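The shape of this conversion, restated as a small sketch under invented names (the kernel names differ only cosmetically): an explicit enum lets call sites switch on the tree type, and the compiler can then warn when a new type is added and a case is missing.

enum btree_type {
	BTREE_TYPE_AG,		/* rooted in an allocation group header */
	BTREE_TYPE_INODE,	/* rooted in an inode fork */
};

struct btree_ops {
	enum btree_type	type;
};

struct btree_cur {
	const struct btree_ops	*ops;
};

static void
put_cursor_resources(struct btree_cur *cur)
{
	switch (cur->ops->type) {
	case BTREE_TYPE_AG:
		/* drop the AG reference held by the cursor */
		break;
	case BTREE_TYPE_INODE:
		/* nothing to release */
		break;
	}
}

int
main(void)
{
	struct btree_ops ag_ops = { .type = BTREE_TYPE_AG };
	struct btree_cur cur = { .ops = &ag_ops };

	put_cursor_resources(&cur);
	return 0;
}

This mirrors the xfs_btree_del_cursor() hunk above; with -Wswitch an unhandled enum value is flagged, which a flag test cannot do.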
*/ - if (!(cur->bc_ops->geom_flags & XFS_BTGEO_ROOT_IN_INODE) && - (level >= cur->bc_nlevels)) { + if (cur->bc_ops->type != XFS_BTREE_TYPE_INODE && + level >= cur->bc_nlevels) { error = xfs_btree_new_root(cur, stat); xfs_btree_set_ptr_null(cur, ptrp); @@ -3614,7 +3620,7 @@ xfs_btree_kill_iroot( #endif int i; - ASSERT(cur->bc_ops->geom_flags & XFS_BTGEO_ROOT_IN_INODE); + ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE); ASSERT(cur->bc_nlevels > 1); /* @@ -3851,7 +3857,7 @@ xfs_btree_delrec( * nothing left to do. */ if (level == cur->bc_nlevels - 1) { - if (cur->bc_ops->geom_flags & XFS_BTGEO_ROOT_IN_INODE) { + if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) { xfs_iroot_realloc(cur->bc_ino.ip, -1, cur->bc_ino.whichfork); @@ -3919,7 +3925,7 @@ xfs_btree_delrec( xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB); xfs_btree_get_sibling(cur, block, &lptr, XFS_BB_LEFTSIB); - if (cur->bc_ops->geom_flags & XFS_BTGEO_ROOT_IN_INODE) { + if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) { /* * One child of root, need to get a chance to copy its contents * into the root and delete it. Can't go up to next level, @@ -4236,8 +4242,8 @@ xfs_btree_delrec( * If we joined with the right neighbor and there's a level above * us, increment the cursor at that level. */ - else if ((cur->bc_ops->geom_flags & XFS_BTGEO_ROOT_IN_INODE) || - (level + 1 < cur->bc_nlevels)) { + else if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE || + level + 1 < cur->bc_nlevels) { error = xfs_btree_increment(cur, level + 1, &i); if (error) goto error0; @@ -4528,7 +4534,7 @@ xfs_btree_block_change_owner( * though, so everything is consistent in memory. */ if (!bp) { - ASSERT(cur->bc_ops->geom_flags & XFS_BTGEO_ROOT_IN_INODE); + ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE); ASSERT(level == cur->bc_nlevels - 1); return 0; } diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 559066e3ac12..5f2b5ef858ee 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -117,7 +117,15 @@ static inline enum xbtree_key_contig xbtree_key_contig(uint64_t x, uint64_t y) #define XFS_BTREE_LONG_PTR_LEN (sizeof(__be64)) #define XFS_BTREE_SHORT_PTR_LEN (sizeof(__be32)) +enum xfs_btree_type { + XFS_BTREE_TYPE_AG, + XFS_BTREE_TYPE_INODE, +}; + struct xfs_btree_ops { + /* Type of btree - AG-rooted or inode-rooted */ + enum xfs_btree_type type; + /* XFS_BTGEO_* flags that determine the geometry of the btree */ unsigned int geom_flags; @@ -216,9 +224,8 @@ struct xfs_btree_ops { }; /* btree geometry flags */ -#define XFS_BTGEO_ROOT_IN_INODE (1U << 0) /* root may be variable size */ -#define XFS_BTGEO_LASTREC_UPDATE (1U << 1) /* track last rec externally */ -#define XFS_BTGEO_OVERLAPPING (1U << 2) /* overlapping intervals */ +#define XFS_BTGEO_LASTREC_UPDATE (1U << 0) /* track last rec externally */ +#define XFS_BTGEO_OVERLAPPING (1U << 1) /* overlapping intervals */ /* * Reasons for the update_lastrec method to be called. @@ -292,7 +299,7 @@ struct xfs_btree_cur /* * Short btree pointers need an agno to be able to turn the pointers * into physical addresses for IO, so the btree cursor switches between - * bc_ino and bc_ag based on whether XFS_BTGEO_ROOT_IN_INODE is set for + * bc_ino and bc_ag based on bc_ops->type. * the cursor. 
*/ union { diff --git a/fs/xfs/libxfs/xfs_btree_staging.c b/fs/xfs/libxfs/xfs_btree_staging.c index e31e4bd21718..2fef3fd4d634 100644 --- a/fs/xfs/libxfs/xfs_btree_staging.c +++ b/fs/xfs/libxfs/xfs_btree_staging.c @@ -136,7 +136,7 @@ xfs_btree_stage_afakeroot( struct xfs_btree_ops *nops; ASSERT(!(cur->bc_flags & XFS_BTREE_STAGING)); - ASSERT(!(cur->bc_ops->geom_flags & XFS_BTGEO_ROOT_IN_INODE)); + ASSERT(cur->bc_ops->type != XFS_BTREE_TYPE_INODE); ASSERT(cur->bc_tp == NULL); nops = kmalloc(sizeof(struct xfs_btree_ops), GFP_KERNEL | __GFP_NOFAIL); @@ -217,7 +217,7 @@ xfs_btree_stage_ifakeroot( struct xfs_btree_ops *nops; ASSERT(!(cur->bc_flags & XFS_BTREE_STAGING)); - ASSERT(cur->bc_ops->geom_flags & XFS_BTGEO_ROOT_IN_INODE); + ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE); ASSERT(cur->bc_tp == NULL); nops = kmalloc(sizeof(struct xfs_btree_ops), GFP_KERNEL | __GFP_NOFAIL); @@ -397,7 +397,7 @@ xfs_btree_bload_prep_block( struct xfs_btree_block *new_block; int ret; - if ((cur->bc_ops->geom_flags & XFS_BTGEO_ROOT_IN_INODE) && + if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE && level == cur->bc_nlevels - 1) { struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur); size_t new_size; @@ -702,7 +702,7 @@ xfs_btree_bload_compute_geometry( xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level, &avg_per_block, &level_blocks, &dontcare64); - if (cur->bc_ops->geom_flags & XFS_BTGEO_ROOT_IN_INODE) { + if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) { /* * If all the items we want to store at this level * would fit in the inode root block, then we have our @@ -761,7 +761,7 @@ xfs_btree_bload_compute_geometry( return -EOVERFLOW; bbl->btree_height = cur->bc_nlevels; - if (cur->bc_ops->geom_flags & XFS_BTGEO_ROOT_IN_INODE) + if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) bbl->nr_blocks = nr_blocks - 1; else bbl->nr_blocks = nr_blocks; @@ -888,7 +888,7 @@ xfs_btree_bload( } /* Initialize the new root. */ - if (cur->bc_ops->geom_flags & XFS_BTGEO_ROOT_IN_INODE) { + if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) { ASSERT(xfs_btree_ptr_is_null(cur, &ptr)); cur->bc_ino.ifake->if_levels = cur->bc_nlevels; cur->bc_ino.ifake->if_blocks = total_blocks - 1; diff --git a/fs/xfs/libxfs/xfs_btree_staging.h b/fs/xfs/libxfs/xfs_btree_staging.h index 9624ae06c83c..8e29cd3cc0f1 100644 --- a/fs/xfs/libxfs/xfs_btree_staging.h +++ b/fs/xfs/libxfs/xfs_btree_staging.h @@ -76,8 +76,7 @@ struct xfs_btree_bload { /* * This function should return the size of the in-core btree root - * block. It is only necessary for XFS_BTGEO_ROOT_IN_INODE btree - * types. + * block. It is only necessary for XFS_BTREE_TYPE_INODE btrees. 
*/ xfs_btree_bload_iroot_size_fn iroot_size; diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index 964d05f9c5cf..fc584424ebdf 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -399,6 +399,8 @@ xfs_inobt_keys_contiguous( } const struct xfs_btree_ops xfs_inobt_ops = { + .type = XFS_BTREE_TYPE_AG, + .rec_len = sizeof(xfs_inobt_rec_t), .key_len = sizeof(xfs_inobt_key_t), .ptr_len = XFS_BTREE_SHORT_PTR_LEN, @@ -425,6 +427,8 @@ const struct xfs_btree_ops xfs_inobt_ops = { }; const struct xfs_btree_ops xfs_finobt_ops = { + .type = XFS_BTREE_TYPE_AG, + .rec_len = sizeof(xfs_inobt_rec_t), .key_len = sizeof(xfs_inobt_key_t), .ptr_len = XFS_BTREE_SHORT_PTR_LEN, diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c index fb05bc2adc25..52f3e4a30ecc 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.c +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -318,6 +318,8 @@ xfs_refcountbt_keys_contiguous( } const struct xfs_btree_ops xfs_refcountbt_ops = { + .type = XFS_BTREE_TYPE_AG, + .rec_len = sizeof(struct xfs_refcount_rec), .key_len = sizeof(struct xfs_refcount_key), .ptr_len = XFS_BTREE_SHORT_PTR_LEN, diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index 71df1d7d0e01..62efcfaa4173 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -473,6 +473,7 @@ xfs_rmapbt_keys_contiguous( } const struct xfs_btree_ops xfs_rmapbt_ops = { + .type = XFS_BTREE_TYPE_AG, .geom_flags = XFS_BTGEO_OVERLAPPING, .rec_len = sizeof(struct xfs_rmap_rec), diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c index 3d3f82007ac6..71cfb2a45468 100644 --- a/fs/xfs/scrub/btree.c +++ b/fs/xfs/scrub/btree.c @@ -47,7 +47,7 @@ __xchk_btree_process_error( *error = 0; fallthrough; default: - if (cur->bc_ops->geom_flags & XFS_BTGEO_ROOT_IN_INODE) + if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) trace_xchk_ifork_btree_op_error(sc, cur, level, *error, ret_ip); else @@ -91,7 +91,7 @@ __xchk_btree_set_corrupt( { sc->sm->sm_flags |= errflag; - if (cur->bc_ops->geom_flags & XFS_BTGEO_ROOT_IN_INODE) + if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) trace_xchk_ifork_btree_error(sc, cur, level, ret_ip); else @@ -239,7 +239,7 @@ xchk_btree_ptr_ok( bool res; /* A btree rooted in an inode has no block pointer to the root. */ - if ((bs->cur->bc_ops->geom_flags & XFS_BTGEO_ROOT_IN_INODE) && + if (bs->cur->bc_ops->type == XFS_BTREE_TYPE_INODE && level == bs->cur->bc_nlevels) return true; @@ -390,7 +390,7 @@ xchk_btree_check_block_owner( * sc->sa so that we can check for the presence of an ownership record * in the rmap btree for the AG containing the block. */ - init_sa = bs->cur->bc_ops->geom_flags & XFS_BTGEO_ROOT_IN_INODE; + init_sa = bs->cur->bc_ops->type != XFS_BTREE_TYPE_AG; if (init_sa) { error = xchk_ag_init_existing(bs->sc, agno, &bs->sc->sa); if (!xchk_btree_xref_process_error(bs->sc, bs->cur, @@ -434,7 +434,7 @@ xchk_btree_check_owner( * up. */ if (bp == NULL) { - if (!(cur->bc_ops->geom_flags & XFS_BTGEO_ROOT_IN_INODE)) + if (cur->bc_ops->type != XFS_BTREE_TYPE_INODE) xchk_btree_set_corrupt(bs->sc, bs->cur, level); return 0; } @@ -513,7 +513,7 @@ xchk_btree_check_minrecs( * child block might be less than the standard minrecs, but that's ok * provided that there's only one direct child of the root. 
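One consequence worth spelling out, as a hedged sketch with invented names: because an inode-rooted tree keeps its root in the inode fork rather than in a disk block, the bulk loader has one fewer on-disk block to write.

#include <stdio.h>

enum btree_type { BTREE_TYPE_AG, BTREE_TYPE_INODE };

/* Blocks the loader must write; inode-rooted trees keep the root in core. */
static unsigned long long
bload_nr_blocks(enum btree_type type, unsigned long long total_blocks)
{
	if (type == BTREE_TYPE_INODE)
		return total_blocks - 1;
	return total_blocks;
}

int
main(void)
{
	printf("%llu\n", bload_nr_blocks(BTREE_TYPE_INODE, 10));	/* 9 */
	return 0;
}

This matches the xfs_btree_bload_compute_geometry() hunk above, where nr_blocks is decremented for XFS_BTREE_TYPE_INODE.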
*/ - if ((cur->bc_ops->geom_flags & XFS_BTGEO_ROOT_IN_INODE) && + if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE && level == cur->bc_nlevels - 2) { struct xfs_btree_block *root_block; struct xfs_buf *root_bp; diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c index a7b0d95b8d5d..dc8a331f4b02 100644 --- a/fs/xfs/scrub/trace.c +++ b/fs/xfs/scrub/trace.c @@ -37,7 +37,7 @@ xchk_btree_cur_fsbno( xfs_buf_daddr(cur->bc_levels[level].bp)); if (level == cur->bc_nlevels - 1 && - (cur->bc_ops->geom_flags & XFS_BTGEO_ROOT_IN_INODE)) + cur->bc_ops->type == XFS_BTREE_TYPE_INODE) return XFS_INO_TO_FSB(cur->bc_mp, cur->bc_ino.ip->i_ino); return NULLFSBLOCK; diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 0e7620216976..9d0d7085b079 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -2510,7 +2510,7 @@ TRACE_EVENT(xfs_btree_alloc_block, ), TP_fast_assign( __entry->dev = cur->bc_mp->m_super->s_dev; - if (cur->bc_ops->geom_flags & XFS_BTGEO_ROOT_IN_INODE) { + if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) { __entry->agno = 0; __entry->ino = cur->bc_ino.ip->i_ino; } else { @@ -2557,7 +2557,7 @@ TRACE_EVENT(xfs_btree_free_block, __entry->dev = cur->bc_mp->m_super->s_dev; __entry->agno = xfs_daddr_to_agno(cur->bc_mp, xfs_buf_daddr(bp)); - if (cur->bc_ops->geom_flags & XFS_BTGEO_ROOT_IN_INODE) + if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) __entry->ino = cur->bc_ino.ip->i_ino; else __entry->ino = 0; -- cgit v1.2.3 From 88ee2f4849119b82b95d6e8e2d9daa81214eb080 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 22 Feb 2024 12:37:03 -0800 Subject: xfs: split the per-btree union in struct xfs_btree_cur Split up the union that encodes btree-specific fields in struct xfs_btree_cur. Most fields in there are specific to the btree type encoded in xfs_btree_ops.type, and we can use the obviously named union for that. But one field is specific to the bmapbt and two are shared by the refcount and rtrefcountbt. Move those to a separate union to make the usage clear and not need a separate struct for the refcount-related fields. This will also make unnecessary some very awkward btree cursor refc/rtrefc switching logic in the rtrefcount patchset. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. 
Wong --- fs/xfs/libxfs/xfs_bmap.c | 22 +++++++-------- fs/xfs/libxfs/xfs_bmap_btree.c | 9 +++---- fs/xfs/libxfs/xfs_btree.c | 2 +- fs/xfs/libxfs/xfs_btree.h | 55 ++++++++++++++++---------------------- fs/xfs/libxfs/xfs_btree_staging.c | 1 + fs/xfs/libxfs/xfs_refcount.c | 24 ++++++++--------- fs/xfs/libxfs/xfs_refcount_btree.c | 4 +-- 7 files changed, 54 insertions(+), 63 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index c196fd0a3d7e..6a374a6e8586 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -676,7 +676,7 @@ xfs_bmap_extents_to_btree( goto out_root_realloc; } - cur->bc_ino.allocated++; + cur->bc_bmap.allocated++; ip->i_nblocks++; xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1L); error = xfs_trans_get_buf(tp, mp->m_ddev_targp, @@ -894,7 +894,7 @@ xfs_bmap_add_attrfork_btree( xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); return -ENOSPC; } - cur->bc_ino.allocated = 0; + cur->bc_bmap.allocated = 0; xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); } return 0; @@ -922,7 +922,7 @@ xfs_bmap_add_attrfork_extents( error = xfs_bmap_extents_to_btree(tp, ip, &cur, 0, flags, XFS_DATA_FORK); if (cur) { - cur->bc_ino.allocated = 0; + cur->bc_bmap.allocated = 0; xfs_btree_del_cursor(cur, error); } return error; @@ -1746,7 +1746,7 @@ xfs_bmap_add_extent_delay_real( temp = PREV.br_blockcount - new->br_blockcount; da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp), startblockval(PREV.br_startblock) - - (bma->cur ? bma->cur->bc_ino.allocated : 0)); + (bma->cur ? bma->cur->bc_bmap.allocated : 0)); PREV.br_startoff = new_endoff; PREV.br_blockcount = temp; @@ -1836,7 +1836,7 @@ xfs_bmap_add_extent_delay_real( temp = PREV.br_blockcount - new->br_blockcount; da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp), startblockval(PREV.br_startblock) - - (bma->cur ? bma->cur->bc_ino.allocated : 0)); + (bma->cur ? bma->cur->bc_bmap.allocated : 0)); PREV.br_startblock = nullstartblock(da_new); PREV.br_blockcount = temp; @@ -1959,8 +1959,8 @@ xfs_bmap_add_extent_delay_real( xfs_mod_delalloc(mp, (int64_t)da_new - da_old); if (bma->cur) { - da_new += bma->cur->bc_ino.allocated; - bma->cur->bc_ino.allocated = 0; + da_new += bma->cur->bc_bmap.allocated; + bma->cur->bc_bmap.allocated = 0; } /* adjust for changes in reserved delayed indirect blocks */ @@ -2525,7 +2525,7 @@ xfs_bmap_add_extent_unwritten_real( /* clear out the allocated field, done with it now in any case. */ if (cur) { - cur->bc_ino.allocated = 0; + cur->bc_bmap.allocated = 0; *curp = cur; } @@ -2913,7 +2913,7 @@ xfs_bmap_add_extent_hole_real( /* clear out the allocated field, done with it now in any case. 
*/ if (cur) - cur->bc_ino.allocated = 0; + cur->bc_bmap.allocated = 0; xfs_bmap_check_leaf_extents(cur, ip, whichfork); done: @@ -5629,7 +5629,7 @@ error0: xfs_trans_log_inode(tp, ip, logflags); if (cur) { if (!error) - cur->bc_ino.allocated = 0; + cur->bc_bmap.allocated = 0; xfs_btree_del_cursor(cur, error); } return error; @@ -6145,7 +6145,7 @@ xfs_bmap_split_extent( del_cursor: if (cur) { - cur->bc_ino.allocated = 0; + cur->bc_bmap.allocated = 0; xfs_btree_del_cursor(cur, error); } diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index 90ab644b98b4..5dad3db4affa 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -198,10 +198,10 @@ xfs_bmbt_update_cursor( ASSERT((dst->bc_tp->t_highest_agno != NULLAGNUMBER) || (dst->bc_ino.ip->i_diflags & XFS_DIFLAG_REALTIME)); - dst->bc_ino.allocated += src->bc_ino.allocated; + dst->bc_bmap.allocated += src->bc_bmap.allocated; dst->bc_tp->t_highest_agno = src->bc_tp->t_highest_agno; - src->bc_ino.allocated = 0; + src->bc_bmap.allocated = 0; } STATIC int @@ -256,7 +256,7 @@ xfs_bmbt_alloc_block( } ASSERT(args.len == 1); - cur->bc_ino.allocated++; + cur->bc_bmap.allocated++; cur->bc_ino.ip->i_nblocks++; xfs_trans_log_inode(args.tp, cur->bc_ino.ip, XFS_ILOG_CORE); xfs_trans_mod_dquot_byino(args.tp, cur->bc_ino.ip, @@ -568,8 +568,7 @@ xfs_bmbt_init_common( mp->m_bm_maxlevels[whichfork], xfs_bmbt_cur_cache); cur->bc_ino.ip = ip; - cur->bc_ino.allocated = 0; - + cur->bc_bmap.allocated = 0; return cur; } diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index e5d4b435a60a..6de646e7a65c 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -445,7 +445,7 @@ xfs_btree_del_cursor( * zero, then we should be shut down or on our way to shutdown due to * cancelling a dirty transaction on error. */ - ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP || cur->bc_ino.allocated == 0 || + ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP || cur->bc_bmap.allocated == 0 || xfs_is_shutdown(cur->bc_mp) || error != 0); switch (cur->bc_ops->type) { diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 5f2b5ef858ee..153d867259f3 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -243,30 +243,6 @@ union xfs_btree_irec { struct xfs_refcount_irec rc; }; -/* Per-AG btree information. */ -struct xfs_btree_cur_ag { - struct xfs_perag *pag; - union { - struct xfs_buf *agbp; - struct xbtree_afakeroot *afake; /* for staging cursor */ - }; - union { - struct { - unsigned int nr_ops; /* # record updates */ - unsigned int shape_changes; /* # of extent splits */ - } refc; - }; -}; - -/* Btree-in-inode cursor information */ -struct xfs_btree_cur_ino { - struct xfs_inode *ip; - struct xbtree_ifakeroot *ifake; /* for staging cursor */ - int allocated; - short forksize; - char whichfork; -}; - struct xfs_btree_level { /* buffer pointer */ struct xfs_buf *bp; @@ -296,15 +272,30 @@ struct xfs_btree_cur uint8_t bc_nlevels; /* number of levels in the tree */ uint8_t bc_maxlevels; /* maximum levels for this btree type */ - /* - * Short btree pointers need an agno to be able to turn the pointers - * into physical addresses for IO, so the btree cursor switches between - * bc_ino and bc_ag based on bc_ops->type. - * the cursor. 
- */ + /* per-type information */ + union { + struct { + struct xfs_inode *ip; + short forksize; + char whichfork; + struct xbtree_ifakeroot *ifake; /* for staging cursor */ + } bc_ino; + struct { + struct xfs_perag *pag; + struct xfs_buf *agbp; + struct xbtree_afakeroot *afake; /* for staging cursor */ + } bc_ag; + }; + + /* per-format private data */ union { - struct xfs_btree_cur_ag bc_ag; - struct xfs_btree_cur_ino bc_ino; + struct { + int allocated; + } bc_bmap; /* bmapbt */ + struct { + unsigned int nr_ops; /* # record updates */ + unsigned int shape_changes; /* # of extent splits */ + } bc_refc; /* refcountbt */ }; /* Must be at the end of the struct! */ diff --git a/fs/xfs/libxfs/xfs_btree_staging.c b/fs/xfs/libxfs/xfs_btree_staging.c index 2fef3fd4d634..0d2d7f2bea00 100644 --- a/fs/xfs/libxfs/xfs_btree_staging.c +++ b/fs/xfs/libxfs/xfs_btree_staging.c @@ -172,6 +172,7 @@ xfs_btree_commit_afakeroot( trace_xfs_btree_commit_afakeroot(cur); kfree((void *)cur->bc_ops); + cur->bc_ag.afake = NULL; cur->bc_ag.agbp = agbp; cur->bc_ops = ops; cur->bc_flags &= ~XFS_BTREE_STAGING; diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index db394125d783..511c912d515c 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -1077,7 +1077,7 @@ xfs_refcount_still_have_space( * to handle each of the shape changes to the refcount btree. */ overhead = xfs_allocfree_block_count(cur->bc_mp, - cur->bc_ag.refc.shape_changes); + cur->bc_refc.shape_changes); overhead += cur->bc_mp->m_refc_maxlevels; overhead *= cur->bc_mp->m_sb.sb_blocksize; @@ -1085,17 +1085,17 @@ xfs_refcount_still_have_space( * Only allow 2 refcount extent updates per transaction if the * refcount continue update "error" has been injected. */ - if (cur->bc_ag.refc.nr_ops > 2 && + if (cur->bc_refc.nr_ops > 2 && XFS_TEST_ERROR(false, cur->bc_mp, XFS_ERRTAG_REFCOUNT_CONTINUE_UPDATE)) return false; - if (cur->bc_ag.refc.nr_ops == 0) + if (cur->bc_refc.nr_ops == 0) return true; else if (overhead > cur->bc_tp->t_log_res) return false; - return cur->bc_tp->t_log_res - overhead > - cur->bc_ag.refc.nr_ops * XFS_REFCOUNT_ITEM_OVERHEAD; + return cur->bc_tp->t_log_res - overhead > + cur->bc_refc.nr_ops * XFS_REFCOUNT_ITEM_OVERHEAD; } /* @@ -1155,7 +1155,7 @@ xfs_refcount_adjust_extents( * Either cover the hole (increment) or * delete the range (decrement). 
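For orientation, the resulting cursor layout reduces to something like the following sketch (abridged, invented names; the real fields are in the hunk above): one union selected by the tree type and a second selected by the specific btree.

#include <stdio.h>

struct btree_cur_sketch {
	enum { TYPE_AG, TYPE_INODE } type;

	/* per-type: where the root lives and how to reach it */
	union {
		struct { void *ip;  int whichfork; } ino;
		struct { void *pag; void *agbp;    } ag;
	};

	/* per-btree private state */
	union {
		struct { int allocated; } bmap;		/* bmapbt */
		struct {
			unsigned int nr_ops;
			unsigned int shape_changes;
		} refc;					/* refcountbt */
	};
};

int
main(void)
{
	struct btree_cur_sketch cur = { .type = TYPE_INODE };

	cur.bmap.allocated = 1;
	printf("%d\n", cur.bmap.allocated);
	return 0;
}

Keeping the two unions separate is what lets a future rt-refcount btree (inode-rooted, so using the ino fields) still share the refcount bookkeeping in the refc fields.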
*/ - cur->bc_ag.refc.nr_ops++; + cur->bc_refc.nr_ops++; if (tmp.rc_refcount) { error = xfs_refcount_insert(cur, &tmp, &found_tmp); @@ -1216,7 +1216,7 @@ xfs_refcount_adjust_extents( ext.rc_refcount += adj; trace_xfs_refcount_modify_extent(cur->bc_mp, cur->bc_ag.pag->pag_agno, &ext); - cur->bc_ag.refc.nr_ops++; + cur->bc_refc.nr_ops++; if (ext.rc_refcount > 1) { error = xfs_refcount_update(cur, &ext); if (error) @@ -1305,7 +1305,7 @@ xfs_refcount_adjust( if (shape_changed) shape_changes++; if (shape_changes) - cur->bc_ag.refc.shape_changes++; + cur->bc_refc.shape_changes++; /* Now that we've taken care of the ends, adjust the middle extents */ error = xfs_refcount_adjust_extents(cur, agbno, aglen, adj); @@ -1400,8 +1400,8 @@ xfs_refcount_finish_one( */ rcur = *pcur; if (rcur != NULL && rcur->bc_ag.pag != ri->ri_pag) { - nr_ops = rcur->bc_ag.refc.nr_ops; - shape_changes = rcur->bc_ag.refc.shape_changes; + nr_ops = rcur->bc_refc.nr_ops; + shape_changes = rcur->bc_refc.shape_changes; xfs_refcount_finish_one_cleanup(tp, rcur, 0); rcur = NULL; *pcur = NULL; @@ -1413,8 +1413,8 @@ xfs_refcount_finish_one( return error; rcur = xfs_refcountbt_init_cursor(mp, tp, agbp, ri->ri_pag); - rcur->bc_ag.refc.nr_ops = nr_ops; - rcur->bc_ag.refc.shape_changes = shape_changes; + rcur->bc_refc.nr_ops = nr_ops; + rcur->bc_refc.shape_changes = shape_changes; } *pcur = rcur; diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c index 52f3e4a30ecc..2eb94f18ff33 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.c +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -362,8 +362,8 @@ xfs_refcountbt_init_common( &xfs_refcountbt_ops, mp->m_refc_maxlevels, xfs_refcountbt_cur_cache); cur->bc_ag.pag = xfs_perag_hold(pag); - cur->bc_ag.refc.nr_ops = 0; - cur->bc_ag.refc.shape_changes = 0; + cur->bc_refc.nr_ops = 0; + cur->bc_refc.shape_changes = 0; return cur; } -- cgit v1.2.3 From f73def90a7cd24a32a42f689efba6a7a35edeb7b Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 22 Feb 2024 12:37:24 -0800 Subject: xfs: create predicate to determine if cursor is at inode root level Create a predicate to decide if the given cursor and level point to the root block in the inode immediate area instead of a disk block, and get rid of the open-coded logic everywhere. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_btree.c | 52 +++++++++++++++++---------------------- fs/xfs/libxfs/xfs_btree.h | 10 ++++++++ fs/xfs/libxfs/xfs_btree_staging.c | 3 +-- 3 files changed, 33 insertions(+), 32 deletions(-) diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 6de646e7a65c..5d0b1084cba8 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -749,8 +749,7 @@ xfs_btree_get_block( int level, /* level in btree */ struct xfs_buf **bpp) /* buffer containing the block */ { - if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE && - level == cur->bc_nlevels - 1) { + if (xfs_btree_at_iroot(cur, level)) { *bpp = NULL; return xfs_btree_get_iroot(cur); } @@ -992,8 +991,7 @@ xfs_btree_readahead( * No readahead needed if we are at the root level and the * btree root is stored in the inode. 
*/ - if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE && - lev == cur->bc_nlevels - 1) + if (xfs_btree_at_iroot(cur, lev)) return 0; if ((cur->bc_levels[lev].ra | lr) == cur->bc_levels[lev].ra) @@ -1814,8 +1812,7 @@ xfs_btree_lookup_get_block( int error = 0; /* special case the root block if in an inode */ - if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE && - level == cur->bc_nlevels - 1) { + if (xfs_btree_at_iroot(cur, level)) { *blkp = xfs_btree_get_iroot(cur); return 0; } @@ -2350,8 +2347,7 @@ xfs_btree_lshift( int error; /* error return value */ int i; - if ((cur->bc_ops->type == XFS_BTREE_TYPE_INODE) && - level == cur->bc_nlevels - 1) + if (xfs_btree_at_iroot(cur, level)) goto out0; /* Set up variables for this block as "right". */ @@ -2546,8 +2542,7 @@ xfs_btree_rshift( int error; /* error return value */ int i; /* loop counter */ - if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE && - level == cur->bc_nlevels - 1) + if (xfs_btree_at_iroot(cur, level)) goto out0; /* Set up variables for this block as "left". */ @@ -3246,8 +3241,7 @@ xfs_btree_make_block_unfull( { int error = 0; - if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE && - level == cur->bc_nlevels - 1) { + if (xfs_btree_at_iroot(cur, level)) { struct xfs_inode *ip = cur->bc_ino.ip; if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) { @@ -3856,27 +3850,25 @@ xfs_btree_delrec( * Try to get rid of the next level down. If we can't then there's * nothing left to do. */ - if (level == cur->bc_nlevels - 1) { - if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) { - xfs_iroot_realloc(cur->bc_ino.ip, -1, - cur->bc_ino.whichfork); + if (xfs_btree_at_iroot(cur, level)) { + xfs_iroot_realloc(cur->bc_ino.ip, -1, cur->bc_ino.whichfork); - error = xfs_btree_kill_iroot(cur); - if (error) - goto error0; + error = xfs_btree_kill_iroot(cur); + if (error) + goto error0; - error = xfs_btree_dec_cursor(cur, level, stat); - if (error) - goto error0; - *stat = 1; - return 0; - } + error = xfs_btree_dec_cursor(cur, level, stat); + if (error) + goto error0; + *stat = 1; + return 0; + } - /* - * If this is the root level, and there's only one entry left, - * and it's NOT the leaf level, then we can get rid of this - * level. - */ + /* + * If this is the root level, and there's only one entry left, and it's + * NOT the leaf level, then we can get rid of this level. + */ + if (level == cur->bc_nlevels - 1) { if (numrecs == 1 && level > 0) { union xfs_btree_ptr *pp; /* diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 153d867259f3..07abc56e0395 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -747,4 +747,14 @@ void xfs_btree_destroy_cur_caches(void); int xfs_btree_goto_left_edge(struct xfs_btree_cur *cur); +/* Does this level of the cursor point to the inode root (and not a block)? 
*/ +static inline bool +xfs_btree_at_iroot( + const struct xfs_btree_cur *cur, + int level) +{ + return cur->bc_ops->type == XFS_BTREE_TYPE_INODE && + level == cur->bc_nlevels - 1; +} + #endif /* __XFS_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_btree_staging.c b/fs/xfs/libxfs/xfs_btree_staging.c index 0d2d7f2bea00..a0dbe9b9b1c6 100644 --- a/fs/xfs/libxfs/xfs_btree_staging.c +++ b/fs/xfs/libxfs/xfs_btree_staging.c @@ -398,8 +398,7 @@ xfs_btree_bload_prep_block( struct xfs_btree_block *new_block; int ret; - if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE && - level == cur->bc_nlevels - 1) { + if (xfs_btree_at_iroot(cur, level)) { struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur); size_t new_size; -- cgit v1.2.3 From 72c2070f3f52196a2e8b4efced94390b62eb8ac4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 22 Feb 2024 12:37:25 -0800 Subject: xfs: move comment about 2 keys per pointer in the rmap btree Move it to the relevant initialization of the ops structure instead of a place that has nothing to do with the key size. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_rmap_btree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index 62efcfaa4173..4fdbd6368a03 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -477,6 +477,7 @@ const struct xfs_btree_ops xfs_rmapbt_ops = { .geom_flags = XFS_BTGEO_OVERLAPPING, .rec_len = sizeof(struct xfs_rmap_rec), + /* Overlapping btree; 2 keys per pointer. */ .key_len = 2 * sizeof(struct xfs_rmap_key), .ptr_len = XFS_BTREE_SHORT_PTR_LEN, @@ -509,7 +510,6 @@ xfs_rmapbt_init_common( { struct xfs_btree_cur *cur; - /* Overlapping btree; 2 keys per pointer. */ cur = xfs_btree_alloc_cursor(mp, tp, XFS_BTNUM_RMAP, &xfs_rmapbt_ops, mp->m_rmap_maxlevels, xfs_rmapbt_cur_cache); cur->bc_ag.pag = xfs_perag_hold(pag); -- cgit v1.2.3 From f9c18129e57df7b33f4257340840525816481da6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 22 Feb 2024 12:37:26 -0800 Subject: xfs: add a xfs_btree_init_ptr_from_cur Inode-rooted btrees don't need to initialize the root pointer in the ->init_ptr_from_cur method as the root is found by the xfs_btree_get_iroot method later. Make ->init_ptr_from_cur optional for inode-rooted btrees by providing a helper that does the right thing for the given btree type and also documents the semantics. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J.
Wong --- fs/xfs/libxfs/xfs_bmap_btree.c | 9 --------- fs/xfs/libxfs/xfs_btree.c | 27 +++++++++++++++++++++++---- fs/xfs/libxfs/xfs_btree.h | 2 ++ fs/xfs/libxfs/xfs_btree_staging.c | 1 - fs/xfs/scrub/btree.c | 2 +- 5 files changed, 26 insertions(+), 15 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index 5dad3db4affa..726cb506bbbf 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -369,14 +369,6 @@ xfs_bmbt_init_rec_from_cur( xfs_bmbt_disk_set_all(&rec->bmbt, &cur->bc_rec.b); } -STATIC void -xfs_bmbt_init_ptr_from_cur( - struct xfs_btree_cur *cur, - union xfs_btree_ptr *ptr) -{ - ptr->l = 0; -} - STATIC int64_t xfs_bmbt_key_diff( struct xfs_btree_cur *cur, @@ -544,7 +536,6 @@ const struct xfs_btree_ops xfs_bmbt_ops = { .init_key_from_rec = xfs_bmbt_init_key_from_rec, .init_high_key_from_rec = xfs_bmbt_init_high_key_from_rec, .init_rec_from_cur = xfs_bmbt_init_rec_from_cur, - .init_ptr_from_cur = xfs_bmbt_init_ptr_from_cur, .key_diff = xfs_bmbt_key_diff, .diff_two_keys = xfs_bmbt_diff_two_keys, .buf_ops = &xfs_bmbt_buf_ops, diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 5d0b1084cba8..7c4f842c5f91 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -1881,6 +1881,25 @@ xfs_lookup_get_search_key( return xfs_btree_key_addr(cur, keyno, block); } +/* + * Initialize a pointer to the root block. + */ +void +xfs_btree_init_ptr_from_cur( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr) +{ + if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) { + /* + * Inode-rooted btrees call xfs_btree_get_iroot to find the root + * in xfs_btree_lookup_get_block and don't need a pointer here. + */ + ptr->l = 0; + } else { + cur->bc_ops->init_ptr_from_cur(cur, ptr); + } +} + /* * Lookup the record. The cursor is made to point to it, based on dir. * stat is set to 0 if can't find any such record, 1 for success. @@ -1911,7 +1930,7 @@ xfs_btree_lookup( keyno = 0; /* initialise start pointer from cursor */ - cur->bc_ops->init_ptr_from_cur(cur, &ptr); + xfs_btree_init_ptr_from_cur(cur, &ptr); pp = &ptr; /* @@ -3121,7 +3140,7 @@ xfs_btree_new_root( XFS_BTREE_STATS_INC(cur, newroot); /* initialise our start point from the cursor */ - cur->bc_ops->init_ptr_from_cur(cur, &rptr); + xfs_btree_init_ptr_from_cur(cur, &rptr); /* Allocate the new block. If we can't do it, we're toast. Give up. */ error = xfs_btree_alloc_block(cur, &rptr, &lptr, stat); @@ -4430,7 +4449,7 @@ xfs_btree_visit_blocks( struct xfs_btree_block *block = NULL; int error = 0; - cur->bc_ops->init_ptr_from_cur(cur, &lptr); + xfs_btree_init_ptr_from_cur(cur, &lptr); /* for each level */ for (level = cur->bc_nlevels - 1; level >= 0; level--) { @@ -4852,7 +4871,7 @@ xfs_btree_overlapped_query_range( /* Load the root of the btree. 
*/ level = cur->bc_nlevels - 1; - cur->bc_ops->init_ptr_from_cur(cur, &ptr); + xfs_btree_init_ptr_from_cur(cur, &ptr); error = xfs_btree_lookup_get_block(cur, level, &ptr, &block); if (error) return error; diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 07abc56e0395..99194ae94694 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -714,6 +714,8 @@ void xfs_btree_copy_ptrs(struct xfs_btree_cur *cur, void xfs_btree_copy_keys(struct xfs_btree_cur *cur, union xfs_btree_key *dst_key, const union xfs_btree_key *src_key, int numkeys); +void xfs_btree_init_ptr_from_cur(struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr); static inline struct xfs_btree_cur * xfs_btree_alloc_cursor( diff --git a/fs/xfs/libxfs/xfs_btree_staging.c b/fs/xfs/libxfs/xfs_btree_staging.c index a0dbe9b9b1c6..36d8c5841956 100644 --- a/fs/xfs/libxfs/xfs_btree_staging.c +++ b/fs/xfs/libxfs/xfs_btree_staging.c @@ -225,7 +225,6 @@ xfs_btree_stage_ifakeroot( memcpy(nops, cur->bc_ops, sizeof(struct xfs_btree_ops)); nops->alloc_block = xfs_btree_fakeroot_alloc_block; nops->free_block = xfs_btree_fakeroot_free_block; - nops->init_ptr_from_cur = xfs_btree_fakeroot_init_ptr_from_cur; nops->dup_cursor = xfs_btree_fakeroot_dup_cursor; cur->bc_ino.ifake = ifake; diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c index 71cfb2a45468..1ec3339755b9 100644 --- a/fs/xfs/scrub/btree.c +++ b/fs/xfs/scrub/btree.c @@ -733,7 +733,7 @@ xchk_btree( * error codes for us. */ level = cur->bc_nlevels - 1; - cur->bc_ops->init_ptr_from_cur(cur, &ptr); + xfs_btree_init_ptr_from_cur(cur, &ptr); if (!xchk_btree_ptr_ok(bs, cur->bc_nlevels, &ptr)) goto out; error = xchk_btree_get_block(bs, level, &ptr, &block, &bp); -- cgit v1.2.3 From 2b9e7f2668c540f18afd66a053ea78f3a629f8e2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 22 Feb 2024 12:37:35 -0800 Subject: xfs: don't override bc_ops for staging btrees Add a few conditionals for staging btrees to the core btree code instead of overloading the bc_ops vector. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_alloc_btree.c | 6 +- fs/xfs/libxfs/xfs_bmap_btree.c | 6 +- fs/xfs/libxfs/xfs_btree.c | 75 +++++++++++++++++++------ fs/xfs/libxfs/xfs_btree_staging.c | 109 +------------------------------------ fs/xfs/libxfs/xfs_btree_staging.h | 7 +-- fs/xfs/libxfs/xfs_ialloc_btree.c | 4 +- fs/xfs/libxfs/xfs_refcount_btree.c | 2 +- fs/xfs/libxfs/xfs_rmap_btree.c | 2 +- 8 files changed, 72 insertions(+), 139 deletions(-) diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index 7d9798535dba..75c66dca61eb 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -594,11 +594,7 @@ xfs_allocbt_commit_staged_btree( agf->agf_levels[cur->bc_btnum] = cpu_to_be32(afake->af_levels); xfs_alloc_log_agf(tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS); - if (cur->bc_btnum == XFS_BTNUM_BNO) { - xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_bnobt_ops); - } else { - xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_cntbt_ops); - } + xfs_btree_commit_afakeroot(cur, tp, agbp); } /* Calculate number of records in an alloc btree block. 
*/ diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index 726cb506bbbf..ec0b970157ae 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -609,7 +609,6 @@ xfs_bmbt_stage_cursor( struct xbtree_ifakeroot *ifake) { struct xfs_btree_cur *cur; - struct xfs_btree_ops *ops; /* data fork always has larger maxheight */ cur = xfs_bmbt_init_common(mp, NULL, ip, XFS_DATA_FORK); @@ -618,8 +617,7 @@ xfs_bmbt_stage_cursor( /* Don't let anyone think we're attached to the real fork yet. */ cur->bc_ino.whichfork = -1; - xfs_btree_stage_ifakeroot(cur, ifake, &ops); - ops->update_cursor = NULL; + xfs_btree_stage_ifakeroot(cur, ifake); return cur; } @@ -663,7 +661,7 @@ xfs_bmbt_commit_staged_btree( break; } xfs_trans_log_inode(tp, cur->bc_ino.ip, flags); - xfs_btree_commit_ifakeroot(cur, tp, whichfork, &xfs_bmbt_ops); + xfs_btree_commit_ifakeroot(cur, tp, whichfork); } /* diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 7c4f842c5f91..2649f24ed748 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -407,6 +407,15 @@ xfs_btree_free_block( trace_xfs_btree_free_block(cur, bp); + /* + * Don't allow block freeing for a staging cursor, because staging + * cursors do not support regular btree modifications. + */ + if (unlikely(cur->bc_flags & XFS_BTREE_STAGING)) { + ASSERT(0); + return -EFSCORRUPTED; + } + error = cur->bc_ops->free_block(cur, bp); if (!error) { xfs_trans_binval(cur->bc_tp, bp); @@ -458,8 +467,6 @@ xfs_btree_del_cursor( break; } - if (unlikely(cur->bc_flags & XFS_BTREE_STAGING)) - kfree(cur->bc_ops); kmem_cache_free(cur->bc_cache, cur); } @@ -467,20 +474,26 @@ xfs_btree_del_cursor( * Duplicate the btree cursor. * Allocate a new one, copy the record, re-get the buffers. */ -int /* error */ +int /* error */ xfs_btree_dup_cursor( - struct xfs_btree_cur *cur, /* input cursor */ - struct xfs_btree_cur **ncur) /* output cursor */ + struct xfs_btree_cur *cur, /* input cursor */ + struct xfs_btree_cur **ncur) /* output cursor */ { - struct xfs_buf *bp; /* btree block's buffer pointer */ - int error; /* error return value */ - int i; /* level number of btree block */ - xfs_mount_t *mp; /* mount structure for filesystem */ - struct xfs_btree_cur *new; /* new cursor value */ - xfs_trans_t *tp; /* transaction pointer, can be NULL */ + struct xfs_mount *mp = cur->bc_mp; + struct xfs_trans *tp = cur->bc_tp; + struct xfs_buf *bp; + struct xfs_btree_cur *new; + int error; + int i; - tp = cur->bc_tp; - mp = cur->bc_mp; + /* + * Don't allow staging cursors to be duplicated because they're supposed + * to be kept private to a single thread. + */ + if (unlikely(cur->bc_flags & XFS_BTREE_STAGING)) { + ASSERT(0); + return -EFSCORRUPTED; + } /* * Allocate a new cursor like the old one. @@ -1895,6 +1908,8 @@ xfs_btree_init_ptr_from_cur( * in xfs_btree_lookup_get_block and don't need a pointer here. */ ptr->l = 0; + } else if (cur->bc_flags & XFS_BTREE_STAGING) { + ptr->s = cpu_to_be32(cur->bc_ag.afake->af_root); } else { cur->bc_ops->init_ptr_from_cur(cur, ptr); } @@ -2716,6 +2731,18 @@ xfs_btree_alloc_block( { int error; + /* + * Don't allow block allocation for a staging cursor, because staging + * cursors do not support regular btree modifications. + * + * Bulk loading uses a separate callback to obtain new blocks from a + * preallocated list, which prevents ENOSPC failures during loading. 
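+ *
+ * During a bulk load, new blocks come from the bload claim_block
+ * callback instead (see the xrep_*_claim_block hooks used by the repair
+ * code later in this series), so a staging cursor never reaches this
+ * path in correct operation.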
+ */ + if (unlikely(cur->bc_flags & XFS_BTREE_STAGING)) { + ASSERT(0); + return -EFSCORRUPTED; + } + error = cur->bc_ops->alloc_block(cur, hint_block, new_block, stat); trace_xfs_btree_alloc_block(cur, new_block, *stat, error); return error; @@ -3116,6 +3143,21 @@ error0: return error; } +static void +xfs_btree_set_root( + struct xfs_btree_cur *cur, + const union xfs_btree_ptr *ptr, + int inc) +{ + if (cur->bc_flags & XFS_BTREE_STAGING) { + /* Update the btree root information for a per-AG fake root. */ + cur->bc_ag.afake->af_root = be32_to_cpu(ptr->s); + cur->bc_ag.afake->af_levels += inc; + } else { + cur->bc_ops->set_root(cur, ptr, inc); + } +} + /* * Allocate a new root block, fill it in. */ @@ -3156,7 +3198,7 @@ xfs_btree_new_root( goto error0; /* Set the root in the holding structure increasing the level by 1. */ - cur->bc_ops->set_root(cur, &lptr, 1); + xfs_btree_set_root(cur, &lptr, 1); /* * At the previous root level there are now two blocks: the old root, @@ -3584,7 +3626,8 @@ xfs_btree_insert( if (pcur != cur && (ncur || xfs_btree_ptr_is_null(cur, &nptr))) { /* Save the state from the cursor before we trash it */ - if (cur->bc_ops->update_cursor) + if (cur->bc_ops->update_cursor && + !(cur->bc_flags & XFS_BTREE_STAGING)) cur->bc_ops->update_cursor(pcur, cur); cur->bc_nlevels = pcur->bc_nlevels; xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR); @@ -3727,7 +3770,7 @@ xfs_btree_kill_root( * Update the root pointer, decreasing the level by 1 and then * free the old root. */ - cur->bc_ops->set_root(cur, newroot, -1); + xfs_btree_set_root(cur, newroot, -1); error = xfs_btree_free_block(cur, bp); if (error) diff --git a/fs/xfs/libxfs/xfs_btree_staging.c b/fs/xfs/libxfs/xfs_btree_staging.c index 36d8c5841956..185d6c3bbc02 100644 --- a/fs/xfs/libxfs/xfs_btree_staging.c +++ b/fs/xfs/libxfs/xfs_btree_staging.c @@ -38,63 +38,6 @@ * specific btree type to commit the new btree into the filesystem. */ -/* - * Don't allow staging cursors to be duplicated because they're supposed to be - * kept private to a single thread. - */ -STATIC struct xfs_btree_cur * -xfs_btree_fakeroot_dup_cursor( - struct xfs_btree_cur *cur) -{ - ASSERT(0); - return NULL; -} - -/* - * Don't allow block allocation for a staging cursor, because staging cursors - * do not support regular btree modifications. - * - * Bulk loading uses a separate callback to obtain new blocks from a - * preallocated list, which prevents ENOSPC failures during loading. - */ -STATIC int -xfs_btree_fakeroot_alloc_block( - struct xfs_btree_cur *cur, - const union xfs_btree_ptr *start_bno, - union xfs_btree_ptr *new_bno, - int *stat) -{ - ASSERT(0); - return -EFSCORRUPTED; -} - -/* - * Don't allow block freeing for a staging cursor, because staging cursors - * do not support regular btree modifications. - */ -STATIC int -xfs_btree_fakeroot_free_block( - struct xfs_btree_cur *cur, - struct xfs_buf *bp) -{ - ASSERT(0); - return -EFSCORRUPTED; -} - -/* Initialize a pointer to the root block from the fakeroot. */ -STATIC void -xfs_btree_fakeroot_init_ptr_from_cur( - struct xfs_btree_cur *cur, - union xfs_btree_ptr *ptr) -{ - struct xbtree_afakeroot *afake; - - ASSERT(cur->bc_flags & XFS_BTREE_STAGING); - - afake = cur->bc_ag.afake; - ptr->s = cpu_to_be32(afake->af_root); -} - /* * Bulk Loading for AG Btrees * ========================== @@ -109,47 +52,20 @@ xfs_btree_fakeroot_init_ptr_from_cur( * cursor into a regular btree cursor. */ -/* Update the btree root information for a per-AG fake root. 
*/ -STATIC void -xfs_btree_afakeroot_set_root( - struct xfs_btree_cur *cur, - const union xfs_btree_ptr *ptr, - int inc) -{ - struct xbtree_afakeroot *afake = cur->bc_ag.afake; - - ASSERT(cur->bc_flags & XFS_BTREE_STAGING); - afake->af_root = be32_to_cpu(ptr->s); - afake->af_levels += inc; -} - /* * Initialize a AG-rooted btree cursor with the given AG btree fake root. - * The btree cursor's bc_ops will be overridden as needed to make the staging - * functionality work. */ void xfs_btree_stage_afakeroot( struct xfs_btree_cur *cur, struct xbtree_afakeroot *afake) { - struct xfs_btree_ops *nops; - ASSERT(!(cur->bc_flags & XFS_BTREE_STAGING)); ASSERT(cur->bc_ops->type != XFS_BTREE_TYPE_INODE); ASSERT(cur->bc_tp == NULL); - nops = kmalloc(sizeof(struct xfs_btree_ops), GFP_KERNEL | __GFP_NOFAIL); - memcpy(nops, cur->bc_ops, sizeof(struct xfs_btree_ops)); - nops->alloc_block = xfs_btree_fakeroot_alloc_block; - nops->free_block = xfs_btree_fakeroot_free_block; - nops->init_ptr_from_cur = xfs_btree_fakeroot_init_ptr_from_cur; - nops->set_root = xfs_btree_afakeroot_set_root; - nops->dup_cursor = xfs_btree_fakeroot_dup_cursor; - cur->bc_ag.afake = afake; cur->bc_nlevels = afake->af_levels; - cur->bc_ops = nops; cur->bc_flags |= XFS_BTREE_STAGING; } @@ -163,18 +79,15 @@ void xfs_btree_commit_afakeroot( struct xfs_btree_cur *cur, struct xfs_trans *tp, - struct xfs_buf *agbp, - const struct xfs_btree_ops *ops) + struct xfs_buf *agbp) { ASSERT(cur->bc_flags & XFS_BTREE_STAGING); ASSERT(cur->bc_tp == NULL); trace_xfs_btree_commit_afakeroot(cur); - kfree((void *)cur->bc_ops); cur->bc_ag.afake = NULL; cur->bc_ag.agbp = agbp; - cur->bc_ops = ops; cur->bc_flags &= ~XFS_BTREE_STAGING; cur->bc_tp = tp; } @@ -212,28 +125,15 @@ xfs_btree_commit_afakeroot( void xfs_btree_stage_ifakeroot( struct xfs_btree_cur *cur, - struct xbtree_ifakeroot *ifake, - struct xfs_btree_ops **new_ops) + struct xbtree_ifakeroot *ifake) { - struct xfs_btree_ops *nops; - ASSERT(!(cur->bc_flags & XFS_BTREE_STAGING)); ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE); ASSERT(cur->bc_tp == NULL); - nops = kmalloc(sizeof(struct xfs_btree_ops), GFP_KERNEL | __GFP_NOFAIL); - memcpy(nops, cur->bc_ops, sizeof(struct xfs_btree_ops)); - nops->alloc_block = xfs_btree_fakeroot_alloc_block; - nops->free_block = xfs_btree_fakeroot_free_block; - nops->dup_cursor = xfs_btree_fakeroot_dup_cursor; - cur->bc_ino.ifake = ifake; cur->bc_nlevels = ifake->if_levels; - cur->bc_ops = nops; cur->bc_flags |= XFS_BTREE_STAGING; - - if (new_ops) - *new_ops = nops; } /* @@ -246,18 +146,15 @@ void xfs_btree_commit_ifakeroot( struct xfs_btree_cur *cur, struct xfs_trans *tp, - int whichfork, - const struct xfs_btree_ops *ops) + int whichfork) { ASSERT(cur->bc_flags & XFS_BTREE_STAGING); ASSERT(cur->bc_tp == NULL); trace_xfs_btree_commit_ifakeroot(cur); - kfree((void *)cur->bc_ops); cur->bc_ino.ifake = NULL; cur->bc_ino.whichfork = whichfork; - cur->bc_ops = ops; cur->bc_flags &= ~XFS_BTREE_STAGING; cur->bc_tp = tp; } diff --git a/fs/xfs/libxfs/xfs_btree_staging.h b/fs/xfs/libxfs/xfs_btree_staging.h index 8e29cd3cc0f1..0c9c2ffb127a 100644 --- a/fs/xfs/libxfs/xfs_btree_staging.h +++ b/fs/xfs/libxfs/xfs_btree_staging.h @@ -22,7 +22,7 @@ struct xbtree_afakeroot { void xfs_btree_stage_afakeroot(struct xfs_btree_cur *cur, struct xbtree_afakeroot *afake); void xfs_btree_commit_afakeroot(struct xfs_btree_cur *cur, struct xfs_trans *tp, - struct xfs_buf *agbp, const struct xfs_btree_ops *ops); + struct xfs_buf *agbp); /* Fake root for an inode-rooted btree. 
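 *
 * Repair stages a replacement btree against this fake root and, once the
 * bulk load succeeds, makes it live with xfs_btree_commit_ifakeroot().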
*/ struct xbtree_ifakeroot { @@ -41,10 +41,9 @@ struct xbtree_ifakeroot { /* Cursor interactions with fake roots for inode-rooted btrees. */ void xfs_btree_stage_ifakeroot(struct xfs_btree_cur *cur, - struct xbtree_ifakeroot *ifake, - struct xfs_btree_ops **new_ops); + struct xbtree_ifakeroot *ifake); void xfs_btree_commit_ifakeroot(struct xfs_btree_cur *cur, struct xfs_trans *tp, - int whichfork, const struct xfs_btree_ops *ops); + int whichfork); /* Bulk loading of staged btrees. */ typedef int (*xfs_btree_bload_get_records_fn)(struct xfs_btree_cur *cur, diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index fc584424ebdf..0d04ac32367d 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -537,7 +537,7 @@ xfs_inobt_commit_staged_btree( fields |= XFS_AGI_IBLOCKS; } xfs_ialloc_log_agi(tp, agbp, fields); - xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_inobt_ops); + xfs_btree_commit_afakeroot(cur, tp, agbp); } else { fields = XFS_AGI_FREE_ROOT | XFS_AGI_FREE_LEVEL; agi->agi_free_root = cpu_to_be32(afake->af_root); @@ -547,7 +547,7 @@ xfs_inobt_commit_staged_btree( fields |= XFS_AGI_IBLOCKS; } xfs_ialloc_log_agi(tp, agbp, fields); - xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_finobt_ops); + xfs_btree_commit_afakeroot(cur, tp, agbp); } } diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c index 2eb94f18ff33..966e87db2403 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.c +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -419,7 +419,7 @@ xfs_refcountbt_commit_staged_btree( xfs_alloc_log_agf(tp, agbp, XFS_AGF_REFCOUNT_BLOCKS | XFS_AGF_REFCOUNT_ROOT | XFS_AGF_REFCOUNT_LEVEL); - xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_refcountbt_ops); + xfs_btree_commit_afakeroot(cur, tp, agbp); } /* Calculate number of records in a refcount btree block. */ diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index 4fdbd6368a03..84eb767425cb 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -567,7 +567,7 @@ xfs_rmapbt_commit_staged_btree( agf->agf_rmap_blocks = cpu_to_be32(afake->af_blocks); xfs_alloc_log_agf(tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS | XFS_AGF_RMAP_BLOCKS); - xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_rmapbt_ops); + xfs_btree_commit_afakeroot(cur, tp, agbp); } /* Calculate number of records in a reverse mapping btree block. */ -- cgit v1.2.3 From fb518f8eeb90197624b21a3429e57b6a65bff7bb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 22 Feb 2024 12:39:36 -0800 Subject: xfs: fold xfs_allocbt_init_common into xfs_allocbt_init_cursor Make the levels initialization in xfs_allocbt_init_cursor conditional and merge the two helpers. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_alloc_btree.c | 42 +++++++++++++++-------------------------- 1 file changed, 15 insertions(+), 27 deletions(-) diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index 75c66dca61eb..847674658d67 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -513,11 +513,16 @@ const struct xfs_btree_ops xfs_cntbt_ops = { .keys_contiguous = NULL, /* not needed right now */ }; -/* Allocate most of a new allocation btree cursor. */ -STATIC struct xfs_btree_cur * -xfs_allocbt_init_common( +/* + * Allocate a new allocation btree cursor. + * + * For staging cursors tp and agbp are NULL. 
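+ *
+ * A minimal staging sketch, mirroring the repair callers later in this
+ * series (afake here is a caller-owned struct xbtree_afakeroot):
+ *
+ *	cur = xfs_allocbt_init_cursor(mp, NULL, NULL, pag, XFS_BTNUM_BNO);
+ *	xfs_btree_stage_afakeroot(cur, &afake);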
+ */ +struct xfs_btree_cur * +xfs_allocbt_init_cursor( struct xfs_mount *mp, struct xfs_trans *tp, + struct xfs_buf *agbp, struct xfs_perag *pag, xfs_btnum_t btnum) { @@ -532,31 +537,14 @@ xfs_allocbt_init_common( cur = xfs_btree_alloc_cursor(mp, tp, btnum, ops, mp->m_alloc_maxlevels, xfs_allocbt_cur_cache); cur->bc_ag.pag = xfs_perag_hold(pag); - return cur; -} - -/* - * Allocate a new allocation btree cursor. - */ -struct xfs_btree_cur * /* new alloc btree cursor */ -xfs_allocbt_init_cursor( - struct xfs_mount *mp, /* file system mount point */ - struct xfs_trans *tp, /* transaction pointer */ - struct xfs_buf *agbp, /* buffer for agf structure */ - struct xfs_perag *pag, - xfs_btnum_t btnum) /* btree identifier */ -{ - struct xfs_agf *agf = agbp->b_addr; - struct xfs_btree_cur *cur; - - cur = xfs_allocbt_init_common(mp, tp, pag, btnum); - if (btnum == XFS_BTNUM_CNT) - cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]); - else - cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]); - cur->bc_ag.agbp = agbp; + if (agbp) { + struct xfs_agf *agf = agbp->b_addr; + cur->bc_nlevels = (btnum == XFS_BTNUM_BNO) ? + be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) : + be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]); + } return cur; } @@ -570,7 +558,7 @@ xfs_allocbt_stage_cursor( { struct xfs_btree_cur *cur; - cur = xfs_allocbt_init_common(mp, NULL, pag, btnum); + cur = xfs_allocbt_init_cursor(mp, NULL, NULL, pag, btnum); xfs_btree_stage_afakeroot(cur, afake); return cur; } -- cgit v1.2.3 From 91796b2eef8bd725873bec326a7be830a68a11ff Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 22 Feb 2024 12:39:37 -0800 Subject: xfs: remove xfs_allocbt_stage_cursor Just open code the two calls in the callers. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_alloc_btree.c | 15 --------------- fs/xfs/libxfs/xfs_alloc_btree.h | 3 --- fs/xfs/scrub/alloc_repair.c | 11 +++++++---- 3 files changed, 7 insertions(+), 22 deletions(-) diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index 847674658d67..99859803bb0b 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -548,21 +548,6 @@ xfs_allocbt_init_cursor( return cur; } -/* Create a free space btree cursor with a fake root for staging. */ -struct xfs_btree_cur * -xfs_allocbt_stage_cursor( - struct xfs_mount *mp, - struct xbtree_afakeroot *afake, - struct xfs_perag *pag, - xfs_btnum_t btnum) -{ - struct xfs_btree_cur *cur; - - cur = xfs_allocbt_init_cursor(mp, NULL, NULL, pag, btnum); - xfs_btree_stage_afakeroot(cur, afake); - return cur; -} - /* * Install a new free space btree root. Caller is responsible for invalidating * and freeing the old btree blocks. 
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.h b/fs/xfs/libxfs/xfs_alloc_btree.h index 45df893ef6bb..1c910862535f 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.h +++ b/fs/xfs/libxfs/xfs_alloc_btree.h @@ -50,9 +50,6 @@ struct xbtree_afakeroot; extern struct xfs_btree_cur *xfs_allocbt_init_cursor(struct xfs_mount *mp, struct xfs_trans *tp, struct xfs_buf *bp, struct xfs_perag *pag, xfs_btnum_t btnum); -struct xfs_btree_cur *xfs_allocbt_stage_cursor(struct xfs_mount *mp, - struct xbtree_afakeroot *afake, struct xfs_perag *pag, - xfs_btnum_t btnum); extern int xfs_allocbt_maxrecs(struct xfs_mount *, int, int); extern xfs_extlen_t xfs_allocbt_calc_size(struct xfs_mount *mp, unsigned long long len); diff --git a/fs/xfs/scrub/alloc_repair.c b/fs/xfs/scrub/alloc_repair.c index 45edda096869..544c53d450ce 100644 --- a/fs/xfs/scrub/alloc_repair.c +++ b/fs/xfs/scrub/alloc_repair.c @@ -735,10 +735,13 @@ xrep_abt_build_new_trees( ra->new_cntbt.bload.claim_block = xrep_abt_claim_block; /* Allocate cursors for the staged btrees. */ - bno_cur = xfs_allocbt_stage_cursor(sc->mp, &ra->new_bnobt.afake, - pag, XFS_BTNUM_BNO); - cnt_cur = xfs_allocbt_stage_cursor(sc->mp, &ra->new_cntbt.afake, - pag, XFS_BTNUM_CNT); + bno_cur = xfs_allocbt_init_cursor(sc->mp, NULL, NULL, pag, + XFS_BTNUM_BNO); + xfs_btree_stage_afakeroot(bno_cur, &ra->new_bnobt.afake); + + cnt_cur = xfs_allocbt_init_cursor(sc->mp, NULL, NULL, pag, + XFS_BTNUM_CNT); + xfs_btree_stage_afakeroot(cnt_cur, &ra->new_cntbt.afake); /* Last chance to abort before we start committing fixes. */ if (xchk_should_terminate(sc, &error)) -- cgit v1.2.3 From f6c98d921a9e5b753ac1a35d540a6487ee111a33 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 22 Feb 2024 12:39:38 -0800 Subject: xfs: fold xfs_inobt_init_common into xfs_inobt_init_cursor Make the levels initialization in xfs_inobt_init_cursor conditional and merge the two helpers. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_ialloc_btree.c | 39 +++++++++++++++------------------------ 1 file changed, 15 insertions(+), 24 deletions(-) diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index 0d04ac32367d..48bfea0e2a20 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -455,12 +455,15 @@ const struct xfs_btree_ops xfs_finobt_ops = { }; /* - * Initialize a new inode btree cursor. + * Create an inode btree cursor. + * + * For staging cursors tp and agbp are NULL. */ -static struct xfs_btree_cur * -xfs_inobt_init_common( +struct xfs_btree_cur * +xfs_inobt_init_cursor( struct xfs_perag *pag, - struct xfs_trans *tp, /* transaction pointer */ + struct xfs_trans *tp, + struct xfs_buf *agbp, xfs_btnum_t btnum) /* ialloc or free ino btree */ { struct xfs_mount *mp = pag->pag_mount; @@ -475,26 +478,14 @@ xfs_inobt_init_common( cur = xfs_btree_alloc_cursor(mp, tp, btnum, ops, M_IGEO(mp)->inobt_maxlevels, xfs_inobt_cur_cache); cur->bc_ag.pag = xfs_perag_hold(pag); - return cur; -} - -/* Create an inode btree cursor. 
*/ -struct xfs_btree_cur * -xfs_inobt_init_cursor( - struct xfs_perag *pag, - struct xfs_trans *tp, - struct xfs_buf *agbp, - xfs_btnum_t btnum) -{ - struct xfs_btree_cur *cur; - struct xfs_agi *agi = agbp->b_addr; - - cur = xfs_inobt_init_common(pag, tp, btnum); - if (btnum == XFS_BTNUM_INO) - cur->bc_nlevels = be32_to_cpu(agi->agi_level); - else - cur->bc_nlevels = be32_to_cpu(agi->agi_free_level); cur->bc_ag.agbp = agbp; + if (agbp) { + struct xfs_agi *agi = agbp->b_addr; + + cur->bc_nlevels = (btnum == XFS_BTNUM_INO) ? + be32_to_cpu(agi->agi_level) : + be32_to_cpu(agi->agi_free_level); + } return cur; } @@ -507,7 +498,7 @@ xfs_inobt_stage_cursor( { struct xfs_btree_cur *cur; - cur = xfs_inobt_init_common(pag, NULL, btnum); + cur = xfs_inobt_init_cursor(pag, NULL, NULL, btnum); xfs_btree_stage_afakeroot(cur, afake); return cur; } -- cgit v1.2.3 From 6234dee7e6f58676379f3a2d8b0629a6e9a427fd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 22 Feb 2024 12:39:39 -0800 Subject: xfs: remove xfs_inobt_stage_cursor Just open code the two calls in the callers. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_ialloc_btree.c | 14 -------------- fs/xfs/libxfs/xfs_ialloc_btree.h | 2 -- fs/xfs/scrub/ialloc_repair.c | 9 +++++---- 3 files changed, 5 insertions(+), 20 deletions(-) diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index 48bfea0e2a20..b45a2e581313 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -489,20 +489,6 @@ xfs_inobt_init_cursor( return cur; } -/* Create an inode btree cursor with a fake root for staging. */ -struct xfs_btree_cur * -xfs_inobt_stage_cursor( - struct xfs_perag *pag, - struct xbtree_afakeroot *afake, - xfs_btnum_t btnum) -{ - struct xfs_btree_cur *cur; - - cur = xfs_inobt_init_cursor(pag, NULL, NULL, btnum); - xfs_btree_stage_afakeroot(cur, afake); - return cur; -} - /* * Install a new inobt btree root. Caller is responsible for invalidating * and freeing the old btree blocks. 
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h index 3262c3fe5ebe..40f0fc0e8da3 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.h +++ b/fs/xfs/libxfs/xfs_ialloc_btree.h @@ -48,8 +48,6 @@ struct xfs_perag; extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_perag *pag, struct xfs_trans *tp, struct xfs_buf *agbp, xfs_btnum_t btnum); -struct xfs_btree_cur *xfs_inobt_stage_cursor(struct xfs_perag *pag, - struct xbtree_afakeroot *afake, xfs_btnum_t btnum); extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int); /* ir_holemask to inode allocation bitmap conversion */ diff --git a/fs/xfs/scrub/ialloc_repair.c b/fs/xfs/scrub/ialloc_repair.c index e94f10800082..04e186d8c738 100644 --- a/fs/xfs/scrub/ialloc_repair.c +++ b/fs/xfs/scrub/ialloc_repair.c @@ -663,8 +663,8 @@ xrep_ibt_build_new_trees( ri->new_inobt.bload.claim_block = xrep_ibt_claim_block; ri->new_inobt.bload.get_records = xrep_ibt_get_records; - ino_cur = xfs_inobt_stage_cursor(sc->sa.pag, &ri->new_inobt.afake, - XFS_BTNUM_INO); + ino_cur = xfs_inobt_init_cursor(sc->sa.pag, NULL, NULL, XFS_BTNUM_INO); + xfs_btree_stage_afakeroot(ino_cur, &ri->new_inobt.afake); error = xfs_btree_bload_compute_geometry(ino_cur, &ri->new_inobt.bload, xfarray_length(ri->inode_records)); if (error) @@ -684,8 +684,9 @@ xrep_ibt_build_new_trees( ri->new_finobt.bload.claim_block = xrep_fibt_claim_block; ri->new_finobt.bload.get_records = xrep_fibt_get_records; - fino_cur = xfs_inobt_stage_cursor(sc->sa.pag, - &ri->new_finobt.afake, XFS_BTNUM_FINO); + fino_cur = xfs_inobt_init_cursor(sc->sa.pag, NULL, NULL, + XFS_BTNUM_FINO); + xfs_btree_stage_afakeroot(fino_cur, &ri->new_finobt.afake); error = xfs_btree_bload_compute_geometry(fino_cur, &ri->new_finobt.bload, ri->finobt_recs); if (error) -- cgit v1.2.3 From 4f2dc69e4bcb4b3bfaea0a96ac6424b0ed998172 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 22 Feb 2024 12:39:39 -0800 Subject: xfs: fold xfs_refcountbt_init_common into xfs_refcountbt_init_cursor Make the levels initialization in xfs_refcountbt_init_cursor conditional and merge the two helpers. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_refcount_btree.c | 32 ++++++++++++-------------------- 1 file changed, 12 insertions(+), 20 deletions(-) diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c index 966e87db2403..57710856ae34 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.c +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -346,12 +346,15 @@ const struct xfs_btree_ops xfs_refcountbt_ops = { }; /* - * Initialize a new refcount btree cursor. + * Create a new refcount btree cursor. + * + * For staging cursors tp and agbp are NULL. */ -static struct xfs_btree_cur * -xfs_refcountbt_init_common( +struct xfs_btree_cur * +xfs_refcountbt_init_cursor( struct xfs_mount *mp, struct xfs_trans *tp, + struct xfs_buf *agbp, struct xfs_perag *pag) { struct xfs_btree_cur *cur; @@ -364,23 +367,12 @@ xfs_refcountbt_init_common( cur->bc_ag.pag = xfs_perag_hold(pag); cur->bc_refc.nr_ops = 0; cur->bc_refc.shape_changes = 0; - return cur; -} - -/* Create a btree cursor. 
*/ -struct xfs_btree_cur * -xfs_refcountbt_init_cursor( - struct xfs_mount *mp, - struct xfs_trans *tp, - struct xfs_buf *agbp, - struct xfs_perag *pag) -{ - struct xfs_agf *agf = agbp->b_addr; - struct xfs_btree_cur *cur; - - cur = xfs_refcountbt_init_common(mp, tp, pag); - cur->bc_nlevels = be32_to_cpu(agf->agf_refcount_level); cur->bc_ag.agbp = agbp; + if (agbp) { + struct xfs_agf *agf = agbp->b_addr; + + cur->bc_nlevels = be32_to_cpu(agf->agf_refcount_level); + } return cur; } @@ -393,7 +385,7 @@ xfs_refcountbt_stage_cursor( { struct xfs_btree_cur *cur; - cur = xfs_refcountbt_init_common(mp, NULL, pag); + cur = xfs_refcountbt_init_cursor(mp, NULL, NULL, pag); xfs_btree_stage_afakeroot(cur, afake); return cur; } -- cgit v1.2.3 From a5c2194406f322e91b90fb813128541a9b4fed6a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 22 Feb 2024 12:39:40 -0800 Subject: xfs: remove xfs_refcountbt_stage_cursor Just open code the two calls in the callers. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_refcount_btree.c | 14 -------------- fs/xfs/libxfs/xfs_refcount_btree.h | 2 -- fs/xfs/scrub/refcount_repair.c | 4 ++-- 3 files changed, 2 insertions(+), 18 deletions(-) diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c index 57710856ae34..4dcf6295e683 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.c +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -376,20 +376,6 @@ xfs_refcountbt_init_cursor( return cur; } -/* Create a btree cursor with a fake root for staging. */ -struct xfs_btree_cur * -xfs_refcountbt_stage_cursor( - struct xfs_mount *mp, - struct xbtree_afakeroot *afake, - struct xfs_perag *pag) -{ - struct xfs_btree_cur *cur; - - cur = xfs_refcountbt_init_cursor(mp, NULL, NULL, pag); - xfs_btree_stage_afakeroot(cur, afake); - return cur; -} - /* * Swap in the new btree root. Once we pass this point the newly rebuilt btree * is in place and we have to kill off all the old btree blocks. diff --git a/fs/xfs/libxfs/xfs_refcount_btree.h b/fs/xfs/libxfs/xfs_refcount_btree.h index d66b37259bed..1e0ab25f6c68 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.h +++ b/fs/xfs/libxfs/xfs_refcount_btree.h @@ -48,8 +48,6 @@ struct xbtree_afakeroot; extern struct xfs_btree_cur *xfs_refcountbt_init_cursor(struct xfs_mount *mp, struct xfs_trans *tp, struct xfs_buf *agbp, struct xfs_perag *pag); -struct xfs_btree_cur *xfs_refcountbt_stage_cursor(struct xfs_mount *mp, - struct xbtree_afakeroot *afake, struct xfs_perag *pag); extern int xfs_refcountbt_maxrecs(int blocklen, bool leaf); extern void xfs_refcountbt_compute_maxlevels(struct xfs_mount *mp); diff --git a/fs/xfs/scrub/refcount_repair.c b/fs/xfs/scrub/refcount_repair.c index 9c39af03ee1d..8240c993061b 100644 --- a/fs/xfs/scrub/refcount_repair.c +++ b/fs/xfs/scrub/refcount_repair.c @@ -658,8 +658,8 @@ xrep_refc_build_new_tree( rr->new_btree.bload.claim_block = xrep_refc_claim_block; /* Compute how many blocks we'll need. 
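 * (xfs_btree_bload_compute_geometry() sizes the new btree from the count
 * of refcount records gathered during the scrub scan.)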
*/ - refc_cur = xfs_refcountbt_stage_cursor(sc->mp, &rr->new_btree.afake, - pag); + refc_cur = xfs_refcountbt_init_cursor(sc->mp, NULL, NULL, pag); + xfs_btree_stage_afakeroot(refc_cur, &rr->new_btree.afake); error = xfs_btree_bload_compute_geometry(refc_cur, &rr->new_btree.bload, xfarray_length(rr->refcount_records)); -- cgit v1.2.3 From c49a4b2f0ef0ac5daee5c2a3cfd2b537345c34eb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 22 Feb 2024 12:39:41 -0800 Subject: xfs: fold xfs_rmapbt_init_common into xfs_rmapbt_init_cursor Make the levels initialization in xfs_rmapbt_init_cursor conditional and merge the two helpers. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_rmap_btree.c | 33 ++++++++++++++------------------- 1 file changed, 14 insertions(+), 19 deletions(-) diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index 84eb767425cb..dda9f8844d84 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -502,21 +502,11 @@ const struct xfs_btree_ops xfs_rmapbt_ops = { .keys_contiguous = xfs_rmapbt_keys_contiguous, }; -static struct xfs_btree_cur * -xfs_rmapbt_init_common( - struct xfs_mount *mp, - struct xfs_trans *tp, - struct xfs_perag *pag) -{ - struct xfs_btree_cur *cur; - - cur = xfs_btree_alloc_cursor(mp, tp, XFS_BTNUM_RMAP, &xfs_rmapbt_ops, - mp->m_rmap_maxlevels, xfs_rmapbt_cur_cache); - cur->bc_ag.pag = xfs_perag_hold(pag); - return cur; -} - -/* Create a new reverse mapping btree cursor. */ +/* + * Create a new reverse mapping btree cursor. + * + * For staging cursors tp and agbp are NULL. + */ struct xfs_btree_cur * xfs_rmapbt_init_cursor( struct xfs_mount *mp, @@ -524,12 +514,17 @@ xfs_rmapbt_init_cursor( struct xfs_buf *agbp, struct xfs_perag *pag) { - struct xfs_agf *agf = agbp->b_addr; struct xfs_btree_cur *cur; - cur = xfs_rmapbt_init_common(mp, tp, pag); - cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]); + cur = xfs_btree_alloc_cursor(mp, tp, XFS_BTNUM_RMAP, &xfs_rmapbt_ops, + mp->m_rmap_maxlevels, xfs_rmapbt_cur_cache); + cur->bc_ag.pag = xfs_perag_hold(pag); cur->bc_ag.agbp = agbp; + if (agbp) { + struct xfs_agf *agf = agbp->b_addr; + + cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]); + } return cur; } @@ -542,7 +537,7 @@ xfs_rmapbt_stage_cursor( { struct xfs_btree_cur *cur; - cur = xfs_rmapbt_init_common(mp, NULL, pag); + cur = xfs_rmapbt_init_cursor(mp, NULL, NULL, pag); xfs_btree_stage_afakeroot(cur, afake); return cur; } -- cgit v1.2.3 From 1317813290be04bc37196c4adf457712238c7faa Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 22 Feb 2024 12:39:42 -0800 Subject: xfs: remove xfs_rmapbt_stage_cursor xfs_rmapbt_stage_cursor is currently unused, but future callers can trivially open code the two calls. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_rmap_btree.c | 14 -------------- fs/xfs/libxfs/xfs_rmap_btree.h | 2 -- 2 files changed, 16 deletions(-) diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index dda9f8844d84..b0da31f49ca8 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -528,20 +528,6 @@ xfs_rmapbt_init_cursor( return cur; } -/* Create a new reverse mapping btree cursor with a fake root for staging. 
*/ -struct xfs_btree_cur * -xfs_rmapbt_stage_cursor( - struct xfs_mount *mp, - struct xbtree_afakeroot *afake, - struct xfs_perag *pag) -{ - struct xfs_btree_cur *cur; - - cur = xfs_rmapbt_init_cursor(mp, NULL, NULL, pag); - xfs_btree_stage_afakeroot(cur, afake); - return cur; -} - /* * Install a new reverse mapping btree root. Caller is responsible for * invalidating and freeing the old btree blocks. diff --git a/fs/xfs/libxfs/xfs_rmap_btree.h b/fs/xfs/libxfs/xfs_rmap_btree.h index 3244715dd111..27536d7e14aa 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.h +++ b/fs/xfs/libxfs/xfs_rmap_btree.h @@ -44,8 +44,6 @@ struct xbtree_afakeroot; struct xfs_btree_cur *xfs_rmapbt_init_cursor(struct xfs_mount *mp, struct xfs_trans *tp, struct xfs_buf *bp, struct xfs_perag *pag); -struct xfs_btree_cur *xfs_rmapbt_stage_cursor(struct xfs_mount *mp, - struct xbtree_afakeroot *afake, struct xfs_perag *pag); void xfs_rmapbt_commit_staged_btree(struct xfs_btree_cur *cur, struct xfs_trans *tp, struct xfs_buf *agbp); int xfs_rmapbt_maxrecs(int blocklen, int leaf); -- cgit v1.2.3 From 579d7022d1afea8f4475d1750224ec0b652febee Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 22 Feb 2024 12:39:43 -0800 Subject: xfs: make full use of xfs_btree_stage_ifakeroot in xfs_bmbt_stage_cursor Remove the duplicate cur->bc_nlevels assignment in xfs_bmbt_stage_cursor, and move the cur->bc_ino.forksize assignment into xfs_btree_stage_ifakeroot as it is part of setting up the fake btree root. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_bmap_btree.c | 2 -- fs/xfs/libxfs/xfs_btree_staging.c | 1 + 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index ec0b970157ae..3b6f14196c8c 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -612,8 +612,6 @@ xfs_bmbt_stage_cursor( /* data fork always has larger maxheight */ cur = xfs_bmbt_init_common(mp, NULL, ip, XFS_DATA_FORK); - cur->bc_nlevels = ifake->if_levels; - cur->bc_ino.forksize = ifake->if_fork_size; /* Don't let anyone think we're attached to the real fork yet. */ cur->bc_ino.whichfork = -1; diff --git a/fs/xfs/libxfs/xfs_btree_staging.c b/fs/xfs/libxfs/xfs_btree_staging.c index 185d6c3bbc02..d721719f759a 100644 --- a/fs/xfs/libxfs/xfs_btree_staging.c +++ b/fs/xfs/libxfs/xfs_btree_staging.c @@ -133,6 +133,7 @@ xfs_btree_stage_ifakeroot( cur->bc_ino.ifake = ifake; cur->bc_nlevels = ifake->if_levels; + cur->bc_ino.forksize = ifake->if_fork_size; cur->bc_flags |= XFS_BTREE_STAGING; } -- cgit v1.2.3 From 42e357c806c8c0ffb9c5c2faa4ad034bfe950d77 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 22 Feb 2024 12:39:43 -0800 Subject: xfs: make staging file forks explicit Don't open-code "-1" for whichfork when we're creating a staging btree for a repair; let's define an actual symbol to make grepping and understanding easier. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_bmap_btree.c | 2 +- fs/xfs/libxfs/xfs_types.h | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index 3b6f14196c8c..7381e507b32b 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -614,7 +614,7 @@ xfs_bmbt_stage_cursor( cur = xfs_bmbt_init_common(mp, NULL, ip, XFS_DATA_FORK); /* Don't let anyone think we're attached to the real fork yet. 
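 * (XFS_STAGING_FORK, added to xfs_types.h below, names the bare -1 so
 * the intent is greppable.)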
*/ - cur->bc_ino.whichfork = -1; + cur->bc_ino.whichfork = XFS_STAGING_FORK; xfs_btree_stage_ifakeroot(cur, ifake); return cur; } diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h index 62e02d5380ad..a1004fb3c8fb 100644 --- a/fs/xfs/libxfs/xfs_types.h +++ b/fs/xfs/libxfs/xfs_types.h @@ -80,11 +80,13 @@ typedef void * xfs_failaddr_t; /* * Inode fork identifiers. */ -#define XFS_DATA_FORK 0 -#define XFS_ATTR_FORK 1 -#define XFS_COW_FORK 2 +#define XFS_STAGING_FORK (-1) /* fake fork for staging a btree */ +#define XFS_DATA_FORK (0) +#define XFS_ATTR_FORK (1) +#define XFS_COW_FORK (2) #define XFS_WHICHFORK_STRINGS \ + { XFS_STAGING_FORK, "staging" }, \ { XFS_DATA_FORK, "data" }, \ { XFS_ATTR_FORK, "attr" }, \ { XFS_COW_FORK, "cow" } -- cgit v1.2.3 From 802f91f7b1d535ac975e2d696bf5b5dea82816e7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 22 Feb 2024 12:39:44 -0800 Subject: xfs: fold xfs_bmbt_init_common into xfs_bmbt_init_cursor Make the levels initialization in xfs_bmbt_init_cursor conditional and merge the two helpers. This requires the fakeroot case to now pass a -1 whichfork directly into xfs_bmbt_init_cursor, and some special casing for that, but at least this scheme to deal with the fake btree root is handled and documented in one place now. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong [djwong: tidy up a multiline ternary] Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_bmap_btree.c | 56 ++++++++++++++++++++---------------------- 1 file changed, 27 insertions(+), 29 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index 7381e507b32b..c338c5be4c67 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -544,27 +544,10 @@ const struct xfs_btree_ops xfs_bmbt_ops = { .keys_contiguous = xfs_bmbt_keys_contiguous, }; -static struct xfs_btree_cur * -xfs_bmbt_init_common( - struct xfs_mount *mp, - struct xfs_trans *tp, - struct xfs_inode *ip, - int whichfork) -{ - struct xfs_btree_cur *cur; - - ASSERT(whichfork != XFS_COW_FORK); - - cur = xfs_btree_alloc_cursor(mp, tp, XFS_BTNUM_BMAP, &xfs_bmbt_ops, - mp->m_bm_maxlevels[whichfork], xfs_bmbt_cur_cache); - - cur->bc_ino.ip = ip; - cur->bc_bmap.allocated = 0; - return cur; -} - /* - * Allocate a new bmap btree cursor. + * Create a new bmap btree cursor. + * + * For staging cursors, -1 is passed in as whichfork. */ struct xfs_btree_cur * xfs_bmbt_init_cursor( @@ -573,15 +556,34 @@ xfs_bmbt_init_cursor( struct xfs_inode *ip, int whichfork) { - struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_btree_cur *cur; + unsigned int maxlevels; - cur = xfs_bmbt_init_common(mp, tp, ip, whichfork); + ASSERT(whichfork != XFS_COW_FORK); - cur->bc_nlevels = be16_to_cpu(ifp->if_broot->bb_level) + 1; - cur->bc_ino.forksize = xfs_inode_fork_size(ip, whichfork); + /* + * The data fork always has the larger maxlevels, so use that for staging + * cursors. 
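+ *
+ * In other words m_bm_maxlevels[XFS_DATA_FORK] is at least as large as
+ * the attr fork's value, so a cursor sized for the data fork can stage
+ * either fork's btree.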
+ */ + switch (whichfork) { + case XFS_STAGING_FORK: + maxlevels = mp->m_bm_maxlevels[XFS_DATA_FORK]; + break; + default: + maxlevels = mp->m_bm_maxlevels[whichfork]; + break; + } + cur = xfs_btree_alloc_cursor(mp, tp, XFS_BTNUM_BMAP, &xfs_bmbt_ops, + maxlevels, xfs_bmbt_cur_cache); + cur->bc_ino.ip = ip; cur->bc_ino.whichfork = whichfork; + cur->bc_bmap.allocated = 0; + if (whichfork != XFS_STAGING_FORK) { + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); + cur->bc_nlevels = be16_to_cpu(ifp->if_broot->bb_level) + 1; + cur->bc_ino.forksize = xfs_inode_fork_size(ip, whichfork); + } return cur; } @@ -610,11 +612,7 @@ xfs_bmbt_stage_cursor( { struct xfs_btree_cur *cur; - /* data fork always has larger maxheight */ - cur = xfs_bmbt_init_common(mp, NULL, ip, XFS_DATA_FORK); - - /* Don't let anyone think we're attached to the real fork yet. */ - cur->bc_ino.whichfork = XFS_STAGING_FORK; + cur = xfs_bmbt_init_cursor(mp, NULL, ip, XFS_STAGING_FORK); xfs_btree_stage_ifakeroot(cur, ifake); return cur; } -- cgit v1.2.3 From 02f7ebf5f99c3776bbf048786885eeafeb2f21ca Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 22 Feb 2024 12:39:45 -0800 Subject: xfs: remove xfs_bmbt_stage_cursor Just open code the two calls in the callers. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_bmap_btree.c | 19 ------------------- fs/xfs/libxfs/xfs_bmap_btree.h | 2 -- fs/xfs/scrub/bmap_repair.c | 8 +++++++- 3 files changed, 7 insertions(+), 22 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index c338c5be4c67..1104bf4098e2 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -598,25 +598,6 @@ xfs_bmbt_block_maxrecs( return blocklen / (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t)); } -/* - * Allocate a new bmap btree cursor for reloading an inode block mapping data - * structure. Note that callers can use the staged cursor to reload extents - * format inode forks if they rebuild the iext tree and commit the staged - * cursor immediately. - */ -struct xfs_btree_cur * -xfs_bmbt_stage_cursor( - struct xfs_mount *mp, - struct xfs_inode *ip, - struct xbtree_ifakeroot *ifake) -{ - struct xfs_btree_cur *cur; - - cur = xfs_bmbt_init_cursor(mp, NULL, ip, XFS_STAGING_FORK); - xfs_btree_stage_ifakeroot(cur, ifake); - return cur; -} - /* * Swap in the new inode fork root. Once we pass this point the newly rebuilt * mappings are in place and we have to kill off any old btree blocks. 
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.h b/fs/xfs/libxfs/xfs_bmap_btree.h index e93aa42e2bf5..de1b73f1225c 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.h +++ b/fs/xfs/libxfs/xfs_bmap_btree.h @@ -107,8 +107,6 @@ extern int xfs_bmbt_change_owner(struct xfs_trans *tp, struct xfs_inode *ip, extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *, struct xfs_trans *, struct xfs_inode *, int); -struct xfs_btree_cur *xfs_bmbt_stage_cursor(struct xfs_mount *mp, - struct xfs_inode *ip, struct xbtree_ifakeroot *ifake); void xfs_bmbt_commit_staged_btree(struct xfs_btree_cur *cur, struct xfs_trans *tp, int whichfork); diff --git a/fs/xfs/scrub/bmap_repair.c b/fs/xfs/scrub/bmap_repair.c index a4bb89fdd510..1e656fab5e41 100644 --- a/fs/xfs/scrub/bmap_repair.c +++ b/fs/xfs/scrub/bmap_repair.c @@ -639,7 +639,13 @@ xrep_bmap_build_new_fork( rb->new_bmapbt.bload.get_records = xrep_bmap_get_records; rb->new_bmapbt.bload.claim_block = xrep_bmap_claim_block; rb->new_bmapbt.bload.iroot_size = xrep_bmap_iroot_size; - bmap_cur = xfs_bmbt_stage_cursor(sc->mp, sc->ip, ifake); + + /* + * Allocate a new bmap btree cursor for reloading an inode block mapping + * data structure. + */ + bmap_cur = xfs_bmbt_init_cursor(sc->mp, NULL, sc->ip, XFS_STAGING_FORK); + xfs_btree_stage_ifakeroot(bmap_cur, ifake); /* * Figure out the size and format of the new fork, then fill it with -- cgit v1.2.3 From e45ea3645178c6db91aef4314945b05e4c6ee1fc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 22 Feb 2024 12:39:46 -0800 Subject: xfs: split the agf_roots and agf_levels arrays Using arrays of largely unrelated fields that use the btree number as index is not very robust. Split the arrays into three separate fields instead. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. 
Wong --- fs/xfs/libxfs/xfs_ag.c | 13 +++++------ fs/xfs/libxfs/xfs_ag.h | 8 ++++--- fs/xfs/libxfs/xfs_alloc.c | 49 +++++++++++++++----------------------- fs/xfs/libxfs/xfs_alloc_btree.c | 52 +++++++++++++++++++++++++++-------------- fs/xfs/libxfs/xfs_format.h | 21 ++++++++--------- fs/xfs/libxfs/xfs_rmap_btree.c | 17 +++++++------- fs/xfs/scrub/agheader.c | 12 +++++----- fs/xfs/scrub/agheader_repair.c | 30 ++++++++---------------- fs/xfs/scrub/alloc_repair.c | 18 +++++++------- fs/xfs/xfs_trace.h | 10 ++++---- 10 files changed, 111 insertions(+), 119 deletions(-) diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index 446733893764..e598e14320be 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -669,14 +669,13 @@ xfs_agfblock_init( agf->agf_versionnum = cpu_to_be32(XFS_AGF_VERSION); agf->agf_seqno = cpu_to_be32(id->agno); agf->agf_length = cpu_to_be32(id->agsize); - agf->agf_roots[XFS_BTNUM_BNOi] = cpu_to_be32(XFS_BNO_BLOCK(mp)); - agf->agf_roots[XFS_BTNUM_CNTi] = cpu_to_be32(XFS_CNT_BLOCK(mp)); - agf->agf_levels[XFS_BTNUM_BNOi] = cpu_to_be32(1); - agf->agf_levels[XFS_BTNUM_CNTi] = cpu_to_be32(1); + agf->agf_bno_root = cpu_to_be32(XFS_BNO_BLOCK(mp)); + agf->agf_cnt_root = cpu_to_be32(XFS_CNT_BLOCK(mp)); + agf->agf_bno_level = cpu_to_be32(1); + agf->agf_cnt_level = cpu_to_be32(1); if (xfs_has_rmapbt(mp)) { - agf->agf_roots[XFS_BTNUM_RMAPi] = - cpu_to_be32(XFS_RMAP_BLOCK(mp)); - agf->agf_levels[XFS_BTNUM_RMAPi] = cpu_to_be32(1); + agf->agf_rmap_root = cpu_to_be32(XFS_RMAP_BLOCK(mp)); + agf->agf_rmap_level = cpu_to_be32(1); agf->agf_rmap_blocks = cpu_to_be32(1); } diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h index 77c0fa2bb510..19eddba09894 100644 --- a/fs/xfs/libxfs/xfs_ag.h +++ b/fs/xfs/libxfs/xfs_ag.h @@ -36,8 +36,9 @@ struct xfs_perag { atomic_t pag_active_ref; /* active reference count */ wait_queue_head_t pag_active_wq;/* woken active_ref falls to zero */ unsigned long pag_opstate; - uint8_t pagf_levels[XFS_BTNUM_AGF]; - /* # of levels in bno & cnt btree */ + uint8_t pagf_bno_level; /* # of levels in bno btree */ + uint8_t pagf_cnt_level; /* # of levels in cnt btree */ + uint8_t pagf_rmap_level;/* # of levels in rmap btree */ uint32_t pagf_flcount; /* count of blocks in freelist */ xfs_extlen_t pagf_freeblks; /* total free blocks */ xfs_extlen_t pagf_longest; /* longest free space */ @@ -86,7 +87,8 @@ struct xfs_perag { * Alternate btree heights so that online repair won't trip the write * verifiers while rebuilding the AG btrees. */ - uint8_t pagf_repair_levels[XFS_BTNUM_AGF]; + uint8_t pagf_repair_bno_level; + uint8_t pagf_repair_cnt_level; uint8_t pagf_repair_refcount_level; #endif diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 91553a54dc30..7300dc219589 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -2335,8 +2335,9 @@ xfs_alloc_min_freelist( struct xfs_perag *pag) { /* AG btrees have at least 1 level. */ - static const uint8_t fake_levels[XFS_BTNUM_AGF] = {1, 1, 1}; - const uint8_t *levels = pag ? pag->pagf_levels : fake_levels; + const unsigned int bno_level = pag ? pag->pagf_bno_level : 1; + const unsigned int cnt_level = pag ? pag->pagf_cnt_level : 1; + const unsigned int rmap_level = pag ? 
pag->pagf_rmap_level : 1; unsigned int min_free; ASSERT(mp->m_alloc_maxlevels > 0); @@ -2363,16 +2364,12 @@ xfs_alloc_min_freelist( */ /* space needed by-bno freespace btree */ - min_free = min_t(unsigned int, levels[XFS_BTNUM_BNOi] + 1, - mp->m_alloc_maxlevels) * 2 - 2; + min_free = min(bno_level + 1, mp->m_alloc_maxlevels) * 2 - 2; /* space needed by-size freespace btree */ - min_free += min_t(unsigned int, levels[XFS_BTNUM_CNTi] + 1, - mp->m_alloc_maxlevels) * 2 - 2; + min_free += min(cnt_level + 1, mp->m_alloc_maxlevels) * 2 - 2; /* space needed reverse mapping used space btree */ if (xfs_has_rmapbt(mp)) - min_free += min_t(unsigned int, levels[XFS_BTNUM_RMAPi] + 1, - mp->m_rmap_maxlevels) * 2 - 2; - + min_free += min(rmap_level + 1, mp->m_rmap_maxlevels) * 2 - 2; return min_free; } @@ -3056,8 +3053,8 @@ xfs_alloc_log_agf( offsetof(xfs_agf_t, agf_versionnum), offsetof(xfs_agf_t, agf_seqno), offsetof(xfs_agf_t, agf_length), - offsetof(xfs_agf_t, agf_roots[0]), - offsetof(xfs_agf_t, agf_levels[0]), + offsetof(xfs_agf_t, agf_bno_root), /* also cnt/rmap root */ + offsetof(xfs_agf_t, agf_bno_level), /* also cnt/rmap levels */ offsetof(xfs_agf_t, agf_flfirst), offsetof(xfs_agf_t, agf_fllast), offsetof(xfs_agf_t, agf_flcount), @@ -3236,12 +3233,10 @@ xfs_agf_verify( be32_to_cpu(agf->agf_freeblks) > agf_length) return __this_address; - if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) < 1 || - be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) < 1 || - be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) > - mp->m_alloc_maxlevels || - be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) > - mp->m_alloc_maxlevels) + if (be32_to_cpu(agf->agf_bno_level) < 1 || + be32_to_cpu(agf->agf_cnt_level) < 1 || + be32_to_cpu(agf->agf_bno_level) > mp->m_alloc_maxlevels || + be32_to_cpu(agf->agf_cnt_level) > mp->m_alloc_maxlevels) return __this_address; if (xfs_has_lazysbcount(mp) && @@ -3252,9 +3247,8 @@ xfs_agf_verify( if (be32_to_cpu(agf->agf_rmap_blocks) > agf_length) return __this_address; - if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) < 1 || - be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) > - mp->m_rmap_maxlevels) + if (be32_to_cpu(agf->agf_rmap_level) < 1 || + be32_to_cpu(agf->agf_rmap_level) > mp->m_rmap_maxlevels) return __this_address; } @@ -3380,12 +3374,9 @@ xfs_alloc_read_agf( pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks); pag->pagf_flcount = be32_to_cpu(agf->agf_flcount); pag->pagf_longest = be32_to_cpu(agf->agf_longest); - pag->pagf_levels[XFS_BTNUM_BNOi] = - be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]); - pag->pagf_levels[XFS_BTNUM_CNTi] = - be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]); - pag->pagf_levels[XFS_BTNUM_RMAPi] = - be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAPi]); + pag->pagf_bno_level = be32_to_cpu(agf->agf_bno_level); + pag->pagf_cnt_level = be32_to_cpu(agf->agf_cnt_level); + pag->pagf_rmap_level = be32_to_cpu(agf->agf_rmap_level); pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level); if (xfs_agfl_needs_reset(pag->pag_mount, agf)) set_bit(XFS_AGSTATE_AGFL_NEEDS_RESET, &pag->pag_opstate); @@ -3414,10 +3405,8 @@ xfs_alloc_read_agf( ASSERT(pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks)); ASSERT(pag->pagf_flcount == be32_to_cpu(agf->agf_flcount)); ASSERT(pag->pagf_longest == be32_to_cpu(agf->agf_longest)); - ASSERT(pag->pagf_levels[XFS_BTNUM_BNOi] == - be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi])); - ASSERT(pag->pagf_levels[XFS_BTNUM_CNTi] == - be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi])); + ASSERT(pag->pagf_bno_level == be32_to_cpu(agf->agf_bno_level)); + ASSERT(pag->pagf_cnt_level 
== be32_to_cpu(agf->agf_cnt_level)); } #endif if (agfbpp) diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index 99859803bb0b..6c09573a98a9 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -38,13 +38,18 @@ xfs_allocbt_set_root( { struct xfs_buf *agbp = cur->bc_ag.agbp; struct xfs_agf *agf = agbp->b_addr; - int btnum = cur->bc_btnum; ASSERT(ptr->s != 0); - agf->agf_roots[btnum] = ptr->s; - be32_add_cpu(&agf->agf_levels[btnum], inc); - cur->bc_ag.pag->pagf_levels[btnum] += inc; + if (cur->bc_btnum == XFS_BTNUM_BNO) { + agf->agf_bno_root = ptr->s; + be32_add_cpu(&agf->agf_bno_level, inc); + cur->bc_ag.pag->pagf_bno_level += inc; + } else { + agf->agf_cnt_root = ptr->s; + be32_add_cpu(&agf->agf_cnt_level, inc); + cur->bc_ag.pag->pagf_cnt_level += inc; + } xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS); } @@ -226,7 +231,10 @@ xfs_allocbt_init_ptr_from_cur( ASSERT(cur->bc_ag.pag->pag_agno == be32_to_cpu(agf->agf_seqno)); - ptr->s = agf->agf_roots[cur->bc_btnum]; + if (cur->bc_btnum == XFS_BTNUM_BNO) + ptr->s = agf->agf_bno_root; + else + ptr->s = agf->agf_cnt_root; } STATIC int64_t @@ -299,7 +307,6 @@ xfs_allocbt_verify( struct xfs_perag *pag = bp->b_pag; xfs_failaddr_t fa; unsigned int level; - xfs_btnum_t btnum = XFS_BTNUM_BNOi; if (!xfs_verify_magic(bp, block->bb_magic)) return __this_address; @@ -320,21 +327,27 @@ xfs_allocbt_verify( * against. */ level = be16_to_cpu(block->bb_level); - if (bp->b_ops->magic[0] == cpu_to_be32(XFS_ABTC_MAGIC)) - btnum = XFS_BTNUM_CNTi; if (pag && xfs_perag_initialised_agf(pag)) { - unsigned int maxlevel = pag->pagf_levels[btnum]; + unsigned int maxlevel, repair_maxlevel = 0; -#ifdef CONFIG_XFS_ONLINE_REPAIR /* * Online repair could be rewriting the free space btrees, so * we'll validate against the larger of either tree while this * is going on. */ - maxlevel = max_t(unsigned int, maxlevel, - pag->pagf_repair_levels[btnum]); + if (bp->b_ops->magic[0] == cpu_to_be32(XFS_ABTC_MAGIC)) { + maxlevel = pag->pagf_cnt_level; +#ifdef CONFIG_XFS_ONLINE_REPAIR + repair_maxlevel = pag->pagf_repair_cnt_level; #endif - if (level >= maxlevel) + } else { + maxlevel = pag->pagf_bno_level; +#ifdef CONFIG_XFS_ONLINE_REPAIR + repair_maxlevel = pag->pagf_repair_bno_level; +#endif + } + + if (level >= max(maxlevel, repair_maxlevel)) return __this_address; } else if (level >= mp->m_alloc_maxlevels) return __this_address; @@ -542,8 +555,8 @@ xfs_allocbt_init_cursor( struct xfs_agf *agf = agbp->b_addr; cur->bc_nlevels = (btnum == XFS_BTNUM_BNO) ? 
- be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) : - be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]); + be32_to_cpu(agf->agf_bno_level) : + be32_to_cpu(agf->agf_cnt_level); } return cur; } @@ -563,8 +576,13 @@ xfs_allocbt_commit_staged_btree( ASSERT(cur->bc_flags & XFS_BTREE_STAGING); - agf->agf_roots[cur->bc_btnum] = cpu_to_be32(afake->af_root); - agf->agf_levels[cur->bc_btnum] = cpu_to_be32(afake->af_levels); + if (cur->bc_btnum == XFS_BTNUM_BNO) { + agf->agf_bno_root = cpu_to_be32(afake->af_root); + agf->agf_bno_level = cpu_to_be32(afake->af_levels); + } else { + agf->agf_cnt_root = cpu_to_be32(afake->af_root); + agf->agf_cnt_level = cpu_to_be32(afake->af_levels); + } xfs_alloc_log_agf(tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS); xfs_btree_commit_afakeroot(cur, tp, agbp); diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 382ab1e71c0b..2b2f9050fbfb 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -477,15 +477,9 @@ xfs_is_quota_inode(struct xfs_sb *sbp, xfs_ino_t ino) #define XFS_AGI_GOOD_VERSION(v) ((v) == XFS_AGI_VERSION) /* - * Btree number 0 is bno, 1 is cnt, 2 is rmap. This value gives the size of the - * arrays below. - */ -#define XFS_BTNUM_AGF ((int)XFS_BTNUM_RMAPi + 1) - -/* - * The second word of agf_levels in the first a.g. overlaps the EFS - * superblock's magic number. Since the magic numbers valid for EFS - * are > 64k, our value cannot be confused for an EFS superblock's. + * agf_cnt_level in the first AGF overlaps the EFS superblock's magic number. + * Since the magic numbers valid for EFS are > 64k, our value cannot be confused + * for an EFS superblock. */ typedef struct xfs_agf { @@ -499,8 +493,13 @@ typedef struct xfs_agf { /* * Freespace and rmap information */ - __be32 agf_roots[XFS_BTNUM_AGF]; /* root blocks */ - __be32 agf_levels[XFS_BTNUM_AGF]; /* btree levels */ + __be32 agf_bno_root; /* bnobt root block */ + __be32 agf_cnt_root; /* cntbt root block */ + __be32 agf_rmap_root; /* rmapbt root block */ + + __be32 agf_bno_level; /* bnobt btree levels */ + __be32 agf_cnt_level; /* cntbt btree levels */ + __be32 agf_rmap_level; /* rmapbt btree levels */ __be32 agf_flfirst; /* first freelist block's index */ __be32 agf_fllast; /* last freelist block's index */ diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index b0da31f49ca8..e050d7342d6c 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -65,13 +65,12 @@ xfs_rmapbt_set_root( { struct xfs_buf *agbp = cur->bc_ag.agbp; struct xfs_agf *agf = agbp->b_addr; - int btnum = cur->bc_btnum; ASSERT(ptr->s != 0); - agf->agf_roots[btnum] = ptr->s; - be32_add_cpu(&agf->agf_levels[btnum], inc); - cur->bc_ag.pag->pagf_levels[btnum] += inc; + agf->agf_rmap_root = ptr->s; + be32_add_cpu(&agf->agf_rmap_level, inc); + cur->bc_ag.pag->pagf_rmap_level += inc; xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS); } @@ -222,7 +221,7 @@ xfs_rmapbt_init_ptr_from_cur( ASSERT(cur->bc_ag.pag->pag_agno == be32_to_cpu(agf->agf_seqno)); - ptr->s = agf->agf_roots[cur->bc_btnum]; + ptr->s = agf->agf_rmap_root; } /* @@ -342,7 +341,7 @@ xfs_rmapbt_verify( level = be16_to_cpu(block->bb_level); if (pag && xfs_perag_initialised_agf(pag)) { - if (level >= pag->pagf_levels[XFS_BTNUM_RMAPi]) + if (level >= pag->pagf_rmap_level) return __this_address; } else if (level >= mp->m_rmap_maxlevels) return __this_address; @@ -523,7 +522,7 @@ xfs_rmapbt_init_cursor( if (agbp) { struct xfs_agf *agf = agbp->b_addr; - cur->bc_nlevels = 
be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]); + cur->bc_nlevels = be32_to_cpu(agf->agf_rmap_level); } return cur; } @@ -543,8 +542,8 @@ xfs_rmapbt_commit_staged_btree( ASSERT(cur->bc_flags & XFS_BTREE_STAGING); - agf->agf_roots[cur->bc_btnum] = cpu_to_be32(afake->af_root); - agf->agf_levels[cur->bc_btnum] = cpu_to_be32(afake->af_levels); + agf->agf_rmap_root = cpu_to_be32(afake->af_root); + agf->agf_rmap_level = cpu_to_be32(afake->af_levels); agf->agf_rmap_blocks = cpu_to_be32(afake->af_blocks); xfs_alloc_log_agf(tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS | XFS_AGF_RMAP_BLOCKS); diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c index 6c6e5eba42c8..e954f07679dd 100644 --- a/fs/xfs/scrub/agheader.c +++ b/fs/xfs/scrub/agheader.c @@ -556,28 +556,28 @@ xchk_agf( xchk_block_set_corrupt(sc, sc->sa.agf_bp); /* Check the AGF btree roots and levels */ - agbno = be32_to_cpu(agf->agf_roots[XFS_BTNUM_BNO]); + agbno = be32_to_cpu(agf->agf_bno_root); if (!xfs_verify_agbno(pag, agbno)) xchk_block_set_corrupt(sc, sc->sa.agf_bp); - agbno = be32_to_cpu(agf->agf_roots[XFS_BTNUM_CNT]); + agbno = be32_to_cpu(agf->agf_cnt_root); if (!xfs_verify_agbno(pag, agbno)) xchk_block_set_corrupt(sc, sc->sa.agf_bp); - level = be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]); + level = be32_to_cpu(agf->agf_bno_level); if (level <= 0 || level > mp->m_alloc_maxlevels) xchk_block_set_corrupt(sc, sc->sa.agf_bp); - level = be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]); + level = be32_to_cpu(agf->agf_cnt_level); if (level <= 0 || level > mp->m_alloc_maxlevels) xchk_block_set_corrupt(sc, sc->sa.agf_bp); if (xfs_has_rmapbt(mp)) { - agbno = be32_to_cpu(agf->agf_roots[XFS_BTNUM_RMAP]); + agbno = be32_to_cpu(agf->agf_rmap_root); if (!xfs_verify_agbno(pag, agbno)) xchk_block_set_corrupt(sc, sc->sa.agf_bp); - level = be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]); + level = be32_to_cpu(agf->agf_rmap_level); if (level <= 0 || level > mp->m_rmap_maxlevels) xchk_block_set_corrupt(sc, sc->sa.agf_bp); } diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c index 26bd1ff68f1b..e4dd4fe84c5f 100644 --- a/fs/xfs/scrub/agheader_repair.c +++ b/fs/xfs/scrub/agheader_repair.c @@ -174,8 +174,7 @@ xrep_agf_find_btrees( * We relied on the rmapbt to reconstruct the AGF. If we get a * different root then something's seriously wrong. */ - if (fab[XREP_AGF_RMAPBT].root != - be32_to_cpu(old_agf->agf_roots[XFS_BTNUM_RMAPi])) + if (fab[XREP_AGF_RMAPBT].root != be32_to_cpu(old_agf->agf_rmap_root)) return -EFSCORRUPTED; /* We must find the refcountbt root if that feature is enabled. 
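 * (xfs_has_reflink() gates this check, matching the agf_refcount_root
 * handling in xrep_agf_set_roots() below.)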
*/ @@ -224,20 +223,14 @@ xrep_agf_set_roots( struct xfs_agf *agf, struct xrep_find_ag_btree *fab) { - agf->agf_roots[XFS_BTNUM_BNOi] = - cpu_to_be32(fab[XREP_AGF_BNOBT].root); - agf->agf_levels[XFS_BTNUM_BNOi] = - cpu_to_be32(fab[XREP_AGF_BNOBT].height); + agf->agf_bno_root = cpu_to_be32(fab[XREP_AGF_BNOBT].root); + agf->agf_bno_level = cpu_to_be32(fab[XREP_AGF_BNOBT].height); - agf->agf_roots[XFS_BTNUM_CNTi] = - cpu_to_be32(fab[XREP_AGF_CNTBT].root); - agf->agf_levels[XFS_BTNUM_CNTi] = - cpu_to_be32(fab[XREP_AGF_CNTBT].height); + agf->agf_cnt_root = cpu_to_be32(fab[XREP_AGF_CNTBT].root); + agf->agf_cnt_level = cpu_to_be32(fab[XREP_AGF_CNTBT].height); - agf->agf_roots[XFS_BTNUM_RMAPi] = - cpu_to_be32(fab[XREP_AGF_RMAPBT].root); - agf->agf_levels[XFS_BTNUM_RMAPi] = - cpu_to_be32(fab[XREP_AGF_RMAPBT].height); + agf->agf_rmap_root = cpu_to_be32(fab[XREP_AGF_RMAPBT].root); + agf->agf_rmap_level = cpu_to_be32(fab[XREP_AGF_RMAPBT].height); if (xfs_has_reflink(sc->mp)) { agf->agf_refcount_root = @@ -333,12 +326,9 @@ xrep_agf_commit_new( pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks); pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks); pag->pagf_longest = be32_to_cpu(agf->agf_longest); - pag->pagf_levels[XFS_BTNUM_BNOi] = - be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]); - pag->pagf_levels[XFS_BTNUM_CNTi] = - be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]); - pag->pagf_levels[XFS_BTNUM_RMAPi] = - be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAPi]); + pag->pagf_bno_level = be32_to_cpu(agf->agf_bno_level); + pag->pagf_cnt_level = be32_to_cpu(agf->agf_cnt_level); + pag->pagf_rmap_level = be32_to_cpu(agf->agf_rmap_level); pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level); set_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate); diff --git a/fs/xfs/scrub/alloc_repair.c b/fs/xfs/scrub/alloc_repair.c index 544c53d450ce..0ef27aacbf25 100644 --- a/fs/xfs/scrub/alloc_repair.c +++ b/fs/xfs/scrub/alloc_repair.c @@ -687,8 +687,8 @@ xrep_abt_reset_counters( * height values before re-initializing the perag info from the updated * AGF to capture all the new values. */ - pag->pagf_repair_levels[XFS_BTNUM_BNOi] = pag->pagf_levels[XFS_BTNUM_BNOi]; - pag->pagf_repair_levels[XFS_BTNUM_CNTi] = pag->pagf_levels[XFS_BTNUM_CNTi]; + pag->pagf_repair_bno_level = pag->pagf_bno_level; + pag->pagf_repair_cnt_level = pag->pagf_cnt_level; /* Reinitialize with the values we just logged. */ return xrep_reinit_pagf(sc); @@ -768,10 +768,8 @@ xrep_abt_build_new_trees( * height so that we don't trip the verifiers when writing the new * btree blocks to disk. */ - pag->pagf_repair_levels[XFS_BTNUM_BNOi] = - ra->new_bnobt.bload.btree_height; - pag->pagf_repair_levels[XFS_BTNUM_CNTi] = - ra->new_cntbt.bload.btree_height; + pag->pagf_repair_bno_level = ra->new_bnobt.bload.btree_height; + pag->pagf_repair_cnt_level = ra->new_cntbt.bload.btree_height; /* Load the free space by length tree. */ ra->array_cur = XFARRAY_CURSOR_INIT; @@ -810,8 +808,8 @@ xrep_abt_build_new_trees( return xrep_roll_ag_trans(sc); err_levels: - pag->pagf_repair_levels[XFS_BTNUM_BNOi] = 0; - pag->pagf_repair_levels[XFS_BTNUM_CNTi] = 0; + pag->pagf_repair_bno_level = 0; + pag->pagf_repair_cnt_level = 0; err_cur: xfs_btree_del_cursor(cnt_cur, error); xfs_btree_del_cursor(bno_cur, error); @@ -841,8 +839,8 @@ xrep_abt_remove_old_trees( * Now that we've zapped all the old allocbt blocks we can turn off * the alternate height mechanism. 
*/ - pag->pagf_repair_levels[XFS_BTNUM_BNOi] = 0; - pag->pagf_repair_levels[XFS_BTNUM_CNTi] = 0; + pag->pagf_repair_bno_level = 0; + pag->pagf_repair_cnt_level = 0; return 0; } diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 9d0d7085b079..e9c760fe7aa9 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -1710,12 +1710,10 @@ DECLARE_EVENT_CLASS(xfs_agf_class, __entry->agno = be32_to_cpu(agf->agf_seqno), __entry->flags = flags; __entry->length = be32_to_cpu(agf->agf_length), - __entry->bno_root = be32_to_cpu(agf->agf_roots[XFS_BTNUM_BNO]), - __entry->cnt_root = be32_to_cpu(agf->agf_roots[XFS_BTNUM_CNT]), - __entry->bno_level = - be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]), - __entry->cnt_level = - be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]), + __entry->bno_root = be32_to_cpu(agf->agf_bno_root), + __entry->cnt_root = be32_to_cpu(agf->agf_cnt_root), + __entry->bno_level = be32_to_cpu(agf->agf_bno_level), + __entry->cnt_level = be32_to_cpu(agf->agf_cnt_level), __entry->flfirst = be32_to_cpu(agf->agf_flfirst), __entry->fllast = be32_to_cpu(agf->agf_fllast), __entry->flcount = be32_to_cpu(agf->agf_flcount), -- cgit v1.2.3 From 77953b97bb19dc031673d055c811a5ba7df92307 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 22 Feb 2024 12:39:47 -0800 Subject: xfs: add a name field to struct xfs_btree_ops The btnum in struct xfs_btree_ops is often used for printing a symbolic name for the btree. Add a name field to the ops structure and use that directly. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_alloc.c | 8 ++--- fs/xfs/libxfs/xfs_alloc_btree.c | 2 ++ fs/xfs/libxfs/xfs_bmap_btree.c | 1 + fs/xfs/libxfs/xfs_btree.c | 8 ++--- fs/xfs/libxfs/xfs_btree.h | 2 ++ fs/xfs/libxfs/xfs_ialloc.c | 5 ++- fs/xfs/libxfs/xfs_ialloc_btree.c | 2 ++ fs/xfs/libxfs/xfs_refcount_btree.c | 1 + fs/xfs/libxfs/xfs_rmap_btree.c | 1 + fs/xfs/libxfs/xfs_types.h | 9 ----- fs/xfs/scrub/trace.h | 40 +++++++++++----------- fs/xfs/xfs_trace.h | 70 +++++++++++++++++++------------------- 12 files changed, 73 insertions(+), 76 deletions(-) diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 7300dc219589..44d4f0da90ba 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -273,9 +273,8 @@ xfs_alloc_complain_bad_rec( struct xfs_mount *mp = cur->bc_mp; xfs_warn(mp, - "%s Freespace BTree record corruption in AG %d detected at %pS!", - cur->bc_btnum == XFS_BTNUM_BNO ? 
"Block" : "Size", - cur->bc_ag.pag->pag_agno, fa); + "%sbt record corruption in AG %d detected at %pS!", + cur->bc_ops->name, cur->bc_ag.pag->pag_agno, fa); xfs_warn(mp, "start block 0x%x block count 0x%x", irec->ar_startblock, irec->ar_blockcount); @@ -996,8 +995,7 @@ xfs_alloc_cur_check( out: if (deactivate) cur->bc_flags &= ~XFS_BTREE_ALLOCBT_ACTIVE; - trace_xfs_alloc_cur_check(args->mp, cur->bc_btnum, bno, len, diff, - *new); + trace_xfs_alloc_cur_check(cur, bno, len, diff, *new); return 0; } diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index 6c09573a98a9..262f5dc3a483 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -468,6 +468,7 @@ xfs_allocbt_keys_contiguous( } const struct xfs_btree_ops xfs_bnobt_ops = { + .name = "bno", .type = XFS_BTREE_TYPE_AG, .rec_len = sizeof(xfs_alloc_rec_t), @@ -497,6 +498,7 @@ const struct xfs_btree_ops xfs_bnobt_ops = { }; const struct xfs_btree_ops xfs_cntbt_ops = { + .name = "cnt", .type = XFS_BTREE_TYPE_AG, .geom_flags = XFS_BTGEO_LASTREC_UPDATE, diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index 1104bf4098e2..25193551e95b 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -517,6 +517,7 @@ xfs_bmbt_keys_contiguous( } const struct xfs_btree_ops xfs_bmbt_ops = { + .name = "bmap", .type = XFS_BTREE_TYPE_INODE, .rec_len = sizeof(xfs_bmbt_rec_t), diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 2649f24ed748..278461d0f64d 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -298,17 +298,17 @@ xfs_btree_check_ptr( level)) return 0; xfs_err(cur->bc_mp, -"Inode %llu fork %d: Corrupt btree %d pointer at level %d index %d.", +"Inode %llu fork %d: Corrupt %sbt pointer at level %d index %d.", cur->bc_ino.ip->i_ino, - cur->bc_ino.whichfork, cur->bc_btnum, + cur->bc_ino.whichfork, cur->bc_ops->name, level, index); } else { if (xfs_btree_check_sptr(cur, be32_to_cpu((&ptr->s)[index]), level)) return 0; xfs_err(cur->bc_mp, -"AG %u: Corrupt btree %d pointer at level %d index %d.", - cur->bc_ag.pag->pag_agno, cur->bc_btnum, +"AG %u: Corrupt %sbt pointer at level %d index %d.", + cur->bc_ag.pag->pag_agno, cur->bc_ops->name, level, index); } diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 99194ae94694..6bc6096205b3 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -123,6 +123,8 @@ enum xfs_btree_type { }; struct xfs_btree_ops { + const char *name; + /* Type of btree - AG-rooted or inode-rooted */ enum xfs_btree_type type; diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 1ff867075026..52b82cd7e6c9 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -141,9 +141,8 @@ xfs_inobt_complain_bad_rec( struct xfs_mount *mp = cur->bc_mp; xfs_warn(mp, - "%s Inode BTree record corruption in AG %d detected at %pS!", - cur->bc_btnum == XFS_BTNUM_INO ? 
"Used" : "Free", - cur->bc_ag.pag->pag_agno, fa); + "%sbt record corruption in AG %d detected at %pS!", + cur->bc_ops->name, cur->bc_ag.pag->pag_agno, fa); xfs_warn(mp, "start inode 0x%x, count 0x%x, free 0x%x freemask 0x%llx, holemask 0x%x", irec->ir_startino, irec->ir_count, irec->ir_freecount, diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index b45a2e581313..ddb9a226914a 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -399,6 +399,7 @@ xfs_inobt_keys_contiguous( } const struct xfs_btree_ops xfs_inobt_ops = { + .name = "ino", .type = XFS_BTREE_TYPE_AG, .rec_len = sizeof(xfs_inobt_rec_t), @@ -427,6 +428,7 @@ const struct xfs_btree_ops xfs_inobt_ops = { }; const struct xfs_btree_ops xfs_finobt_ops = { + .name = "fino", .type = XFS_BTREE_TYPE_AG, .rec_len = sizeof(xfs_inobt_rec_t), diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c index 4dcf6295e683..16677cbbddfc 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.c +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -318,6 +318,7 @@ xfs_refcountbt_keys_contiguous( } const struct xfs_btree_ops xfs_refcountbt_ops = { + .name = "refcount", .type = XFS_BTREE_TYPE_AG, .rec_len = sizeof(struct xfs_refcount_rec), diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index e050d7342d6c..e1ddf814492c 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -472,6 +472,7 @@ xfs_rmapbt_keys_contiguous( } const struct xfs_btree_ops xfs_rmapbt_ops = { + .name = "rmap", .type = XFS_BTREE_TYPE_AG, .geom_flags = XFS_BTGEO_OVERLAPPING, diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h index a1004fb3c8fb..f577247b748d 100644 --- a/fs/xfs/libxfs/xfs_types.h +++ b/fs/xfs/libxfs/xfs_types.h @@ -125,15 +125,6 @@ typedef enum { XFS_BTNUM_INOi, XFS_BTNUM_FINOi, XFS_BTNUM_REFCi, XFS_BTNUM_MAX } xfs_btnum_t; -#define XFS_BTNUM_STRINGS \ - { XFS_BTNUM_BNOi, "bnobt" }, \ - { XFS_BTNUM_CNTi, "cntbt" }, \ - { XFS_BTNUM_RMAPi, "rmapbt" }, \ - { XFS_BTNUM_BMAPi, "bmbt" }, \ - { XFS_BTNUM_INOi, "inobt" }, \ - { XFS_BTNUM_FINOi, "finobt" }, \ - { XFS_BTNUM_REFCi, "refcbt" } - struct xfs_name { const unsigned char *name; int len; diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index 1e448d0c5aee..2c2f99d8772c 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -459,7 +459,7 @@ TRACE_EVENT(xchk_btree_op_error, TP_STRUCT__entry( __field(dev_t, dev) __field(unsigned int, type) - __field(xfs_btnum_t, btnum) + __string(name, cur->bc_ops->name) __field(int, level) __field(xfs_agnumber_t, agno) __field(xfs_agblock_t, bno) @@ -472,7 +472,7 @@ TRACE_EVENT(xchk_btree_op_error, __entry->dev = sc->mp->m_super->s_dev; __entry->type = sc->sm->sm_type; - __entry->btnum = cur->bc_btnum; + __assign_str(name, cur->bc_ops->name); __entry->level = level; __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno); __entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno); @@ -480,10 +480,10 @@ TRACE_EVENT(xchk_btree_op_error, __entry->error = error; __entry->ret_ip = ret_ip; ), - TP_printk("dev %d:%d type %s btree %s level %d ptr %d agno 0x%x agbno 0x%x error %d ret_ip %pS", + TP_printk("dev %d:%d type %s %sbt level %d ptr %d agno 0x%x agbno 0x%x error %d ret_ip %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), - __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS), + __get_str(name), __entry->level, __entry->ptr, __entry->agno, @@ -501,7 +501,7 @@ 
TRACE_EVENT(xchk_ifork_btree_op_error, __field(xfs_ino_t, ino) __field(int, whichfork) __field(unsigned int, type) - __field(xfs_btnum_t, btnum) + __string(name, cur->bc_ops->name) __field(int, level) __field(int, ptr) __field(xfs_agnumber_t, agno) @@ -515,7 +515,7 @@ TRACE_EVENT(xchk_ifork_btree_op_error, __entry->ino = sc->ip->i_ino; __entry->whichfork = cur->bc_ino.whichfork; __entry->type = sc->sm->sm_type; - __entry->btnum = cur->bc_btnum; + __assign_str(name, cur->bc_ops->name); __entry->level = level; __entry->ptr = cur->bc_levels[level].ptr; __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno); @@ -523,12 +523,12 @@ TRACE_EVENT(xchk_ifork_btree_op_error, __entry->error = error; __entry->ret_ip = ret_ip; ), - TP_printk("dev %d:%d ino 0x%llx fork %s type %s btree %s level %d ptr %d agno 0x%x agbno 0x%x error %d ret_ip %pS", + TP_printk("dev %d:%d ino 0x%llx fork %s type %s %sbt level %d ptr %d agno 0x%x agbno 0x%x error %d ret_ip %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS), __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), - __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS), + __get_str(name), __entry->level, __entry->ptr, __entry->agno, @@ -544,7 +544,7 @@ TRACE_EVENT(xchk_btree_error, TP_STRUCT__entry( __field(dev_t, dev) __field(unsigned int, type) - __field(xfs_btnum_t, btnum) + __string(name, cur->bc_ops->name) __field(int, level) __field(xfs_agnumber_t, agno) __field(xfs_agblock_t, bno) @@ -555,17 +555,17 @@ TRACE_EVENT(xchk_btree_error, xfs_fsblock_t fsbno = xchk_btree_cur_fsbno(cur, level); __entry->dev = sc->mp->m_super->s_dev; __entry->type = sc->sm->sm_type; - __entry->btnum = cur->bc_btnum; + __assign_str(name, cur->bc_ops->name); __entry->level = level; __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno); __entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno); __entry->ptr = cur->bc_levels[level].ptr; __entry->ret_ip = ret_ip; ), - TP_printk("dev %d:%d type %s btree %s level %d ptr %d agno 0x%x agbno 0x%x ret_ip %pS", + TP_printk("dev %d:%d type %s %sbt level %d ptr %d agno 0x%x agbno 0x%x ret_ip %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), - __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS), + __get_str(name), __entry->level, __entry->ptr, __entry->agno, @@ -582,7 +582,7 @@ TRACE_EVENT(xchk_ifork_btree_error, __field(xfs_ino_t, ino) __field(int, whichfork) __field(unsigned int, type) - __field(xfs_btnum_t, btnum) + __string(name, cur->bc_ops->name) __field(int, level) __field(xfs_agnumber_t, agno) __field(xfs_agblock_t, bno) @@ -595,19 +595,19 @@ TRACE_EVENT(xchk_ifork_btree_error, __entry->ino = sc->ip->i_ino; __entry->whichfork = cur->bc_ino.whichfork; __entry->type = sc->sm->sm_type; - __entry->btnum = cur->bc_btnum; + __assign_str(name, cur->bc_ops->name); __entry->level = level; __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno); __entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno); __entry->ptr = cur->bc_levels[level].ptr; __entry->ret_ip = ret_ip; ), - TP_printk("dev %d:%d ino 0x%llx fork %s type %s btree %s level %d ptr %d agno 0x%x agbno 0x%x ret_ip %pS", + TP_printk("dev %d:%d ino 0x%llx fork %s type %s %sbt level %d ptr %d agno 0x%x agbno 0x%x ret_ip %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS), __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), - __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS), + __get_str(name), __entry->level, 
__entry->ptr, __entry->agno, @@ -622,7 +622,7 @@ DECLARE_EVENT_CLASS(xchk_sbtree_class, TP_STRUCT__entry( __field(dev_t, dev) __field(int, type) - __field(xfs_btnum_t, btnum) + __string(name, cur->bc_ops->name) __field(xfs_agnumber_t, agno) __field(xfs_agblock_t, bno) __field(int, level) @@ -634,17 +634,17 @@ DECLARE_EVENT_CLASS(xchk_sbtree_class, __entry->dev = sc->mp->m_super->s_dev; __entry->type = sc->sm->sm_type; - __entry->btnum = cur->bc_btnum; + __assign_str(name, cur->bc_ops->name); __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno); __entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno); __entry->level = level; __entry->nlevels = cur->bc_nlevels; __entry->ptr = cur->bc_levels[level].ptr; ), - TP_printk("dev %d:%d type %s btree %s agno 0x%x agbno 0x%x level %d nlevels %d ptr %d", + TP_printk("dev %d:%d type %s %sbt agno 0x%x agbno 0x%x level %d nlevels %d ptr %d", MAJOR(__entry->dev), MINOR(__entry->dev), __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), - __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS), + __get_str(name), __entry->agno, __entry->bno, __entry->level, diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index e9c760fe7aa9..498b8922062a 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -1888,28 +1888,28 @@ DEFINE_ALLOC_EVENT(xfs_alloc_vextent_near_bno); DEFINE_ALLOC_EVENT(xfs_alloc_vextent_finish); TRACE_EVENT(xfs_alloc_cur_check, - TP_PROTO(struct xfs_mount *mp, xfs_btnum_t btnum, xfs_agblock_t bno, + TP_PROTO(struct xfs_btree_cur *cur, xfs_agblock_t bno, xfs_extlen_t len, xfs_extlen_t diff, bool new), - TP_ARGS(mp, btnum, bno, len, diff, new), + TP_ARGS(cur, bno, len, diff, new), TP_STRUCT__entry( __field(dev_t, dev) - __field(xfs_btnum_t, btnum) + __string(name, cur->bc_ops->name) __field(xfs_agblock_t, bno) __field(xfs_extlen_t, len) __field(xfs_extlen_t, diff) __field(bool, new) ), TP_fast_assign( - __entry->dev = mp->m_super->s_dev; - __entry->btnum = btnum; + __entry->dev = cur->bc_mp->m_super->s_dev; + __assign_str(name, cur->bc_ops->name); __entry->bno = bno; __entry->len = len; __entry->diff = diff; __entry->new = new; ), - TP_printk("dev %d:%d btree %s agbno 0x%x fsbcount 0x%x diff 0x%x new %d", + TP_printk("dev %d:%d %sbt agbno 0x%x fsbcount 0x%x diff 0x%x new %d", MAJOR(__entry->dev), MINOR(__entry->dev), - __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS), + __get_str(name), __entry->bno, __entry->len, __entry->diff, __entry->new) ) @@ -2464,7 +2464,7 @@ DECLARE_EVENT_CLASS(xfs_btree_cur_class, TP_ARGS(cur, level, bp), TP_STRUCT__entry( __field(dev_t, dev) - __field(xfs_btnum_t, btnum) + __string(name, cur->bc_ops->name) __field(int, level) __field(int, nlevels) __field(int, ptr) @@ -2472,15 +2472,15 @@ DECLARE_EVENT_CLASS(xfs_btree_cur_class, ), TP_fast_assign( __entry->dev = cur->bc_mp->m_super->s_dev; - __entry->btnum = cur->bc_btnum; + __assign_str(name, cur->bc_ops->name); __entry->level = level; __entry->nlevels = cur->bc_nlevels; __entry->ptr = cur->bc_levels[level].ptr; __entry->daddr = bp ? 
xfs_buf_daddr(bp) : -1; ), - TP_printk("dev %d:%d btree %s level %d/%d ptr %d daddr 0x%llx", + TP_printk("dev %d:%d %sbt level %d/%d ptr %d daddr 0x%llx", MAJOR(__entry->dev), MINOR(__entry->dev), - __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS), + __get_str(name), __entry->level, __entry->nlevels, __entry->ptr, @@ -2502,7 +2502,7 @@ TRACE_EVENT(xfs_btree_alloc_block, __field(dev_t, dev) __field(xfs_agnumber_t, agno) __field(xfs_ino_t, ino) - __field(xfs_btnum_t, btnum) + __string(name, cur->bc_ops->name) __field(int, error) __field(xfs_agblock_t, agbno) ), @@ -2515,7 +2515,7 @@ TRACE_EVENT(xfs_btree_alloc_block, __entry->agno = cur->bc_ag.pag->pag_agno; __entry->ino = 0; } - __entry->btnum = cur->bc_btnum; + __assign_str(name, cur->bc_ops->name); __entry->error = error; if (!error && stat) { if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) { @@ -2532,9 +2532,9 @@ TRACE_EVENT(xfs_btree_alloc_block, __entry->agbno = NULLAGBLOCK; } ), - TP_printk("dev %d:%d btree %s agno 0x%x ino 0x%llx agbno 0x%x error %d", + TP_printk("dev %d:%d %sbt agno 0x%x ino 0x%llx agbno 0x%x error %d", MAJOR(__entry->dev), MINOR(__entry->dev), - __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS), + __get_str(name), __entry->agno, __entry->ino, __entry->agbno, @@ -2548,7 +2548,7 @@ TRACE_EVENT(xfs_btree_free_block, __field(dev_t, dev) __field(xfs_agnumber_t, agno) __field(xfs_ino_t, ino) - __field(xfs_btnum_t, btnum) + __string(name, cur->bc_ops->name) __field(xfs_agblock_t, agbno) ), TP_fast_assign( @@ -2559,13 +2559,13 @@ TRACE_EVENT(xfs_btree_free_block, __entry->ino = cur->bc_ino.ip->i_ino; else __entry->ino = 0; - __entry->btnum = cur->bc_btnum; + __assign_str(name, cur->bc_ops->name); __entry->agbno = xfs_daddr_to_agbno(cur->bc_mp, xfs_buf_daddr(bp)); ), - TP_printk("dev %d:%d btree %s agno 0x%x ino 0x%llx agbno 0x%x", + TP_printk("dev %d:%d %sbt agno 0x%x ino 0x%llx agbno 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), - __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS), + __get_str(name), __entry->agno, __entry->ino, __entry->agbno) @@ -4142,7 +4142,7 @@ TRACE_EVENT(xfs_btree_commit_afakeroot, TP_ARGS(cur), TP_STRUCT__entry( __field(dev_t, dev) - __field(xfs_btnum_t, btnum) + __string(name, cur->bc_ops->name) __field(xfs_agnumber_t, agno) __field(xfs_agblock_t, agbno) __field(unsigned int, levels) @@ -4150,15 +4150,15 @@ TRACE_EVENT(xfs_btree_commit_afakeroot, ), TP_fast_assign( __entry->dev = cur->bc_mp->m_super->s_dev; - __entry->btnum = cur->bc_btnum; + __assign_str(name, cur->bc_ops->name); __entry->agno = cur->bc_ag.pag->pag_agno; __entry->agbno = cur->bc_ag.afake->af_root; __entry->levels = cur->bc_ag.afake->af_levels; __entry->blocks = cur->bc_ag.afake->af_blocks; ), - TP_printk("dev %d:%d btree %s agno 0x%x levels %u blocks %u root %u", + TP_printk("dev %d:%d %sbt agno 0x%x levels %u blocks %u root %u", MAJOR(__entry->dev), MINOR(__entry->dev), - __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS), + __get_str(name), __entry->agno, __entry->levels, __entry->blocks, @@ -4170,7 +4170,7 @@ TRACE_EVENT(xfs_btree_commit_ifakeroot, TP_ARGS(cur), TP_STRUCT__entry( __field(dev_t, dev) - __field(xfs_btnum_t, btnum) + __string(name, cur->bc_ops->name) __field(xfs_agnumber_t, agno) __field(xfs_agino_t, agino) __field(unsigned int, levels) @@ -4179,7 +4179,7 @@ TRACE_EVENT(xfs_btree_commit_ifakeroot, ), TP_fast_assign( __entry->dev = cur->bc_mp->m_super->s_dev; - __entry->btnum = cur->bc_btnum; + __assign_str(name, cur->bc_ops->name); __entry->agno = XFS_INO_TO_AGNO(cur->bc_mp, cur->bc_ino.ip->i_ino); 
__entry->agino = XFS_INO_TO_AGINO(cur->bc_mp, @@ -4188,9 +4188,9 @@ TRACE_EVENT(xfs_btree_commit_ifakeroot, __entry->blocks = cur->bc_ino.ifake->if_blocks; __entry->whichfork = cur->bc_ino.whichfork; ), - TP_printk("dev %d:%d btree %s agno 0x%x agino 0x%x whichfork %s levels %u blocks %u", + TP_printk("dev %d:%d %sbt agno 0x%x agino 0x%x whichfork %s levels %u blocks %u", MAJOR(__entry->dev), MINOR(__entry->dev), - __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS), + __get_str(name), __entry->agno, __entry->agino, __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS), @@ -4207,7 +4207,7 @@ TRACE_EVENT(xfs_btree_bload_level_geometry, blocks_with_extra), TP_STRUCT__entry( __field(dev_t, dev) - __field(xfs_btnum_t, btnum) + __string(name, cur->bc_ops->name) __field(unsigned int, level) __field(unsigned int, nlevels) __field(uint64_t, nr_this_level) @@ -4218,7 +4218,7 @@ TRACE_EVENT(xfs_btree_bload_level_geometry, ), TP_fast_assign( __entry->dev = cur->bc_mp->m_super->s_dev; - __entry->btnum = cur->bc_btnum; + __assign_str(name, cur->bc_ops->name); __entry->level = level; __entry->nlevels = cur->bc_nlevels; __entry->nr_this_level = nr_this_level; @@ -4227,9 +4227,9 @@ TRACE_EVENT(xfs_btree_bload_level_geometry, __entry->blocks = blocks; __entry->blocks_with_extra = blocks_with_extra; ), - TP_printk("dev %d:%d btree %s level %u/%u nr_this_level %llu nr_per_block %u desired_npb %u blocks %llu blocks_with_extra %llu", + TP_printk("dev %d:%d %sbt level %u/%u nr_this_level %llu nr_per_block %u desired_npb %u blocks %llu blocks_with_extra %llu", MAJOR(__entry->dev), MINOR(__entry->dev), - __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS), + __get_str(name), __entry->level, __entry->nlevels, __entry->nr_this_level, @@ -4246,7 +4246,7 @@ TRACE_EVENT(xfs_btree_bload_block, TP_ARGS(cur, level, block_idx, nr_blocks, ptr, nr_records), TP_STRUCT__entry( __field(dev_t, dev) - __field(xfs_btnum_t, btnum) + __string(name, cur->bc_ops->name) __field(unsigned int, level) __field(unsigned long long, block_idx) __field(unsigned long long, nr_blocks) @@ -4256,7 +4256,7 @@ TRACE_EVENT(xfs_btree_bload_block, ), TP_fast_assign( __entry->dev = cur->bc_mp->m_super->s_dev; - __entry->btnum = cur->bc_btnum; + __assign_str(name, cur->bc_ops->name); __entry->level = level; __entry->block_idx = block_idx; __entry->nr_blocks = nr_blocks; @@ -4271,9 +4271,9 @@ TRACE_EVENT(xfs_btree_bload_block, } __entry->nr_records = nr_records; ), - TP_printk("dev %d:%d btree %s level %u block %llu/%llu agno 0x%x agbno 0x%x recs %u", + TP_printk("dev %d:%d %sbt level %u block %llu/%llu agno 0x%x agbno 0x%x recs %u", MAJOR(__entry->dev), MINOR(__entry->dev), - __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS), + __get_str(name), __entry->level, __entry->block_idx, __entry->nr_blocks, -- cgit v1.2.3 From 7f47734ad61af77a001b1e24691dcbfcb008c938 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 22 Feb 2024 12:39:47 -0800 Subject: xfs: add a sick_mask to struct xfs_btree_ops Clean up xfs_btree_mark_sick by adding a sick_mask to the btree-ops for all AG-root btrees. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. 
Wong --- fs/xfs/libxfs/xfs_alloc_btree.c | 3 +++ fs/xfs/libxfs/xfs_btree.h | 3 +++ fs/xfs/libxfs/xfs_ialloc_btree.c | 3 +++ fs/xfs/libxfs/xfs_refcount_btree.c | 2 ++ fs/xfs/libxfs/xfs_rmap_btree.c | 2 ++ fs/xfs/xfs_health.c | 36 +++++++++++------------------------- 6 files changed, 24 insertions(+), 25 deletions(-) diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index 262f5dc3a483..e0b0cdd8f344 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -16,6 +16,7 @@ #include "xfs_alloc.h" #include "xfs_extent_busy.h" #include "xfs_error.h" +#include "xfs_health.h" #include "xfs_trace.h" #include "xfs_trans.h" #include "xfs_ag.h" @@ -477,6 +478,7 @@ const struct xfs_btree_ops xfs_bnobt_ops = { .lru_refs = XFS_ALLOC_BTREE_REF, .statoff = XFS_STATS_CALC_INDEX(xs_abtb_2), + .sick_mask = XFS_SICK_AG_BNOBT, .dup_cursor = xfs_allocbt_dup_cursor, .set_root = xfs_allocbt_set_root, @@ -508,6 +510,7 @@ const struct xfs_btree_ops xfs_cntbt_ops = { .lru_refs = XFS_ALLOC_BTREE_REF, .statoff = XFS_STATS_CALC_INDEX(xs_abtc_2), + .sick_mask = XFS_SICK_AG_CNTBT, .dup_cursor = xfs_allocbt_dup_cursor, .set_root = xfs_allocbt_set_root, diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 6bc6096205b3..6e5fd0c06453 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -142,6 +142,9 @@ struct xfs_btree_ops { /* offset of btree stats array */ unsigned int statoff; + /* sick mask for health reporting (only for XFS_BTREE_TYPE_AG) */ + unsigned int sick_mask; + /* cursor operations */ struct xfs_btree_cur *(*dup_cursor)(struct xfs_btree_cur *); void (*update_cursor)(struct xfs_btree_cur *src, diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index ddb9a226914a..1fe9d83c575e 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -17,6 +17,7 @@ #include "xfs_ialloc_btree.h" #include "xfs_alloc.h" #include "xfs_error.h" +#include "xfs_health.h" #include "xfs_trace.h" #include "xfs_trans.h" #include "xfs_rmap.h" @@ -408,6 +409,7 @@ const struct xfs_btree_ops xfs_inobt_ops = { .lru_refs = XFS_INO_BTREE_REF, .statoff = XFS_STATS_CALC_INDEX(xs_ibt_2), + .sick_mask = XFS_SICK_AG_INOBT, .dup_cursor = xfs_inobt_dup_cursor, .set_root = xfs_inobt_set_root, @@ -437,6 +439,7 @@ const struct xfs_btree_ops xfs_finobt_ops = { .lru_refs = XFS_INO_BTREE_REF, .statoff = XFS_STATS_CALC_INDEX(xs_fibt_2), + .sick_mask = XFS_SICK_AG_FINOBT, .dup_cursor = xfs_inobt_dup_cursor, .set_root = xfs_finobt_set_root, diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c index 16677cbbddfc..6388a0c9b691 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.c +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -16,6 +16,7 @@ #include "xfs_refcount.h" #include "xfs_alloc.h" #include "xfs_error.h" +#include "xfs_health.h" #include "xfs_trace.h" #include "xfs_trans.h" #include "xfs_bit.h" @@ -327,6 +328,7 @@ const struct xfs_btree_ops xfs_refcountbt_ops = { .lru_refs = XFS_REFC_BTREE_REF, .statoff = XFS_STATS_CALC_INDEX(xs_refcbt_2), + .sick_mask = XFS_SICK_AG_REFCNTBT, .dup_cursor = xfs_refcountbt_dup_cursor, .set_root = xfs_refcountbt_set_root, diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index e1ddf814492c..abaf5e190e99 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -16,6 +16,7 @@ #include "xfs_btree_staging.h" #include "xfs_rmap.h" #include "xfs_rmap_btree.h" +#include "xfs_health.h" #include "xfs_trace.h" 
#include "xfs_error.h" #include "xfs_extent_busy.h" @@ -483,6 +484,7 @@ const struct xfs_btree_ops xfs_rmapbt_ops = { .lru_refs = XFS_RMAP_BTREE_REF, .statoff = XFS_STATS_CALC_INDEX(xs_rmap_2), + .sick_mask = XFS_SICK_AG_RMAPBT, .dup_cursor = xfs_rmapbt_dup_cursor, .set_root = xfs_rmapbt_set_root, diff --git a/fs/xfs/xfs_health.c b/fs/xfs/xfs_health.c index 855221c0dd34..9921b5d3f158 100644 --- a/fs/xfs/xfs_health.c +++ b/fs/xfs/xfs_health.c @@ -526,36 +526,22 @@ void xfs_btree_mark_sick( struct xfs_btree_cur *cur) { - unsigned int mask; - - switch (cur->bc_btnum) { - case XFS_BTNUM_BMAP: - xfs_bmap_mark_sick(cur->bc_ino.ip, cur->bc_ino.whichfork); + switch (cur->bc_ops->type) { + case XFS_BTREE_TYPE_AG: + ASSERT(cur->bc_ops->sick_mask); + xfs_ag_mark_sick(cur->bc_ag.pag, cur->bc_ops->sick_mask); return; - case XFS_BTNUM_BNO: - mask = XFS_SICK_AG_BNOBT; - break; - case XFS_BTNUM_CNT: - mask = XFS_SICK_AG_CNTBT; - break; - case XFS_BTNUM_INO: - mask = XFS_SICK_AG_INOBT; - break; - case XFS_BTNUM_FINO: - mask = XFS_SICK_AG_FINOBT; - break; - case XFS_BTNUM_RMAP: - mask = XFS_SICK_AG_RMAPBT; - break; - case XFS_BTNUM_REFC: - mask = XFS_SICK_AG_REFCNTBT; - break; + case XFS_BTREE_TYPE_INODE: + if (cur->bc_btnum == XFS_BTNUM_BMAP) { + xfs_bmap_mark_sick(cur->bc_ino.ip, + cur->bc_ino.whichfork); + return; + } + fallthrough; default: ASSERT(0); return; } - - xfs_ag_mark_sick(cur->bc_ag.pag, mask); } /* -- cgit v1.2.3 From 48039926197522f32b548731a3b94331f0551bdc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 22 Feb 2024 12:39:48 -0800 Subject: xfs: refactor the btree cursor allocation logic in xchk_ag_btcur_init Change xchk_ag_btcur_init to allocate all cursors first and only then check if we should delete them again because the btree is to damaged. This allows reusing the sick_mask in struct xfs_btree_ops and simplifies the code. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/scrub/common.c | 62 +++++++++++++++++++++++++++------------------------ fs/xfs/scrub/health.c | 54 +++++++++----------------------------------- fs/xfs/scrub/health.h | 4 ++-- 3 files changed, 45 insertions(+), 75 deletions(-) diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index 699092195f41..689d40578bd5 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -588,46 +588,50 @@ xchk_ag_btcur_init( { struct xfs_mount *mp = sc->mp; - if (sa->agf_bp && - xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_BNO)) { + if (sa->agf_bp) { /* Set up a bnobt cursor for cross-referencing. */ sa->bno_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp, sa->pag, XFS_BTNUM_BNO); - } + xchk_ag_btree_del_cursor_if_sick(sc, &sa->bno_cur, + XFS_SCRUB_TYPE_BNOBT); - if (sa->agf_bp && - xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_CNT)) { /* Set up a cntbt cursor for cross-referencing. */ sa->cnt_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp, sa->pag, XFS_BTNUM_CNT); + xchk_ag_btree_del_cursor_if_sick(sc, &sa->cnt_cur, + XFS_SCRUB_TYPE_CNTBT); + + /* Set up a rmapbt cursor for cross-referencing. */ + if (xfs_has_rmapbt(mp)) { + sa->rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp, + sa->agf_bp, sa->pag); + xchk_ag_btree_del_cursor_if_sick(sc, &sa->rmap_cur, + XFS_SCRUB_TYPE_RMAPBT); + } + + /* Set up a refcountbt cursor for cross-referencing. 
*/ + if (xfs_has_reflink(mp)) { + sa->refc_cur = xfs_refcountbt_init_cursor(mp, sc->tp, + sa->agf_bp, sa->pag); + xchk_ag_btree_del_cursor_if_sick(sc, &sa->refc_cur, + XFS_SCRUB_TYPE_REFCNTBT); + } } - /* Set up a inobt cursor for cross-referencing. */ - if (sa->agi_bp && - xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_INO)) { + if (sa->agi_bp) { + /* Set up a inobt cursor for cross-referencing. */ sa->ino_cur = xfs_inobt_init_cursor(sa->pag, sc->tp, sa->agi_bp, XFS_BTNUM_INO); - } - - /* Set up a finobt cursor for cross-referencing. */ - if (sa->agi_bp && xfs_has_finobt(mp) && - xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_FINO)) { - sa->fino_cur = xfs_inobt_init_cursor(sa->pag, sc->tp, sa->agi_bp, - XFS_BTNUM_FINO); - } - - /* Set up a rmapbt cursor for cross-referencing. */ - if (sa->agf_bp && xfs_has_rmapbt(mp) && - xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_RMAP)) { - sa->rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp, sa->agf_bp, - sa->pag); - } - - /* Set up a refcountbt cursor for cross-referencing. */ - if (sa->agf_bp && xfs_has_reflink(mp) && - xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_REFC)) { - sa->refc_cur = xfs_refcountbt_init_cursor(mp, sc->tp, - sa->agf_bp, sa->pag); + xchk_ag_btree_del_cursor_if_sick(sc, &sa->ino_cur, + XFS_SCRUB_TYPE_INOBT); + + /* Set up a finobt cursor for cross-referencing. */ + if (xfs_has_finobt(mp)) { + sa->fino_cur = xfs_inobt_init_cursor(sa->pag, sc->tp, + sa->agi_bp, XFS_BTNUM_FINO); + xchk_ag_btree_del_cursor_if_sick(sc, &sa->fino_cur, + XFS_SCRUB_TYPE_FINOBT); + } } } diff --git a/fs/xfs/scrub/health.c b/fs/xfs/scrub/health.c index 7878da941c12..9020a6bef7f1 100644 --- a/fs/xfs/scrub/health.c +++ b/fs/xfs/scrub/health.c @@ -248,13 +248,13 @@ xchk_update_health( } /* Is the given per-AG btree healthy enough for scanning? */ -bool -xchk_ag_btree_healthy_enough( +void +xchk_ag_btree_del_cursor_if_sick( struct xfs_scrub *sc, - struct xfs_perag *pag, - xfs_btnum_t btnum) + struct xfs_btree_cur **curp, + unsigned int sm_type) { - unsigned int mask = 0; + unsigned int mask = (*curp)->bc_ops->sick_mask; /* * We always want the cursor if it's the same type as whatever we're @@ -263,41 +263,8 @@ xchk_ag_btree_healthy_enough( * Otherwise, we're only interested in the btree for cross-referencing. * If we know the btree is bad then don't bother, just set XFAIL. 
*/ - switch (btnum) { - case XFS_BTNUM_BNO: - if (sc->sm->sm_type == XFS_SCRUB_TYPE_BNOBT) - return true; - mask = XFS_SICK_AG_BNOBT; - break; - case XFS_BTNUM_CNT: - if (sc->sm->sm_type == XFS_SCRUB_TYPE_CNTBT) - return true; - mask = XFS_SICK_AG_CNTBT; - break; - case XFS_BTNUM_INO: - if (sc->sm->sm_type == XFS_SCRUB_TYPE_INOBT) - return true; - mask = XFS_SICK_AG_INOBT; - break; - case XFS_BTNUM_FINO: - if (sc->sm->sm_type == XFS_SCRUB_TYPE_FINOBT) - return true; - mask = XFS_SICK_AG_FINOBT; - break; - case XFS_BTNUM_RMAP: - if (sc->sm->sm_type == XFS_SCRUB_TYPE_RMAPBT) - return true; - mask = XFS_SICK_AG_RMAPBT; - break; - case XFS_BTNUM_REFC: - if (sc->sm->sm_type == XFS_SCRUB_TYPE_REFCNTBT) - return true; - mask = XFS_SICK_AG_REFCNTBT; - break; - default: - ASSERT(0); - return true; - } + if (sc->sm->sm_type == sm_type) + return; /* * If we just repaired some AG metadata, sc->sick_mask will reflect all @@ -309,12 +276,11 @@ xchk_ag_btree_healthy_enough( type_to_health_flag[sc->sm->sm_type].group == XHG_AG) mask &= ~sc->sick_mask; - if (xfs_ag_has_sickness(pag, mask)) { + if (xfs_ag_has_sickness((*curp)->bc_ag.pag, mask)) { sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XFAIL; - return false; + xfs_btree_del_cursor(*curp, XFS_BTREE_NOERROR); + *curp = NULL; } - - return true; } /* diff --git a/fs/xfs/scrub/health.h b/fs/xfs/scrub/health.h index 06d17941776c..63fc426eb5ae 100644 --- a/fs/xfs/scrub/health.h +++ b/fs/xfs/scrub/health.h @@ -8,8 +8,8 @@ unsigned int xchk_health_mask_for_scrub_type(__u32 scrub_type); void xchk_update_health(struct xfs_scrub *sc); -bool xchk_ag_btree_healthy_enough(struct xfs_scrub *sc, struct xfs_perag *pag, - xfs_btnum_t btnum); +void xchk_ag_btree_del_cursor_if_sick(struct xfs_scrub *sc, + struct xfs_btree_cur **curp, unsigned int sm_type); void xchk_mark_healthy_if_clean(struct xfs_scrub *sc, unsigned int mask); bool xchk_file_looks_zapped(struct xfs_scrub *sc, unsigned int mask); int xchk_health_record(struct xfs_scrub *sc); -- cgit v1.2.3 From 1c8b9fd278c08e16c27a41be484b77383738de1f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 22 Feb 2024 12:40:12 -0800 Subject: xfs: split xfs_allocbt_init_cursor Split xfs_allocbt_init_cursor into separate routines for the by-bno and by-cnt btrees to prepare for the removal of the xfs_btnum global enumeration of btree types. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_alloc.c | 36 ++++++++++++------------ fs/xfs/libxfs/xfs_alloc_btree.c | 62 +++++++++++++++++++++++++++++------------ fs/xfs/libxfs/xfs_alloc_btree.h | 7 +++-- fs/xfs/scrub/agheader_repair.c | 12 +++----- fs/xfs/scrub/alloc_repair.c | 6 ++-- fs/xfs/scrub/common.c | 8 +++--- fs/xfs/scrub/repair.c | 8 +++--- fs/xfs/scrub/rmap.c | 8 +++--- fs/xfs/xfs_discard.c | 2 +- fs/xfs/xfs_fsmap.c | 4 +-- 10 files changed, 88 insertions(+), 65 deletions(-) diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 44d4f0da90ba..2b74aded4a2c 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -862,8 +862,8 @@ xfs_alloc_cur_setup( * attempt a small allocation. */ if (!acur->cnt) - acur->cnt = xfs_allocbt_init_cursor(args->mp, args->tp, - args->agbp, args->pag, XFS_BTNUM_CNT); + acur->cnt = xfs_cntbt_init_cursor(args->mp, args->tp, + args->agbp, args->pag); error = xfs_alloc_lookup_ge(acur->cnt, 0, args->maxlen, &i); if (error) return error; @@ -872,11 +872,11 @@ xfs_alloc_cur_setup( * Allocate the bnobt left and right search cursors. 
*/ if (!acur->bnolt) - acur->bnolt = xfs_allocbt_init_cursor(args->mp, args->tp, - args->agbp, args->pag, XFS_BTNUM_BNO); + acur->bnolt = xfs_bnobt_init_cursor(args->mp, args->tp, + args->agbp, args->pag); if (!acur->bnogt) - acur->bnogt = xfs_allocbt_init_cursor(args->mp, args->tp, - args->agbp, args->pag, XFS_BTNUM_BNO); + acur->bnogt = xfs_bnobt_init_cursor(args->mp, args->tp, + args->agbp, args->pag); return i == 1 ? 0 : -ENOSPC; } @@ -1234,8 +1234,8 @@ xfs_alloc_ag_vextent_exact( /* * Allocate/initialize a cursor for the by-number freespace btree. */ - bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, - args->pag, XFS_BTNUM_BNO); + bno_cur = xfs_bnobt_init_cursor(args->mp, args->tp, args->agbp, + args->pag); /* * Lookup bno and minlen in the btree (minlen is irrelevant, really). @@ -1295,8 +1295,8 @@ xfs_alloc_ag_vextent_exact( * We are allocating agbno for args->len * Allocate/initialize a cursor for the by-size btree. */ - cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, - args->pag, XFS_BTNUM_CNT); + cnt_cur = xfs_cntbt_init_cursor(args->mp, args->tp, args->agbp, + args->pag); ASSERT(args->agbno + args->len <= be32_to_cpu(agf->agf_length)); error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno, args->len, XFSA_FIXUP_BNO_OK); @@ -1710,8 +1710,8 @@ restart: /* * Allocate and initialize a cursor for the by-size btree. */ - cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, - args->pag, XFS_BTNUM_CNT); + cnt_cur = xfs_cntbt_init_cursor(args->mp, args->tp, args->agbp, + args->pag); bno_cur = NULL; /* @@ -1896,8 +1896,8 @@ restart: /* * Allocate and initialize a cursor for the by-block tree. */ - bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, - args->pag, XFS_BTNUM_BNO); + bno_cur = xfs_bnobt_init_cursor(args->mp, args->tp, args->agbp, + args->pag); if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, rbno, rlen, XFSA_FIXUP_CNT_OK))) goto error0; @@ -1971,7 +1971,7 @@ xfs_free_ag_extent( /* * Allocate and initialize a cursor for the by-block btree. */ - bno_cur = xfs_allocbt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_BNO); + bno_cur = xfs_bnobt_init_cursor(mp, tp, agbp, pag); /* * Look for a neighboring block on the left (lower block numbers) * that is contiguous with this space. @@ -2045,7 +2045,7 @@ xfs_free_ag_extent( /* * Now allocate and initialize a cursor for the by-size tree. */ - cnt_cur = xfs_allocbt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_CNT); + cnt_cur = xfs_cntbt_init_cursor(mp, tp, agbp, pag); /* * Have both left and right contiguous neighbors. * Merge all three into a single free block. 
@@ -2754,8 +2754,8 @@ xfs_exact_minlen_extent_available( xfs_extlen_t flen; int error = 0; - cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, agbp, - args->pag, XFS_BTNUM_CNT); + cnt_cur = xfs_cntbt_init_cursor(args->mp, args->tp, agbp, + args->pag); error = xfs_alloc_lookup_ge(cnt_cur, 0, args->minlen, stat); if (error) goto out; diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index e0b0cdd8f344..722863464289 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -24,13 +24,22 @@ static struct kmem_cache *xfs_allocbt_cur_cache; STATIC struct xfs_btree_cur * -xfs_allocbt_dup_cursor( +xfs_bnobt_dup_cursor( struct xfs_btree_cur *cur) { - return xfs_allocbt_init_cursor(cur->bc_mp, cur->bc_tp, - cur->bc_ag.agbp, cur->bc_ag.pag, cur->bc_btnum); + return xfs_bnobt_init_cursor(cur->bc_mp, cur->bc_tp, cur->bc_ag.agbp, + cur->bc_ag.pag); } +STATIC struct xfs_btree_cur * +xfs_cntbt_dup_cursor( + struct xfs_btree_cur *cur) +{ + return xfs_cntbt_init_cursor(cur->bc_mp, cur->bc_tp, cur->bc_ag.agbp, + cur->bc_ag.pag); +} + + STATIC void xfs_allocbt_set_root( struct xfs_btree_cur *cur, @@ -480,7 +489,7 @@ const struct xfs_btree_ops xfs_bnobt_ops = { .statoff = XFS_STATS_CALC_INDEX(xs_abtb_2), .sick_mask = XFS_SICK_AG_BNOBT, - .dup_cursor = xfs_allocbt_dup_cursor, + .dup_cursor = xfs_bnobt_dup_cursor, .set_root = xfs_allocbt_set_root, .alloc_block = xfs_allocbt_alloc_block, .free_block = xfs_allocbt_free_block, @@ -512,7 +521,7 @@ const struct xfs_btree_ops xfs_cntbt_ops = { .statoff = XFS_STATS_CALC_INDEX(xs_abtc_2), .sick_mask = XFS_SICK_AG_CNTBT, - .dup_cursor = xfs_allocbt_dup_cursor, + .dup_cursor = xfs_cntbt_dup_cursor, .set_root = xfs_allocbt_set_root, .alloc_block = xfs_allocbt_alloc_block, .free_block = xfs_allocbt_free_block, @@ -532,36 +541,53 @@ const struct xfs_btree_ops xfs_cntbt_ops = { }; /* - * Allocate a new allocation btree cursor. + * Allocate a new bnobt cursor. * * For staging cursors tp and agbp are NULL. */ struct xfs_btree_cur * -xfs_allocbt_init_cursor( +xfs_bnobt_init_cursor( struct xfs_mount *mp, struct xfs_trans *tp, struct xfs_buf *agbp, - struct xfs_perag *pag, - xfs_btnum_t btnum) + struct xfs_perag *pag) { - const struct xfs_btree_ops *ops = &xfs_bnobt_ops; struct xfs_btree_cur *cur; - ASSERT(btnum == XFS_BTNUM_BNO || btnum == XFS_BTNUM_CNT); + cur = xfs_btree_alloc_cursor(mp, tp, XFS_BTNUM_BNO, &xfs_bnobt_ops, + mp->m_alloc_maxlevels, xfs_allocbt_cur_cache); + cur->bc_ag.pag = xfs_perag_hold(pag); + cur->bc_ag.agbp = agbp; + if (agbp) { + struct xfs_agf *agf = agbp->b_addr; + + cur->bc_nlevels = be32_to_cpu(agf->agf_bno_level); + } + return cur; +} - if (btnum == XFS_BTNUM_CNT) - ops = &xfs_cntbt_ops; +/* + * Allocate a new cntbt cursor. + * + * For staging cursors tp and agbp are NULL. + */ +struct xfs_btree_cur * +xfs_cntbt_init_cursor( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_buf *agbp, + struct xfs_perag *pag) +{ + struct xfs_btree_cur *cur; - cur = xfs_btree_alloc_cursor(mp, tp, btnum, ops, mp->m_alloc_maxlevels, - xfs_allocbt_cur_cache); + cur = xfs_btree_alloc_cursor(mp, tp, XFS_BTNUM_CNT, &xfs_cntbt_ops, + mp->m_alloc_maxlevels, xfs_allocbt_cur_cache); cur->bc_ag.pag = xfs_perag_hold(pag); cur->bc_ag.agbp = agbp; if (agbp) { struct xfs_agf *agf = agbp->b_addr; - cur->bc_nlevels = (btnum == XFS_BTNUM_BNO) ? 
- be32_to_cpu(agf->agf_bno_level) : - be32_to_cpu(agf->agf_cnt_level); + cur->bc_nlevels = be32_to_cpu(agf->agf_cnt_level); } return cur; } diff --git a/fs/xfs/libxfs/xfs_alloc_btree.h b/fs/xfs/libxfs/xfs_alloc_btree.h index 1c910862535f..155b47f231ab 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.h +++ b/fs/xfs/libxfs/xfs_alloc_btree.h @@ -47,9 +47,12 @@ struct xbtree_afakeroot; (maxrecs) * sizeof(xfs_alloc_key_t) + \ ((index) - 1) * sizeof(xfs_alloc_ptr_t))) -extern struct xfs_btree_cur *xfs_allocbt_init_cursor(struct xfs_mount *mp, +struct xfs_btree_cur *xfs_bnobt_init_cursor(struct xfs_mount *mp, struct xfs_trans *tp, struct xfs_buf *bp, - struct xfs_perag *pag, xfs_btnum_t btnum); + struct xfs_perag *pag); +struct xfs_btree_cur *xfs_cntbt_init_cursor(struct xfs_mount *mp, + struct xfs_trans *tp, struct xfs_buf *bp, + struct xfs_perag *pag); extern int xfs_allocbt_maxrecs(struct xfs_mount *, int, int); extern xfs_extlen_t xfs_allocbt_calc_size(struct xfs_mount *mp, unsigned long long len); diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c index e4dd4fe84c5f..e2374d05bdd1 100644 --- a/fs/xfs/scrub/agheader_repair.c +++ b/fs/xfs/scrub/agheader_repair.c @@ -255,8 +255,7 @@ xrep_agf_calc_from_btrees( int error; /* Update the AGF counters from the bnobt. */ - cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, - sc->sa.pag, XFS_BTNUM_BNO); + cur = xfs_bnobt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag); error = xfs_alloc_query_all(cur, xrep_agf_walk_allocbt, &raa); if (error) goto err; @@ -269,8 +268,7 @@ xrep_agf_calc_from_btrees( agf->agf_longest = cpu_to_be32(raa.longest); /* Update the AGF counters from the cntbt. */ - cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, - sc->sa.pag, XFS_BTNUM_CNT); + cur = xfs_cntbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag); error = xfs_btree_count_blocks(cur, &blocks); if (error) goto err; @@ -549,16 +547,14 @@ xrep_agfl_collect_blocks( goto out_bmp; /* Find all blocks currently being used by the bnobt. */ - cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, - sc->sa.pag, XFS_BTNUM_BNO); + cur = xfs_bnobt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag); error = xagb_bitmap_set_btblocks(&ra.agmetablocks, cur); xfs_btree_del_cursor(cur, error); if (error) goto out_bmp; /* Find all blocks currently being used by the cntbt. */ - cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, - sc->sa.pag, XFS_BTNUM_CNT); + cur = xfs_cntbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag); error = xagb_bitmap_set_btblocks(&ra.agmetablocks, cur); xfs_btree_del_cursor(cur, error); if (error) diff --git a/fs/xfs/scrub/alloc_repair.c b/fs/xfs/scrub/alloc_repair.c index 0ef27aacbf25..d421b253923e 100644 --- a/fs/xfs/scrub/alloc_repair.c +++ b/fs/xfs/scrub/alloc_repair.c @@ -735,12 +735,10 @@ xrep_abt_build_new_trees( ra->new_cntbt.bload.claim_block = xrep_abt_claim_block; /* Allocate cursors for the staged btrees. */ - bno_cur = xfs_allocbt_init_cursor(sc->mp, NULL, NULL, pag, - XFS_BTNUM_BNO); + bno_cur = xfs_bnobt_init_cursor(sc->mp, NULL, NULL, pag); xfs_btree_stage_afakeroot(bno_cur, &ra->new_bnobt.afake); - cnt_cur = xfs_allocbt_init_cursor(sc->mp, NULL, NULL, pag, - XFS_BTNUM_CNT); + cnt_cur = xfs_cntbt_init_cursor(sc->mp, NULL, NULL, pag); xfs_btree_stage_afakeroot(cnt_cur, &ra->new_cntbt.afake); /* Last chance to abort before we start committing fixes. 
*/ diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index 689d40578bd5..1233a5604c72 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -590,14 +590,14 @@ xchk_ag_btcur_init( if (sa->agf_bp) { /* Set up a bnobt cursor for cross-referencing. */ - sa->bno_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp, - sa->pag, XFS_BTNUM_BNO); + sa->bno_cur = xfs_bnobt_init_cursor(mp, sc->tp, sa->agf_bp, + sa->pag); xchk_ag_btree_del_cursor_if_sick(sc, &sa->bno_cur, XFS_SCRUB_TYPE_BNOBT); /* Set up a cntbt cursor for cross-referencing. */ - sa->cnt_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp, - sa->pag, XFS_BTNUM_CNT); + sa->cnt_cur = xfs_cntbt_init_cursor(mp, sc->tp, sa->agf_bp, + sa->pag); xchk_ag_btree_del_cursor_if_sick(sc, &sa->cnt_cur, XFS_SCRUB_TYPE_CNTBT); diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index ab510cea96d8..078d21598db5 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -832,10 +832,10 @@ xrep_ag_btcur_init( /* Set up a bnobt cursor for cross-referencing. */ if (sc->sm->sm_type != XFS_SCRUB_TYPE_BNOBT && sc->sm->sm_type != XFS_SCRUB_TYPE_CNTBT) { - sa->bno_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp, - sc->sa.pag, XFS_BTNUM_BNO); - sa->cnt_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp, - sc->sa.pag, XFS_BTNUM_CNT); + sa->bno_cur = xfs_bnobt_init_cursor(mp, sc->tp, sa->agf_bp, + sc->sa.pag); + sa->cnt_cur = xfs_cntbt_init_cursor(mp, sc->tp, sa->agf_bp, + sc->sa.pag); } /* Set up a inobt cursor for cross-referencing. */ diff --git a/fs/xfs/scrub/rmap.c b/fs/xfs/scrub/rmap.c index c99d1714f283..e0550e018584 100644 --- a/fs/xfs/scrub/rmap.c +++ b/fs/xfs/scrub/rmap.c @@ -412,8 +412,8 @@ xchk_rmapbt_walk_ag_metadata( /* OWN_AG: bnobt, cntbt, rmapbt, and AGFL */ cur = sc->sa.bno_cur; if (!cur) - cur = xfs_allocbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp, - sc->sa.pag, XFS_BTNUM_BNO); + cur = xfs_bnobt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp, + sc->sa.pag); error = xagb_bitmap_set_btblocks(&cr->ag_owned, cur); if (cur != sc->sa.bno_cur) xfs_btree_del_cursor(cur, error); @@ -422,8 +422,8 @@ xchk_rmapbt_walk_ag_metadata( cur = sc->sa.cnt_cur; if (!cur) - cur = xfs_allocbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp, - sc->sa.pag, XFS_BTNUM_CNT); + cur = xfs_cntbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp, + sc->sa.pag); error = xagb_bitmap_set_btblocks(&cr->ag_owned, cur); if (cur != sc->sa.cnt_cur) xfs_btree_del_cursor(cur, error); diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index f0bf1cc98548..268bb734dc0a 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -179,7 +179,7 @@ xfs_trim_gather_extents( if (error) goto out_trans_cancel; - cur = xfs_allocbt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_CNT); + cur = xfs_cntbt_init_cursor(mp, tp, agbp, pag); /* * Look up the extent length requested in the AGF and start with it. diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index 5a72217f5feb..de59eec74765 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -763,8 +763,8 @@ xfs_getfsmap_datadev_bnobt_query( return xfs_getfsmap_datadev_bnobt_helper(*curpp, &key[1], info); /* Allocate cursor for this AG and query_range it. 
*/ - *curpp = xfs_allocbt_init_cursor(tp->t_mountp, tp, info->agf_bp, - info->pag, XFS_BTNUM_BNO); + *curpp = xfs_bnobt_init_cursor(tp->t_mountp, tp, info->agf_bp, + info->pag); key->ar_startblock = info->low.rm_startblock; key[1].ar_startblock = info->high.rm_startblock; return xfs_alloc_query_range(*curpp, key, &key[1], -- cgit v1.2.3 From 3038fd8129384c64946c17198229ee61f6f2c8e1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 22 Feb 2024 12:40:46 -0800 Subject: xfs: remove xfs_inobt_cur This helper provides no real advantage over just open coding the two calls in it in the callers. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_ialloc_btree.c | 29 +++--------------------------- fs/xfs/libxfs/xfs_ialloc_btree.h | 3 --- fs/xfs/xfs_iwalk.c | 9 +++++---- 3 files changed, 8 insertions(+), 33 deletions(-) diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index 1fe9d83c575e..441c5a7be1e0 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -710,30 +710,6 @@ xfs_inobt_max_size( XFS_INODES_PER_CHUNK); } -/* Read AGI and create inobt cursor. */ -int -xfs_inobt_cur( - struct xfs_perag *pag, - struct xfs_trans *tp, - xfs_btnum_t which, - struct xfs_btree_cur **curpp, - struct xfs_buf **agi_bpp) -{ - struct xfs_btree_cur *cur; - int error; - - ASSERT(*agi_bpp == NULL); - ASSERT(*curpp == NULL); - - error = xfs_ialloc_read_agi(pag, tp, agi_bpp); - if (error) - return error; - - cur = xfs_inobt_init_cursor(pag, tp, *agi_bpp, which); - *curpp = cur; - return 0; -} - static int xfs_inobt_count_blocks( struct xfs_perag *pag, @@ -742,13 +718,14 @@ xfs_inobt_count_blocks( xfs_extlen_t *tree_blocks) { struct xfs_buf *agbp = NULL; - struct xfs_btree_cur *cur = NULL; + struct xfs_btree_cur *cur; int error; - error = xfs_inobt_cur(pag, tp, btnum, &cur, &agbp); + error = xfs_ialloc_read_agi(pag, tp, &agbp); if (error) return error; + cur = xfs_inobt_init_cursor(pag, tp, agbp, btnum); error = xfs_btree_count_blocks(cur, tree_blocks); xfs_btree_del_cursor(cur, error); xfs_trans_brelse(tp, agbp); diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h index 40f0fc0e8da3..2f1552d65655 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.h +++ b/fs/xfs/libxfs/xfs_ialloc_btree.h @@ -64,9 +64,6 @@ int xfs_finobt_calc_reserves(struct xfs_perag *perag, struct xfs_trans *tp, xfs_extlen_t *ask, xfs_extlen_t *used); extern xfs_extlen_t xfs_iallocbt_calc_size(struct xfs_mount *mp, unsigned long long len); -int xfs_inobt_cur(struct xfs_perag *pag, struct xfs_trans *tp, - xfs_btnum_t btnum, struct xfs_btree_cur **curpp, - struct xfs_buf **agi_bpp); void xfs_inobt_commit_staged_btree(struct xfs_btree_cur *cur, struct xfs_trans *tp, struct xfs_buf *agbp); diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c index dc3d83d7dee1..38092c049c08 100644 --- a/fs/xfs/xfs_iwalk.c +++ b/fs/xfs/xfs_iwalk.c @@ -266,9 +266,10 @@ xfs_iwalk_ag_start( /* Set up a fresh cursor and empty the inobt cache. */ iwag->nr_recs = 0; - error = xfs_inobt_cur(pag, tp, XFS_BTNUM_INO, curpp, agi_bpp); + error = xfs_ialloc_read_agi(pag, tp, agi_bpp); if (error) return error; + *curpp = xfs_inobt_init_cursor(pag, tp, *agi_bpp, XFS_BTNUM_INO); /* Starting at the beginning of the AG? That's easy! */ if (agino == 0) @@ -383,11 +384,11 @@ xfs_iwalk_run_callbacks( } /* ...and recreate the cursor just past where we left off. 
*/ - error = xfs_inobt_cur(iwag->pag, iwag->tp, XFS_BTNUM_INO, curpp, - agi_bpp); + error = xfs_ialloc_read_agi(iwag->pag, iwag->tp, agi_bpp); if (error) return error; - + *curpp = xfs_inobt_init_cursor(iwag->pag, iwag->tp, *agi_bpp, + XFS_BTNUM_INO); return xfs_inobt_lookup(*curpp, next_agino, XFS_LOOKUP_GE, has_more); } -- cgit v1.2.3 From 4bfb028a4c00d0a079a625d7867325efb3c37de2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 22 Feb 2024 12:40:47 -0800 Subject: xfs: remove the btnum argument to xfs_inobt_count_blocks xfs_inobt_count_blocks is only used for the finobt. Hardcode the btnum argument and rename the function to match that. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_ialloc_btree.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index 441c5a7be1e0..c920aee4a7da 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -711,10 +711,9 @@ xfs_inobt_max_size( } static int -xfs_inobt_count_blocks( +xfs_finobt_count_blocks( struct xfs_perag *pag, struct xfs_trans *tp, - xfs_btnum_t btnum, xfs_extlen_t *tree_blocks) { struct xfs_buf *agbp = NULL; @@ -725,7 +724,7 @@ xfs_inobt_count_blocks( if (error) return error; - cur = xfs_inobt_init_cursor(pag, tp, agbp, btnum); + cur = xfs_inobt_init_cursor(pag, tp, agbp, XFS_BTNUM_FINO); error = xfs_btree_count_blocks(cur, tree_blocks); xfs_btree_del_cursor(cur, error); xfs_trans_brelse(tp, agbp); @@ -773,8 +772,7 @@ xfs_finobt_calc_reserves( if (xfs_has_inobtcounts(pag->pag_mount)) error = xfs_finobt_read_blocks(pag, tp, &tree_len); else - error = xfs_inobt_count_blocks(pag, tp, XFS_BTNUM_FINO, - &tree_len); + error = xfs_finobt_count_blocks(pag, tp, &tree_len); if (error) return error; -- cgit v1.2.3 From c81a01a74a6769569650f4e42203ce3147b24f0a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 22 Feb 2024 12:40:48 -0800 Subject: xfs: remove the which variable in xchk_iallocbt The which variable that holds a btree number is passed to two functions that ignore it and used in a single check that can check the sm_type as well. Remove it to unclutter the code. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. 
Wong --- fs/xfs/scrub/ialloc.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c index a720fc62262a..26d589e9ba1c 100644 --- a/fs/xfs/scrub/ialloc.c +++ b/fs/xfs/scrub/ialloc.c @@ -649,8 +649,7 @@ out: */ STATIC void xchk_iallocbt_xref_rmap_btreeblks( - struct xfs_scrub *sc, - int which) + struct xfs_scrub *sc) { xfs_filblks_t blocks; xfs_extlen_t inobt_blocks = 0; @@ -688,7 +687,6 @@ xchk_iallocbt_xref_rmap_btreeblks( STATIC void xchk_iallocbt_xref_rmap_inodes( struct xfs_scrub *sc, - int which, unsigned long long inodes) { xfs_filblks_t blocks; @@ -719,17 +717,14 @@ xchk_iallocbt( .next_startino = NULLAGINO, .next_cluster_ino = NULLAGINO, }; - xfs_btnum_t which; int error; switch (sc->sm->sm_type) { case XFS_SCRUB_TYPE_INOBT: cur = sc->sa.ino_cur; - which = XFS_BTNUM_INO; break; case XFS_SCRUB_TYPE_FINOBT: cur = sc->sa.fino_cur; - which = XFS_BTNUM_FINO; break; default: ASSERT(0); @@ -741,7 +736,7 @@ xchk_iallocbt( if (error) return error; - xchk_iallocbt_xref_rmap_btreeblks(sc, which); + xchk_iallocbt_xref_rmap_btreeblks(sc); /* * If we're scrubbing the inode btree, inode_blocks is the number of @@ -750,9 +745,8 @@ xchk_iallocbt( * knows about. We can't do this for the finobt since it only points * to inode chunks with free inodes. */ - if (which == XFS_BTNUM_INO) - xchk_iallocbt_xref_rmap_inodes(sc, which, iabt.inodes); - + if (sc->sm->sm_type == XFS_SCRUB_TYPE_INOBT) + xchk_iallocbt_xref_rmap_inodes(sc, iabt.inodes); return error; } -- cgit v1.2.3 From 8541a7d9da2dd6e44f401f2363b21749b7413fc9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 22 Feb 2024 12:40:48 -0800 Subject: xfs: split xfs_inobt_insert_sprec Split the finobt version that never merges and uses a different cursor out of xfs_inobt_insert_sprec to prepare for removing xfs_btnum_t. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_ialloc.c | 148 +++++++++++++++++++++++++++++---------------- 1 file changed, 96 insertions(+), 52 deletions(-) diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 52b82cd7e6c9..17a53f635c9f 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -529,16 +529,14 @@ __xfs_inobt_rec_merge( } /* - * Insert a new sparse inode chunk into the associated inode btree. The inode - * record for the sparse chunk is pre-aligned to a startino that should match - * any pre-existing sparse inode record in the tree. This allows sparse chunks - * to fill over time. + * Insert a new sparse inode chunk into the associated inode allocation btree. + * The inode record for the sparse chunk is pre-aligned to a startino that + * should match any pre-existing sparse inode record in the tree. This allows + * sparse chunks to fill over time. * - * This function supports two modes of handling preexisting records depending on - * the merge flag. If merge is true, the provided record is merged with the + * If no preexisting record exists, the provided record is inserted. + * If there is a preexisting record, the provided record is merged with the * existing record and updated in place. The merged record is returned in nrec. - * If merge is false, an existing record is replaced with the provided record. - * If no preexisting record exists, the provided record is always inserted. * * It is considered corruption if a merge is requested and not possible. Given * the sparse inode alignment constraints, this should never happen. 
@@ -548,9 +546,7 @@ xfs_inobt_insert_sprec( struct xfs_perag *pag, struct xfs_trans *tp, struct xfs_buf *agbp, - int btnum, - struct xfs_inobt_rec_incore *nrec, /* in/out: new/merged rec. */ - bool merge) /* merge or replace */ + struct xfs_inobt_rec_incore *nrec) /* in/out: new/merged rec. */ { struct xfs_mount *mp = pag->pag_mount; struct xfs_btree_cur *cur; @@ -558,7 +554,7 @@ xfs_inobt_insert_sprec( int i; struct xfs_inobt_rec_incore rec; - cur = xfs_inobt_init_cursor(pag, tp, agbp, btnum); + cur = xfs_inobt_init_cursor(pag, tp, agbp, XFS_BTNUM_INO); /* the new record is pre-aligned so we know where to look */ error = xfs_inobt_lookup(cur, nrec->ir_startino, XFS_LOOKUP_EQ, &i); @@ -581,48 +577,45 @@ xfs_inobt_insert_sprec( } /* - * A record exists at this startino. Merge or replace the record - * depending on what we've been asked to do. + * A record exists at this startino. Merge the records. */ - if (merge) { - error = xfs_inobt_get_rec(cur, &rec, &i); - if (error) - goto error; - if (XFS_IS_CORRUPT(mp, i != 1)) { - xfs_btree_mark_sick(cur); - error = -EFSCORRUPTED; - goto error; - } - if (XFS_IS_CORRUPT(mp, rec.ir_startino != nrec->ir_startino)) { - xfs_btree_mark_sick(cur); - error = -EFSCORRUPTED; - goto error; - } + error = xfs_inobt_get_rec(cur, &rec, &i); + if (error) + goto error; + if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); + error = -EFSCORRUPTED; + goto error; + } + if (XFS_IS_CORRUPT(mp, rec.ir_startino != nrec->ir_startino)) { + xfs_btree_mark_sick(cur); + error = -EFSCORRUPTED; + goto error; + } - /* - * This should never fail. If we have coexisting records that - * cannot merge, something is seriously wrong. - */ - if (XFS_IS_CORRUPT(mp, !__xfs_inobt_can_merge(nrec, &rec))) { - xfs_btree_mark_sick(cur); - error = -EFSCORRUPTED; - goto error; - } + /* + * This should never fail. If we have coexisting records that + * cannot merge, something is seriously wrong. + */ + if (XFS_IS_CORRUPT(mp, !__xfs_inobt_can_merge(nrec, &rec))) { + xfs_btree_mark_sick(cur); + error = -EFSCORRUPTED; + goto error; + } - trace_xfs_irec_merge_pre(mp, pag->pag_agno, rec.ir_startino, - rec.ir_holemask, nrec->ir_startino, - nrec->ir_holemask); + trace_xfs_irec_merge_pre(mp, pag->pag_agno, rec.ir_startino, + rec.ir_holemask, nrec->ir_startino, + nrec->ir_holemask); - /* merge to nrec to output the updated record */ - __xfs_inobt_rec_merge(nrec, &rec); + /* merge to nrec to output the updated record */ + __xfs_inobt_rec_merge(nrec, &rec); - trace_xfs_irec_merge_post(mp, pag->pag_agno, nrec->ir_startino, - nrec->ir_holemask); + trace_xfs_irec_merge_post(mp, pag->pag_agno, nrec->ir_startino, + nrec->ir_holemask); - error = xfs_inobt_rec_check_count(mp, nrec); - if (error) - goto error; - } + error = xfs_inobt_rec_check_count(mp, nrec); + if (error) + goto error; error = xfs_inobt_update(cur, nrec); if (error) @@ -636,6 +629,59 @@ error: return error; } +/* + * Insert a new sparse inode chunk into the free inode btree. The inode + * record for the sparse chunk is pre-aligned to a startino that should match + * any pre-existing sparse inode record in the tree. This allows sparse chunks + * to fill over time. + * + * The new record is always inserted, overwriting a pre-existing record if + * there is one. + */ +STATIC int +xfs_finobt_insert_sprec( + struct xfs_perag *pag, + struct xfs_trans *tp, + struct xfs_buf *agbp, + struct xfs_inobt_rec_incore *nrec) /* in/out: new rec. 
*/ +{ + struct xfs_mount *mp = pag->pag_mount; + struct xfs_btree_cur *cur; + int error; + int i; + + cur = xfs_inobt_init_cursor(pag, tp, agbp, XFS_BTNUM_FINO); + + /* the new record is pre-aligned so we know where to look */ + error = xfs_inobt_lookup(cur, nrec->ir_startino, XFS_LOOKUP_EQ, &i); + if (error) + goto error; + /* if nothing there, insert a new record and return */ + if (i == 0) { + error = xfs_inobt_insert_rec(cur, nrec->ir_holemask, + nrec->ir_count, nrec->ir_freecount, + nrec->ir_free, &i); + if (error) + goto error; + if (XFS_IS_CORRUPT(mp, i != 1)) { + xfs_btree_mark_sick(cur); + error = -EFSCORRUPTED; + goto error; + } + } else { + error = xfs_inobt_update(cur, nrec); + if (error) + goto error; + } + + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + return 0; +error: + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + return error; +} + + /* * Allocate new inodes in the allocation group specified by agbp. Returns 0 if * inodes were allocated in this AG; -EAGAIN if there was no space in this AG so @@ -862,8 +908,7 @@ sparse_alloc: * if necessary. If a merge does occur, rec is updated to the * merged record. */ - error = xfs_inobt_insert_sprec(pag, tp, agbp, - XFS_BTNUM_INO, &rec, true); + error = xfs_inobt_insert_sprec(pag, tp, agbp, &rec); if (error == -EFSCORRUPTED) { xfs_alert(args.mp, "invalid sparse inode record: ino 0x%llx holemask 0x%x count %u", @@ -887,8 +932,7 @@ sparse_alloc: * existing record with this one. */ if (xfs_has_finobt(args.mp)) { - error = xfs_inobt_insert_sprec(pag, tp, agbp, - XFS_BTNUM_FINO, &rec, false); + error = xfs_finobt_insert_sprec(pag, tp, agbp, &rec); if (error) return error; } -- cgit v1.2.3 From 14dd46cf31f4aaffcf26b00de9af39d01ec8d547 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 22 Feb 2024 12:40:49 -0800 Subject: xfs: split xfs_inobt_init_cursor Split xfs_inobt_init_cursor into separate routines for the inobt and finobt to prepare for the removal of the xfs_btnum global enumeration of btree types. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. 
Wong --- fs/xfs/libxfs/xfs_ialloc.c | 23 ++++++++++-------- fs/xfs/libxfs/xfs_ialloc_btree.c | 51 ++++++++++++++++++++++++++++++---------- fs/xfs/libxfs/xfs_ialloc_btree.h | 6 +++-- fs/xfs/scrub/agheader_repair.c | 5 ++-- fs/xfs/scrub/common.c | 8 +++---- fs/xfs/scrub/ialloc_repair.c | 5 ++-- fs/xfs/scrub/iscan.c | 2 +- fs/xfs/scrub/repair.c | 6 ++--- fs/xfs/scrub/rmap.c | 7 +++--- fs/xfs/xfs_iwalk.c | 5 ++-- 10 files changed, 72 insertions(+), 46 deletions(-) diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 17a53f635c9f..03af7a729980 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -213,7 +213,10 @@ xfs_inobt_insert( int i; int error; - cur = xfs_inobt_init_cursor(pag, tp, agbp, btnum); + if (btnum == XFS_BTNUM_FINO) + cur = xfs_finobt_init_cursor(pag, tp, agbp); + else + cur = xfs_inobt_init_cursor(pag, tp, agbp); for (thisino = newino; thisino < newino + newlen; @@ -554,7 +557,7 @@ xfs_inobt_insert_sprec( int i; struct xfs_inobt_rec_incore rec; - cur = xfs_inobt_init_cursor(pag, tp, agbp, XFS_BTNUM_INO); + cur = xfs_inobt_init_cursor(pag, tp, agbp); /* the new record is pre-aligned so we know where to look */ error = xfs_inobt_lookup(cur, nrec->ir_startino, XFS_LOOKUP_EQ, &i); @@ -650,7 +653,7 @@ xfs_finobt_insert_sprec( int error; int i; - cur = xfs_inobt_init_cursor(pag, tp, agbp, XFS_BTNUM_FINO); + cur = xfs_finobt_init_cursor(pag, tp, agbp); /* the new record is pre-aligned so we know where to look */ error = xfs_inobt_lookup(cur, nrec->ir_startino, XFS_LOOKUP_EQ, &i); @@ -1083,7 +1086,7 @@ xfs_dialloc_ag_inobt( ASSERT(pag->pagi_freecount > 0); restart_pagno: - cur = xfs_inobt_init_cursor(pag, tp, agbp, XFS_BTNUM_INO); + cur = xfs_inobt_init_cursor(pag, tp, agbp); /* * If pagino is 0 (this is the root inode allocation) use newino. * This must work because we've just allocated some. @@ -1557,7 +1560,7 @@ xfs_dialloc_ag( if (!pagino) pagino = be32_to_cpu(agi->agi_newino); - cur = xfs_inobt_init_cursor(pag, tp, agbp, XFS_BTNUM_FINO); + cur = xfs_finobt_init_cursor(pag, tp, agbp); error = xfs_check_agi_freecount(cur); if (error) @@ -1600,7 +1603,7 @@ xfs_dialloc_ag( * the original freecount. If all is well, make the equivalent update to * the inobt using the finobt record and offset information. */ - icur = xfs_inobt_init_cursor(pag, tp, agbp, XFS_BTNUM_INO); + icur = xfs_inobt_init_cursor(pag, tp, agbp); error = xfs_check_agi_freecount(icur); if (error) @@ -2017,7 +2020,7 @@ xfs_difree_inobt( /* * Initialize the cursor. */ - cur = xfs_inobt_init_cursor(pag, tp, agbp, XFS_BTNUM_INO); + cur = xfs_inobt_init_cursor(pag, tp, agbp); error = xfs_check_agi_freecount(cur); if (error) @@ -2144,7 +2147,7 @@ xfs_difree_finobt( int error; int i; - cur = xfs_inobt_init_cursor(pag, tp, agbp, XFS_BTNUM_FINO); + cur = xfs_finobt_init_cursor(pag, tp, agbp); error = xfs_inobt_lookup(cur, ibtrec->ir_startino, XFS_LOOKUP_EQ, &i); if (error) @@ -2344,7 +2347,7 @@ xfs_imap_lookup( * we have a record, we need to ensure it contains the inode number * we are looking up. */ - cur = xfs_inobt_init_cursor(pag, tp, agbp, XFS_BTNUM_INO); + cur = xfs_inobt_init_cursor(pag, tp, agbp); error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i); if (!error) { if (i) @@ -3063,7 +3066,7 @@ xfs_ialloc_check_shrink( if (!xfs_has_sparseinodes(pag->pag_mount)) return 0; - cur = xfs_inobt_init_cursor(pag, tp, agibp, XFS_BTNUM_INO); + cur = xfs_inobt_init_cursor(pag, tp, agibp); /* Look up the inobt record that would correspond to the new EOFS. 
*/ agino = XFS_AGB_TO_AGINO(pag->pag_mount, new_length); diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index c920aee4a7da..9cb5da9be904 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -38,7 +38,15 @@ xfs_inobt_dup_cursor( struct xfs_btree_cur *cur) { return xfs_inobt_init_cursor(cur->bc_ag.pag, cur->bc_tp, - cur->bc_ag.agbp, cur->bc_btnum); + cur->bc_ag.agbp); +} + +STATIC struct xfs_btree_cur * +xfs_finobt_dup_cursor( + struct xfs_btree_cur *cur) +{ + return xfs_finobt_init_cursor(cur->bc_ag.pag, cur->bc_tp, + cur->bc_ag.agbp); } STATIC void @@ -441,7 +449,7 @@ const struct xfs_btree_ops xfs_finobt_ops = { .statoff = XFS_STATS_CALC_INDEX(xs_fibt_2), .sick_mask = XFS_SICK_AG_FINOBT, - .dup_cursor = xfs_inobt_dup_cursor, + .dup_cursor = xfs_finobt_dup_cursor, .set_root = xfs_finobt_set_root, .alloc_block = xfs_finobt_alloc_block, .free_block = xfs_finobt_free_block, @@ -468,28 +476,45 @@ struct xfs_btree_cur * xfs_inobt_init_cursor( struct xfs_perag *pag, struct xfs_trans *tp, - struct xfs_buf *agbp, - xfs_btnum_t btnum) /* ialloc or free ino btree */ + struct xfs_buf *agbp) { struct xfs_mount *mp = pag->pag_mount; - const struct xfs_btree_ops *ops = &xfs_inobt_ops; struct xfs_btree_cur *cur; - ASSERT(btnum == XFS_BTNUM_INO || btnum == XFS_BTNUM_FINO); + cur = xfs_btree_alloc_cursor(mp, tp, XFS_BTNUM_INO, &xfs_inobt_ops, + M_IGEO(mp)->inobt_maxlevels, xfs_inobt_cur_cache); + cur->bc_ag.pag = xfs_perag_hold(pag); + cur->bc_ag.agbp = agbp; + if (agbp) { + struct xfs_agi *agi = agbp->b_addr; - if (btnum == XFS_BTNUM_FINO) - ops = &xfs_finobt_ops; + cur->bc_nlevels = be32_to_cpu(agi->agi_level); + } + return cur; +} + +/* + * Create a free inode btree cursor. + * + * For staging cursors tp and agbp are NULL. + */ +struct xfs_btree_cur * +xfs_finobt_init_cursor( + struct xfs_perag *pag, + struct xfs_trans *tp, + struct xfs_buf *agbp) +{ + struct xfs_mount *mp = pag->pag_mount; + struct xfs_btree_cur *cur; - cur = xfs_btree_alloc_cursor(mp, tp, btnum, ops, + cur = xfs_btree_alloc_cursor(mp, tp, XFS_BTNUM_FINO, &xfs_finobt_ops, M_IGEO(mp)->inobt_maxlevels, xfs_inobt_cur_cache); cur->bc_ag.pag = xfs_perag_hold(pag); cur->bc_ag.agbp = agbp; if (agbp) { struct xfs_agi *agi = agbp->b_addr; - cur->bc_nlevels = (btnum == XFS_BTNUM_INO) ? 
- be32_to_cpu(agi->agi_level) : - be32_to_cpu(agi->agi_free_level); + cur->bc_nlevels = be32_to_cpu(agi->agi_free_level); } return cur; } @@ -724,7 +749,7 @@ xfs_finobt_count_blocks( if (error) return error; - cur = xfs_inobt_init_cursor(pag, tp, agbp, XFS_BTNUM_FINO); + cur = xfs_inobt_init_cursor(pag, tp, agbp); error = xfs_btree_count_blocks(cur, tree_blocks); xfs_btree_del_cursor(cur, error); xfs_trans_brelse(tp, agbp); diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h index 2f1552d65655..6472ec1ecbb4 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.h +++ b/fs/xfs/libxfs/xfs_ialloc_btree.h @@ -46,8 +46,10 @@ struct xfs_perag; (maxrecs) * sizeof(xfs_inobt_key_t) + \ ((index) - 1) * sizeof(xfs_inobt_ptr_t))) -extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_perag *pag, - struct xfs_trans *tp, struct xfs_buf *agbp, xfs_btnum_t btnum); +struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_perag *pag, + struct xfs_trans *tp, struct xfs_buf *agbp); +struct xfs_btree_cur *xfs_finobt_init_cursor(struct xfs_perag *pag, + struct xfs_trans *tp, struct xfs_buf *agbp); extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int); /* ir_holemask to inode allocation bitmap conversion */ diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c index e2374d05bdd1..427054b65b23 100644 --- a/fs/xfs/scrub/agheader_repair.c +++ b/fs/xfs/scrub/agheader_repair.c @@ -894,7 +894,7 @@ xrep_agi_calc_from_btrees( xfs_agino_t freecount; int error; - cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp, agi_bp, XFS_BTNUM_INO); + cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp, agi_bp); error = xfs_ialloc_count_inodes(cur, &count, &freecount); if (error) goto err; @@ -914,8 +914,7 @@ xrep_agi_calc_from_btrees( if (xfs_has_finobt(mp) && xfs_has_inobtcounts(mp)) { xfs_agblock_t blocks; - cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp, agi_bp, - XFS_BTNUM_FINO); + cur = xfs_finobt_init_cursor(sc->sa.pag, sc->tp, agi_bp); error = xfs_btree_count_blocks(cur, &blocks); if (error) goto err; diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index 1233a5604c72..70746a7db954 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -620,15 +620,15 @@ xchk_ag_btcur_init( if (sa->agi_bp) { /* Set up a inobt cursor for cross-referencing. */ - sa->ino_cur = xfs_inobt_init_cursor(sa->pag, sc->tp, sa->agi_bp, - XFS_BTNUM_INO); + sa->ino_cur = xfs_inobt_init_cursor(sa->pag, sc->tp, + sa->agi_bp); xchk_ag_btree_del_cursor_if_sick(sc, &sa->ino_cur, XFS_SCRUB_TYPE_INOBT); /* Set up a finobt cursor for cross-referencing. 
*/ if (xfs_has_finobt(mp)) { - sa->fino_cur = xfs_inobt_init_cursor(sa->pag, sc->tp, - sa->agi_bp, XFS_BTNUM_FINO); + sa->fino_cur = xfs_finobt_init_cursor(sa->pag, sc->tp, + sa->agi_bp); xchk_ag_btree_del_cursor_if_sick(sc, &sa->fino_cur, XFS_SCRUB_TYPE_FINOBT); } diff --git a/fs/xfs/scrub/ialloc_repair.c b/fs/xfs/scrub/ialloc_repair.c index 04e186d8c738..a00ec7ae1792 100644 --- a/fs/xfs/scrub/ialloc_repair.c +++ b/fs/xfs/scrub/ialloc_repair.c @@ -663,7 +663,7 @@ xrep_ibt_build_new_trees( ri->new_inobt.bload.claim_block = xrep_ibt_claim_block; ri->new_inobt.bload.get_records = xrep_ibt_get_records; - ino_cur = xfs_inobt_init_cursor(sc->sa.pag, NULL, NULL, XFS_BTNUM_INO); + ino_cur = xfs_inobt_init_cursor(sc->sa.pag, NULL, NULL); xfs_btree_stage_afakeroot(ino_cur, &ri->new_inobt.afake); error = xfs_btree_bload_compute_geometry(ino_cur, &ri->new_inobt.bload, xfarray_length(ri->inode_records)); @@ -684,8 +684,7 @@ xrep_ibt_build_new_trees( ri->new_finobt.bload.claim_block = xrep_fibt_claim_block; ri->new_finobt.bload.get_records = xrep_fibt_get_records; - fino_cur = xfs_inobt_init_cursor(sc->sa.pag, NULL, NULL, - XFS_BTNUM_FINO); + fino_cur = xfs_finobt_init_cursor(sc->sa.pag, NULL, NULL); xfs_btree_stage_afakeroot(fino_cur, &ri->new_finobt.afake); error = xfs_btree_bload_compute_geometry(fino_cur, &ri->new_finobt.bload, ri->finobt_recs); diff --git a/fs/xfs/scrub/iscan.c b/fs/xfs/scrub/iscan.c index 17af89b519b3..ec3478bc505e 100644 --- a/fs/xfs/scrub/iscan.c +++ b/fs/xfs/scrub/iscan.c @@ -113,7 +113,7 @@ xchk_iscan_find_next( * Look up the inode chunk for the current cursor position. If there * is no chunk here, we want the next one. */ - cur = xfs_inobt_init_cursor(pag, tp, agi_bp, XFS_BTNUM_INO); + cur = xfs_inobt_init_cursor(pag, tp, agi_bp); error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &has_rec); if (!error && !has_rec) error = xfs_btree_increment(cur, 0, &has_rec); diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index 078d21598db5..d1a21f380abe 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -842,10 +842,10 @@ xrep_ag_btcur_init( if (sc->sm->sm_type != XFS_SCRUB_TYPE_INOBT && sc->sm->sm_type != XFS_SCRUB_TYPE_FINOBT) { sa->ino_cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp, - sa->agi_bp, XFS_BTNUM_INO); + sa->agi_bp); if (xfs_has_finobt(mp)) - sa->fino_cur = xfs_inobt_init_cursor(sc->sa.pag, - sc->tp, sa->agi_bp, XFS_BTNUM_FINO); + sa->fino_cur = xfs_finobt_init_cursor(sc->sa.pag, + sc->tp, sa->agi_bp); } /* Set up a rmapbt cursor for cross-referencing. 
*/ diff --git a/fs/xfs/scrub/rmap.c b/fs/xfs/scrub/rmap.c index e0550e018584..5afe6650ed6c 100644 --- a/fs/xfs/scrub/rmap.c +++ b/fs/xfs/scrub/rmap.c @@ -447,8 +447,7 @@ xchk_rmapbt_walk_ag_metadata( /* OWN_INOBT: inobt, finobt */ cur = sc->sa.ino_cur; if (!cur) - cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp, sc->sa.agi_bp, - XFS_BTNUM_INO); + cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp, sc->sa.agi_bp); error = xagb_bitmap_set_btblocks(&cr->inobt_owned, cur); if (cur != sc->sa.ino_cur) xfs_btree_del_cursor(cur, error); @@ -458,8 +457,8 @@ xchk_rmapbt_walk_ag_metadata( if (xfs_has_finobt(sc->mp)) { cur = sc->sa.fino_cur; if (!cur) - cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp, - sc->sa.agi_bp, XFS_BTNUM_FINO); + cur = xfs_finobt_init_cursor(sc->sa.pag, sc->tp, + sc->sa.agi_bp); error = xagb_bitmap_set_btblocks(&cr->inobt_owned, cur); if (cur != sc->sa.fino_cur) xfs_btree_del_cursor(cur, error); diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c index 38092c049c08..b6a7751e7c36 100644 --- a/fs/xfs/xfs_iwalk.c +++ b/fs/xfs/xfs_iwalk.c @@ -269,7 +269,7 @@ xfs_iwalk_ag_start( error = xfs_ialloc_read_agi(pag, tp, agi_bpp); if (error) return error; - *curpp = xfs_inobt_init_cursor(pag, tp, *agi_bpp, XFS_BTNUM_INO); + *curpp = xfs_inobt_init_cursor(pag, tp, *agi_bpp); /* Starting at the beginning of the AG? That's easy! */ if (agino == 0) @@ -387,8 +387,7 @@ xfs_iwalk_run_callbacks( error = xfs_ialloc_read_agi(iwag->pag, iwag->tp, agi_bpp); if (error) return error; - *curpp = xfs_inobt_init_cursor(iwag->pag, iwag->tp, *agi_bpp, - XFS_BTNUM_INO); + *curpp = xfs_inobt_init_cursor(iwag->pag, iwag->tp, *agi_bpp); return xfs_inobt_lookup(*curpp, next_agino, XFS_LOOKUP_GE, has_more); } -- cgit v1.2.3 From fbeef4e061ab28bf556af4ee2a5a9848dc4616c5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 22 Feb 2024 12:40:50 -0800 Subject: xfs: pass a 'bool is_finobt' to xfs_inobt_insert This is one of the last users of xfs_btnum_t and can only designate either the inobt or finobt. Replace it with a simple bool. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_ialloc.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 03af7a729980..e6decc37ff18 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -206,14 +206,14 @@ xfs_inobt_insert( struct xfs_buf *agbp, xfs_agino_t newino, xfs_agino_t newlen, - xfs_btnum_t btnum) + bool is_finobt) { struct xfs_btree_cur *cur; xfs_agino_t thisino; int i; int error; - if (btnum == XFS_BTNUM_FINO) + if (is_finobt) cur = xfs_finobt_init_cursor(pag, tp, agbp); else cur = xfs_inobt_init_cursor(pag, tp, agbp); @@ -941,14 +941,13 @@ sparse_alloc: } } else { /* full chunk - insert new records to both btrees */ - error = xfs_inobt_insert(pag, tp, agbp, newino, newlen, - XFS_BTNUM_INO); + error = xfs_inobt_insert(pag, tp, agbp, newino, newlen, false); if (error) return error; if (xfs_has_finobt(args.mp)) { error = xfs_inobt_insert(pag, tp, agbp, newino, - newlen, XFS_BTNUM_FINO); + newlen, true); if (error) return error; } -- cgit v1.2.3 From ec793e690f801d97a7ae2a0d429fea1fee4d44aa Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 22 Feb 2024 12:40:51 -0800 Subject: xfs: remove xfs_btnum_t The last checks for bc_btnum can be replaced with helpers that check the btree ops. This allows adding new btrees to XFS without having to update a global enum. 
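As a sketch of the resulting pattern (the full set of helpers added to
xfs_shared.h appears in the diff below), a type check against the enum
turns into a comparison of the cursor's ops vtable:

	static inline bool xfs_btree_is_fino(const struct xfs_btree_ops *ops)
	{
		return ops == &xfs_finobt_ops;
	}

	/* before: identify the btree type via the global enum */
	if (cur->bc_btnum == XFS_BTNUM_FINO)
		be32_add_cpu(&agi->agi_fblocks, howmuch);

	/* after: compare the ops pointer against the btree's ops structure */
	if (xfs_btree_is_fino(cur->bc_ops))
		be32_add_cpu(&agi->agi_fblocks, howmuch);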
Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong [djwong: complete the ops predicates] Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_alloc.c | 6 +++--- fs/xfs/libxfs/xfs_alloc_btree.c | 12 ++++++------ fs/xfs/libxfs/xfs_bmap_btree.c | 4 ++-- fs/xfs/libxfs/xfs_btree.c | 4 ++-- fs/xfs/libxfs/xfs_btree.h | 11 ----------- fs/xfs/libxfs/xfs_ialloc.c | 2 +- fs/xfs/libxfs/xfs_ialloc_btree.c | 10 +++++----- fs/xfs/libxfs/xfs_refcount_btree.c | 5 ++--- fs/xfs/libxfs/xfs_rmap_btree.c | 2 +- fs/xfs/libxfs/xfs_shared.h | 35 +++++++++++++++++++++++++++++++++++ fs/xfs/libxfs/xfs_types.h | 9 --------- fs/xfs/scrub/btree.c | 10 ++++------ fs/xfs/scrub/ialloc.c | 6 +++--- fs/xfs/scrub/trace.h | 8 -------- fs/xfs/xfs_health.c | 2 +- fs/xfs/xfs_trace.h | 9 --------- 16 files changed, 65 insertions(+), 70 deletions(-) diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 2b74aded4a2c..9da52e92172a 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -918,7 +918,7 @@ xfs_alloc_cur_check( bool busy; unsigned busy_gen = 0; bool deactivate = false; - bool isbnobt = cur->bc_btnum == XFS_BTNUM_BNO; + bool isbnobt = xfs_btree_is_bno(cur->bc_ops); *new = 0; @@ -4026,7 +4026,7 @@ xfs_alloc_query_range( union xfs_btree_irec high_brec = { .a = *high_rec }; struct xfs_alloc_query_range_info query = { .priv = priv, .fn = fn }; - ASSERT(cur->bc_btnum == XFS_BTNUM_BNO); + ASSERT(xfs_btree_is_bno(cur->bc_ops)); return xfs_btree_query_range(cur, &low_brec, &high_brec, xfs_alloc_query_range_helper, &query); } @@ -4040,7 +4040,7 @@ xfs_alloc_query_all( { struct xfs_alloc_query_range_info query; - ASSERT(cur->bc_btnum == XFS_BTNUM_BNO); + ASSERT(xfs_btree_is_bno(cur->bc_ops)); query.priv = priv; query.fn = fn; return xfs_btree_query_all(cur, xfs_alloc_query_range_helper, &query); diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index 722863464289..885c7db5d6e7 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -51,7 +51,7 @@ xfs_allocbt_set_root( ASSERT(ptr->s != 0); - if (cur->bc_btnum == XFS_BTNUM_BNO) { + if (xfs_btree_is_bno(cur->bc_ops)) { agf->agf_bno_root = ptr->s; be32_add_cpu(&agf->agf_bno_level, inc); cur->bc_ag.pag->pagf_bno_level += inc; @@ -131,7 +131,7 @@ xfs_allocbt_update_lastrec( __be32 len; int numrecs; - ASSERT(cur->bc_btnum == XFS_BTNUM_CNT); + ASSERT(!xfs_btree_is_bno(cur->bc_ops)); switch (reason) { case LASTREC_UPDATE: @@ -241,7 +241,7 @@ xfs_allocbt_init_ptr_from_cur( ASSERT(cur->bc_ag.pag->pag_agno == be32_to_cpu(agf->agf_seqno)); - if (cur->bc_btnum == XFS_BTNUM_BNO) + if (xfs_btree_is_bno(cur->bc_ops)) ptr->s = agf->agf_bno_root; else ptr->s = agf->agf_cnt_root; @@ -554,7 +554,7 @@ xfs_bnobt_init_cursor( { struct xfs_btree_cur *cur; - cur = xfs_btree_alloc_cursor(mp, tp, XFS_BTNUM_BNO, &xfs_bnobt_ops, + cur = xfs_btree_alloc_cursor(mp, tp, &xfs_bnobt_ops, mp->m_alloc_maxlevels, xfs_allocbt_cur_cache); cur->bc_ag.pag = xfs_perag_hold(pag); cur->bc_ag.agbp = agbp; @@ -580,7 +580,7 @@ xfs_cntbt_init_cursor( { struct xfs_btree_cur *cur; - cur = xfs_btree_alloc_cursor(mp, tp, XFS_BTNUM_CNT, &xfs_cntbt_ops, + cur = xfs_btree_alloc_cursor(mp, tp, &xfs_cntbt_ops, mp->m_alloc_maxlevels, xfs_allocbt_cur_cache); cur->bc_ag.pag = xfs_perag_hold(pag); cur->bc_ag.agbp = agbp; @@ -607,7 +607,7 @@ xfs_allocbt_commit_staged_btree( ASSERT(cur->bc_flags & XFS_BTREE_STAGING); - if (cur->bc_btnum == XFS_BTNUM_BNO) { + if (xfs_btree_is_bno(cur->bc_ops)) { agf->agf_bno_root = 
cpu_to_be32(afake->af_root); agf->agf_bno_level = cpu_to_be32(afake->af_levels); } else { diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index 25193551e95b..54fdf0df8ec3 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -574,8 +574,8 @@ xfs_bmbt_init_cursor( maxlevels = mp->m_bm_maxlevels[whichfork]; break; } - cur = xfs_btree_alloc_cursor(mp, tp, XFS_BTNUM_BMAP, &xfs_bmbt_ops, - maxlevels, xfs_bmbt_cur_cache); + cur = xfs_btree_alloc_cursor(mp, tp, &xfs_bmbt_ops, maxlevels, + xfs_bmbt_cur_cache); cur->bc_ino.ip = ip; cur->bc_ino.whichfork = whichfork; cur->bc_bmap.allocated = 0; diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 278461d0f64d..769be61ad63f 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -454,7 +454,7 @@ xfs_btree_del_cursor( * zero, then we should be shut down or on our way to shutdown due to * cancelling a dirty transaction on error. */ - ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP || cur->bc_bmap.allocated == 0 || + ASSERT(!xfs_btree_is_bmap(cur->bc_ops) || cur->bc_bmap.allocated == 0 || xfs_is_shutdown(cur->bc_mp) || error != 0); switch (cur->bc_ops->type) { @@ -3016,7 +3016,7 @@ xfs_btree_split( struct xfs_btree_split_args args; DECLARE_COMPLETION_ONSTACK(done); - if (cur->bc_btnum != XFS_BTNUM_BMAP || + if (!xfs_btree_is_bmap(cur->bc_ops) || cur->bc_tp->t_highest_agno == NULLAGNUMBER) return __xfs_btree_split(cur, level, ptrp, key, curp, stat); diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 6e5fd0c06453..9a264ffee303 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -55,14 +55,6 @@ union xfs_btree_rec { #define XFS_LOOKUP_LE ((xfs_lookup_t)XFS_LOOKUP_LEi) #define XFS_LOOKUP_GE ((xfs_lookup_t)XFS_LOOKUP_GEi) -#define XFS_BTNUM_BNO ((xfs_btnum_t)XFS_BTNUM_BNOi) -#define XFS_BTNUM_CNT ((xfs_btnum_t)XFS_BTNUM_CNTi) -#define XFS_BTNUM_BMAP ((xfs_btnum_t)XFS_BTNUM_BMAPi) -#define XFS_BTNUM_INO ((xfs_btnum_t)XFS_BTNUM_INOi) -#define XFS_BTNUM_FINO ((xfs_btnum_t)XFS_BTNUM_FINOi) -#define XFS_BTNUM_RMAP ((xfs_btnum_t)XFS_BTNUM_RMAPi) -#define XFS_BTNUM_REFC ((xfs_btnum_t)XFS_BTNUM_REFCi) - struct xfs_btree_ops; uint32_t xfs_btree_magic(struct xfs_mount *mp, const struct xfs_btree_ops *ops); @@ -272,7 +264,6 @@ struct xfs_btree_cur const struct xfs_btree_ops *bc_ops; struct kmem_cache *bc_cache; /* cursor cache */ unsigned int bc_flags; /* btree features - below */ - xfs_btnum_t bc_btnum; /* identifies which btree type */ union xfs_btree_irec bc_rec; /* current insert/search record value */ uint8_t bc_nlevels; /* number of levels in the tree */ uint8_t bc_maxlevels; /* maximum levels for this btree type */ @@ -726,7 +717,6 @@ static inline struct xfs_btree_cur * xfs_btree_alloc_cursor( struct xfs_mount *mp, struct xfs_trans *tp, - xfs_btnum_t btnum, const struct xfs_btree_ops *ops, uint8_t maxlevels, struct kmem_cache *cache) @@ -742,7 +732,6 @@ xfs_btree_alloc_cursor( cur->bc_ops = ops; cur->bc_tp = tp; cur->bc_mp = mp; - cur->bc_btnum = btnum; cur->bc_maxlevels = maxlevels; cur->bc_cache = cache; diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index e6decc37ff18..e5ac3e5430c4 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -2848,7 +2848,7 @@ xfs_ialloc_count_inodes( struct xfs_ialloc_count_inodes ci = {0}; int error; - ASSERT(cur->bc_btnum == XFS_BTNUM_INO); + ASSERT(xfs_btree_is_ino(cur->bc_ops)); error = xfs_btree_query_all(cur, xfs_ialloc_count_inodes_rec, &ci); if 
(error) return error; diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index 9cb5da9be904..74f144b2db68 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -90,9 +90,9 @@ xfs_inobt_mod_blockcount( if (!xfs_has_inobtcounts(cur->bc_mp)) return; - if (cur->bc_btnum == XFS_BTNUM_FINO) + if (xfs_btree_is_fino(cur->bc_ops)) be32_add_cpu(&agi->agi_fblocks, howmuch); - else if (cur->bc_btnum == XFS_BTNUM_INO) + else be32_add_cpu(&agi->agi_iblocks, howmuch); xfs_ialloc_log_agi(cur->bc_tp, agbp, XFS_AGI_IBLOCKS); } @@ -481,7 +481,7 @@ xfs_inobt_init_cursor( struct xfs_mount *mp = pag->pag_mount; struct xfs_btree_cur *cur; - cur = xfs_btree_alloc_cursor(mp, tp, XFS_BTNUM_INO, &xfs_inobt_ops, + cur = xfs_btree_alloc_cursor(mp, tp, &xfs_inobt_ops, M_IGEO(mp)->inobt_maxlevels, xfs_inobt_cur_cache); cur->bc_ag.pag = xfs_perag_hold(pag); cur->bc_ag.agbp = agbp; @@ -507,7 +507,7 @@ xfs_finobt_init_cursor( struct xfs_mount *mp = pag->pag_mount; struct xfs_btree_cur *cur; - cur = xfs_btree_alloc_cursor(mp, tp, XFS_BTNUM_FINO, &xfs_finobt_ops, + cur = xfs_btree_alloc_cursor(mp, tp, &xfs_finobt_ops, M_IGEO(mp)->inobt_maxlevels, xfs_inobt_cur_cache); cur->bc_ag.pag = xfs_perag_hold(pag); cur->bc_ag.agbp = agbp; @@ -535,7 +535,7 @@ xfs_inobt_commit_staged_btree( ASSERT(cur->bc_flags & XFS_BTREE_STAGING); - if (cur->bc_btnum == XFS_BTNUM_INO) { + if (xfs_btree_is_ino(cur->bc_ops)) { fields = XFS_AGI_ROOT | XFS_AGI_LEVEL; agi->agi_root = cpu_to_be32(afake->af_root); agi->agi_level = cpu_to_be32(afake->af_levels); diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c index 6388a0c9b691..f93dae3db701 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.c +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -364,9 +364,8 @@ xfs_refcountbt_init_cursor( ASSERT(pag->pag_agno < mp->m_sb.sb_agcount); - cur = xfs_btree_alloc_cursor(mp, tp, XFS_BTNUM_REFC, - &xfs_refcountbt_ops, mp->m_refc_maxlevels, - xfs_refcountbt_cur_cache); + cur = xfs_btree_alloc_cursor(mp, tp, &xfs_refcountbt_ops, + mp->m_refc_maxlevels, xfs_refcountbt_cur_cache); cur->bc_ag.pag = xfs_perag_hold(pag); cur->bc_refc.nr_ops = 0; cur->bc_refc.shape_changes = 0; diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index abaf5e190e99..b1ecc061fdc9 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -518,7 +518,7 @@ xfs_rmapbt_init_cursor( { struct xfs_btree_cur *cur; - cur = xfs_btree_alloc_cursor(mp, tp, XFS_BTNUM_RMAP, &xfs_rmapbt_ops, + cur = xfs_btree_alloc_cursor(mp, tp, &xfs_rmapbt_ops, mp->m_rmap_maxlevels, xfs_rmapbt_cur_cache); cur->bc_ag.pag = xfs_perag_hold(pag); cur->bc_ag.agbp = agbp; diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h index 518ea9456eba..6b8bc276d461 100644 --- a/fs/xfs/libxfs/xfs_shared.h +++ b/fs/xfs/libxfs/xfs_shared.h @@ -52,6 +52,41 @@ extern const struct xfs_btree_ops xfs_bmbt_ops; extern const struct xfs_btree_ops xfs_refcountbt_ops; extern const struct xfs_btree_ops xfs_rmapbt_ops; +static inline bool xfs_btree_is_bno(const struct xfs_btree_ops *ops) +{ + return ops == &xfs_bnobt_ops; +} + +static inline bool xfs_btree_is_cnt(const struct xfs_btree_ops *ops) +{ + return ops == &xfs_cntbt_ops; +} + +static inline bool xfs_btree_is_bmap(const struct xfs_btree_ops *ops) +{ + return ops == &xfs_bmbt_ops; +} + +static inline bool xfs_btree_is_ino(const struct xfs_btree_ops *ops) +{ + return ops == &xfs_inobt_ops; +} + +static inline bool xfs_btree_is_fino(const struct 
xfs_btree_ops *ops) +{ + return ops == &xfs_finobt_ops; +} + +static inline bool xfs_btree_is_refcount(const struct xfs_btree_ops *ops) +{ + return ops == &xfs_refcountbt_ops; +} + +static inline bool xfs_btree_is_rmap(const struct xfs_btree_ops *ops) +{ + return ops == &xfs_rmapbt_ops; +} + /* log size calculation functions */ int xfs_log_calc_unit_res(struct xfs_mount *mp, int unit_bytes); int xfs_log_calc_minimum_size(struct xfs_mount *); diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h index f577247b748d..76eb9e328835 100644 --- a/fs/xfs/libxfs/xfs_types.h +++ b/fs/xfs/libxfs/xfs_types.h @@ -116,15 +116,6 @@ typedef enum { { XFS_LOOKUP_LEi, "le" }, \ { XFS_LOOKUP_GEi, "ge" } -/* - * This enum is used in string mapping in xfs_trace.h and scrub/trace.h; - * please keep the TRACE_DEFINE_ENUMs for it up to date. - */ -typedef enum { - XFS_BTNUM_BNOi, XFS_BTNUM_CNTi, XFS_BTNUM_RMAPi, XFS_BTNUM_BMAPi, - XFS_BTNUM_INOi, XFS_BTNUM_FINOi, XFS_BTNUM_REFCi, XFS_BTNUM_MAX -} xfs_btnum_t; - struct xfs_name { const unsigned char *name; int len; diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c index 1ec3339755b9..187d692a0b58 100644 --- a/fs/xfs/scrub/btree.c +++ b/fs/xfs/scrub/btree.c @@ -374,14 +374,12 @@ xchk_btree_check_block_owner( { xfs_agnumber_t agno; xfs_agblock_t agbno; - xfs_btnum_t btnum; bool init_sa; int error = 0; if (!bs->cur) return 0; - btnum = bs->cur->bc_btnum; agno = xfs_daddr_to_agno(bs->cur->bc_mp, daddr); agbno = xfs_daddr_to_agbno(bs->cur->bc_mp, daddr); @@ -404,11 +402,11 @@ xchk_btree_check_block_owner( * have to nullify it (to shut down further block owner checks) if * self-xref encounters problems. */ - if (!bs->sc->sa.bno_cur && btnum == XFS_BTNUM_BNO) + if (!bs->sc->sa.bno_cur && xfs_btree_is_bno(bs->cur->bc_ops)) bs->cur = NULL; xchk_xref_is_only_owned_by(bs->sc, agbno, 1, bs->oinfo); - if (!bs->sc->sa.rmap_cur && btnum == XFS_BTNUM_RMAP) + if (!bs->sc->sa.rmap_cur && xfs_btree_is_rmap(bs->cur->bc_ops)) bs->cur = NULL; out_free: @@ -447,7 +445,7 @@ xchk_btree_check_owner( * duplicate cursors. Therefore, save the buffer daddr for * later scanning. */ - if (cur->bc_btnum == XFS_BTNUM_BNO || cur->bc_btnum == XFS_BTNUM_RMAP) { + if (xfs_btree_is_bno(cur->bc_ops) || xfs_btree_is_rmap(cur->bc_ops)) { struct check_owner *co; co = kmalloc(sizeof(struct check_owner), XCHK_GFP_FLAGS); @@ -480,7 +478,7 @@ xchk_btree_check_iroot_minrecs( * existing filesystems, so instead we disable the check for data fork * bmap btrees when there's an attr fork. */ - if (bs->cur->bc_btnum == XFS_BTNUM_BMAP && + if (xfs_btree_is_bmap(bs->cur->bc_ops) && bs->cur->bc_ino.whichfork == XFS_DATA_FORK && xfs_inode_has_attr_fork(bs->sc->ip)) return false; diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c index 26d589e9ba1c..750d7b0cd25a 100644 --- a/fs/xfs/scrub/ialloc.c +++ b/fs/xfs/scrub/ialloc.c @@ -76,7 +76,7 @@ xchk_inobt_xref_finobt( int has_record; int error; - ASSERT(cur->bc_btnum == XFS_BTNUM_FINO); + ASSERT(xfs_btree_is_fino(cur->bc_ops)); error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &has_record); if (error) @@ -179,7 +179,7 @@ xchk_finobt_xref_inobt( int has_record; int error; - ASSERT(cur->bc_btnum == XFS_BTNUM_INO); + ASSERT(xfs_btree_is_ino(cur->bc_ops)); error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &has_record); if (error) @@ -514,7 +514,7 @@ xchk_iallocbt_rec_alignment( * Otherwise, we expect that the finobt record is aligned to the * cluster alignment as told by the superblock. 
*/ - if (bs->cur->bc_btnum == XFS_BTNUM_FINO) { + if (xfs_btree_is_fino(bs->cur->bc_ops)) { unsigned int imask; imask = min_t(unsigned int, XFS_INODES_PER_CHUNK, diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index 2c2f99d8772c..b840f25c03d6 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -32,14 +32,6 @@ struct xchk_fscounters; * ring buffer. Somehow this was only worth mentioning in the ftrace sample * code. */ -TRACE_DEFINE_ENUM(XFS_BTNUM_BNOi); -TRACE_DEFINE_ENUM(XFS_BTNUM_CNTi); -TRACE_DEFINE_ENUM(XFS_BTNUM_BMAPi); -TRACE_DEFINE_ENUM(XFS_BTNUM_INOi); -TRACE_DEFINE_ENUM(XFS_BTNUM_FINOi); -TRACE_DEFINE_ENUM(XFS_BTNUM_RMAPi); -TRACE_DEFINE_ENUM(XFS_BTNUM_REFCi); - TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_SHARED); TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_COW); diff --git a/fs/xfs/xfs_health.c b/fs/xfs/xfs_health.c index 9921b5d3f158..b28546b6fe34 100644 --- a/fs/xfs/xfs_health.c +++ b/fs/xfs/xfs_health.c @@ -532,7 +532,7 @@ xfs_btree_mark_sick( xfs_ag_mark_sick(cur->bc_ag.pag, cur->bc_ops->sick_mask); return; case XFS_BTREE_TYPE_INODE: - if (cur->bc_btnum == XFS_BTNUM_BMAP) { + if (xfs_btree_is_bmap(cur->bc_ops)) { xfs_bmap_mark_sick(cur->bc_ino.ip, cur->bc_ino.whichfork); return; diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 498b8922062a..e876a47f1427 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -2450,15 +2450,6 @@ DEFINE_DISCARD_EVENT(xfs_discard_toosmall); DEFINE_DISCARD_EVENT(xfs_discard_exclude); DEFINE_DISCARD_EVENT(xfs_discard_busy); -/* btree cursor events */ -TRACE_DEFINE_ENUM(XFS_BTNUM_BNOi); -TRACE_DEFINE_ENUM(XFS_BTNUM_CNTi); -TRACE_DEFINE_ENUM(XFS_BTNUM_BMAPi); -TRACE_DEFINE_ENUM(XFS_BTNUM_INOi); -TRACE_DEFINE_ENUM(XFS_BTNUM_FINOi); -TRACE_DEFINE_ENUM(XFS_BTNUM_RMAPi); -TRACE_DEFINE_ENUM(XFS_BTNUM_REFCi); - DECLARE_EVENT_CLASS(xfs_btree_cur_class, TP_PROTO(struct xfs_btree_cur *cur, int level, struct xfs_buf *bp), TP_ARGS(cur, level, bp), -- cgit v1.2.3 From 4bc94bf640e08cf970354036683ec143a7ae974e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 22 Feb 2024 12:40:52 -0800 Subject: xfs: simplify xfs_btree_check_sblock_siblings Stop using xfs_btree_check_sptr in xfs_btree_check_sblock_siblings, as it only duplicates the xfs_verify_agbno call in the other leg of if / else besides adding a tautological level check. With this the cur and level arguments can be removed as they are now unused. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. 
Wong --- fs/xfs/libxfs/xfs_btree.c | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 769be61ad63f..cea5500a0ece 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -86,8 +86,6 @@ xfs_btree_check_lblock_siblings( static inline xfs_failaddr_t xfs_btree_check_sblock_siblings( struct xfs_perag *pag, - struct xfs_btree_cur *cur, - int level, xfs_agblock_t agbno, __be32 dsibling) { @@ -99,13 +97,8 @@ xfs_btree_check_sblock_siblings( sibling = be32_to_cpu(dsibling); if (sibling == agbno) return __this_address; - if (level >= 0) { - if (!xfs_btree_check_sptr(cur, sibling, level + 1)) - return __this_address; - } else { - if (!xfs_verify_agbno(pag, sibling)) - return __this_address; - } + if (!xfs_verify_agbno(pag, sibling)) + return __this_address; return NULL; } @@ -212,10 +205,10 @@ __xfs_btree_check_sblock( if (bp) agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp)); - fa = xfs_btree_check_sblock_siblings(pag, cur, level, agbno, + fa = xfs_btree_check_sblock_siblings(pag, agbno, block->bb_u.s.bb_leftsib); if (!fa) - fa = xfs_btree_check_sblock_siblings(pag, cur, level, agbno, + fa = xfs_btree_check_sblock_siblings(pag, agbno, block->bb_u.s.bb_rightsib); return fa; } @@ -4713,10 +4706,10 @@ xfs_btree_sblock_verify( /* sibling pointer verification */ agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp)); - fa = xfs_btree_check_sblock_siblings(bp->b_pag, NULL, -1, agbno, + fa = xfs_btree_check_sblock_siblings(bp->b_pag, agbno, block->bb_u.s.bb_leftsib); if (!fa) - fa = xfs_btree_check_sblock_siblings(bp->b_pag, NULL, -1, agbno, + fa = xfs_btree_check_sblock_siblings(bp->b_pag, agbno, block->bb_u.s.bb_rightsib); return fa; } -- cgit v1.2.3 From 8b8ada973cacff338a0e817a97dd0afa301798c0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 22 Feb 2024 12:40:53 -0800 Subject: xfs: simplify xfs_btree_check_lblock_siblings Stop using xfs_btree_check_lptr in xfs_btree_check_lblock_siblings, as it only duplicates the xfs_verify_fsbno call in the other leg of if / else besides adding a tautological level check. With this the cur and level arguments can be removed as they are now unused. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. 
Wong --- fs/xfs/libxfs/xfs_btree.c | 22 ++++++---------------- 1 file changed, 6 insertions(+), 16 deletions(-) diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index cea5500a0ece..fc877188919e 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -59,8 +59,6 @@ xfs_btree_magic( static inline xfs_failaddr_t xfs_btree_check_lblock_siblings( struct xfs_mount *mp, - struct xfs_btree_cur *cur, - int level, xfs_fsblock_t fsb, __be64 dsibling) { @@ -72,14 +70,8 @@ xfs_btree_check_lblock_siblings( sibling = be64_to_cpu(dsibling); if (sibling == fsb) return __this_address; - if (level >= 0) { - if (!xfs_btree_check_lptr(cur, sibling, level + 1)) - return __this_address; - } else { - if (!xfs_verify_fsbno(mp, sibling)) - return __this_address; - } - + if (!xfs_verify_fsbno(mp, sibling)) + return __this_address; return NULL; } @@ -139,10 +131,9 @@ __xfs_btree_check_lblock( if (bp) fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp)); - fa = xfs_btree_check_lblock_siblings(mp, cur, level, fsb, - block->bb_u.l.bb_leftsib); + fa = xfs_btree_check_lblock_siblings(mp, fsb, block->bb_u.l.bb_leftsib); if (!fa) - fa = xfs_btree_check_lblock_siblings(mp, cur, level, fsb, + fa = xfs_btree_check_lblock_siblings(mp, fsb, block->bb_u.l.bb_rightsib); return fa; } @@ -4651,10 +4642,9 @@ xfs_btree_lblock_verify( /* sibling pointer verification */ fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp)); - fa = xfs_btree_check_lblock_siblings(mp, NULL, -1, fsb, - block->bb_u.l.bb_leftsib); + fa = xfs_btree_check_lblock_siblings(mp, fsb, block->bb_u.l.bb_leftsib); if (!fa) - fa = xfs_btree_check_lblock_siblings(mp, NULL, -1, fsb, + fa = xfs_btree_check_lblock_siblings(mp, fsb, block->bb_u.l.bb_rightsib); return fa; } -- cgit v1.2.3 From fb0793f206701a68f8588a09bf32f7cf44878ea3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 22 Feb 2024 12:40:53 -0800 Subject: xfs: open code xfs_btree_check_lptr in xfs_bmap_btree_to_extents xfs_bmap_btree_to_extents always passes a level of 1 to xfs_btree_check_lptr, thus making the level check redundant. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_bmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 6a374a6e8586..5a39fdb5178b 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -568,7 +568,7 @@ xfs_bmap_btree_to_extents( pp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, ifp->if_broot_bytes); cbno = be64_to_cpu(*pp); #ifdef DEBUG - if (XFS_IS_CORRUPT(cur->bc_mp, !xfs_btree_check_lptr(cur, cbno, 1))) { + if (XFS_IS_CORRUPT(cur->bc_mp, !xfs_verify_fsbno(mp, cbno))) { xfs_btree_mark_sick(cur); return -EFSCORRUPTED; } -- cgit v1.2.3 From 57982d6c835a71da5c66e6090680de1adf6e736a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 22 Feb 2024 12:40:54 -0800 Subject: xfs: consolidate btree ptr checking Merge xfs_btree_check_sptr and xfs_btree_check_lptr into a single __xfs_btree_check_ptr that can be shared between xfs_btree_check_ptr and the scrub code. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. 
Wong --- fs/xfs/libxfs/xfs_btree.c | 60 +++++++++++++++++++++++------------------------ fs/xfs/libxfs/xfs_btree.h | 21 +++-------------- fs/xfs/scrub/btree.c | 12 ++++------ 3 files changed, 36 insertions(+), 57 deletions(-) diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index fc877188919e..1a0816aa5009 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -242,28 +242,27 @@ xfs_btree_check_block( return xfs_btree_check_sblock(cur, block, level, bp); } -/* Check that this long pointer is valid and points within the fs. */ -bool -xfs_btree_check_lptr( - struct xfs_btree_cur *cur, - xfs_fsblock_t fsbno, - int level) +int +__xfs_btree_check_ptr( + struct xfs_btree_cur *cur, + const union xfs_btree_ptr *ptr, + int index, + int level) { if (level <= 0) - return false; - return xfs_verify_fsbno(cur->bc_mp, fsbno); -} + return -EFSCORRUPTED; -/* Check that this short pointer is valid and points within the AG. */ -bool -xfs_btree_check_sptr( - struct xfs_btree_cur *cur, - xfs_agblock_t agbno, - int level) -{ - if (level <= 0) - return false; - return xfs_verify_agbno(cur->bc_ag.pag, agbno); + if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) { + if (!xfs_verify_fsbno(cur->bc_mp, + be64_to_cpu((&ptr->l)[index]))) + return -EFSCORRUPTED; + } else { + if (!xfs_verify_agbno(cur->bc_ag.pag, + be32_to_cpu((&ptr->s)[index]))) + return -EFSCORRUPTED; + } + + return 0; } /* @@ -277,27 +276,26 @@ xfs_btree_check_ptr( int index, int level) { - if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) { - if (xfs_btree_check_lptr(cur, be64_to_cpu((&ptr->l)[index]), - level)) - return 0; - xfs_err(cur->bc_mp, + int error; + + error = __xfs_btree_check_ptr(cur, ptr, index, level); + if (error) { + if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) { + xfs_err(cur->bc_mp, "Inode %llu fork %d: Corrupt %sbt pointer at level %d index %d.", cur->bc_ino.ip->i_ino, cur->bc_ino.whichfork, cur->bc_ops->name, level, index); - } else { - if (xfs_btree_check_sptr(cur, be32_to_cpu((&ptr->s)[index]), - level)) - return 0; - xfs_err(cur->bc_mp, + } else { + xfs_err(cur->bc_mp, "AG %u: Corrupt %sbt pointer at level %d index %d.", cur->bc_ag.pag->pag_agno, cur->bc_ops->name, level, index); + } + xfs_btree_mark_sick(cur); } - xfs_btree_mark_sick(cur); - return -EFSCORRUPTED; + return error; } #ifdef DEBUG diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 9a264ffee303..ca4a305eb071 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -343,6 +343,9 @@ xfs_failaddr_t __xfs_btree_check_lblock(struct xfs_btree_cur *cur, xfs_failaddr_t __xfs_btree_check_sblock(struct xfs_btree_cur *cur, struct xfs_btree_block *block, int level, struct xfs_buf *bp); +int __xfs_btree_check_ptr(struct xfs_btree_cur *cur, + const union xfs_btree_ptr *ptr, int index, int level); + /* * Check that block header is ok. */ @@ -353,24 +356,6 @@ xfs_btree_check_block( int level, /* level of the btree block */ struct xfs_buf *bp); /* buffer containing block, if any */ -/* - * Check that (long) pointer is ok. - */ -bool /* error (0 or EFSCORRUPTED) */ -xfs_btree_check_lptr( - struct xfs_btree_cur *cur, /* btree cursor */ - xfs_fsblock_t fsbno, /* btree block disk address */ - int level); /* btree block level */ - -/* - * Check that (short) pointer is ok. 
- */ -bool /* error (0 or EFSCORRUPTED) */ -xfs_btree_check_sptr( - struct xfs_btree_cur *cur, /* btree cursor */ - xfs_agblock_t agbno, /* btree block disk address */ - int level); /* btree block level */ - /* * Delete the btree cursor. */ diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c index 187d692a0b58..6fe5dae06f23 100644 --- a/fs/xfs/scrub/btree.c +++ b/fs/xfs/scrub/btree.c @@ -236,22 +236,18 @@ xchk_btree_ptr_ok( int level, union xfs_btree_ptr *ptr) { - bool res; - /* A btree rooted in an inode has no block pointer to the root. */ if (bs->cur->bc_ops->type == XFS_BTREE_TYPE_INODE && level == bs->cur->bc_nlevels) return true; /* Otherwise, check the pointers. */ - if (bs->cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) - res = xfs_btree_check_lptr(bs->cur, be64_to_cpu(ptr->l), level); - else - res = xfs_btree_check_sptr(bs->cur, be32_to_cpu(ptr->s), level); - if (!res) + if (__xfs_btree_check_ptr(bs->cur, ptr, 0, level)) { xchk_btree_set_corrupt(bs->sc, bs->cur, level); + return false; + } - return res; + return true; } /* Check that a btree block's sibling matches what we expect it. */ -- cgit v1.2.3 From 43be09192ce1f3cf9c3d2073e822a1d0a42fe5b2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 22 Feb 2024 12:40:55 -0800 Subject: xfs: misc cleanups for __xfs_btree_check_sblock Remove the local crc variable that is only used once and remove the bp NULL checking as it can't ever be NULL for short form blocks. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_btree.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 1a0816aa5009..255d5437b6e2 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -173,15 +173,13 @@ __xfs_btree_check_sblock( { struct xfs_mount *mp = cur->bc_mp; struct xfs_perag *pag = cur->bc_ag.pag; - bool crc = xfs_has_crc(mp); xfs_failaddr_t fa; - xfs_agblock_t agbno = NULLAGBLOCK; + xfs_agblock_t agbno; - if (crc) { + if (xfs_has_crc(mp)) { if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; - if (block->bb_u.s.bb_blkno != - cpu_to_be64(bp ? xfs_buf_daddr(bp) : XFS_BUF_DADDR_NULL)) + if (block->bb_u.s.bb_blkno != cpu_to_be64(xfs_buf_daddr(bp))) return __this_address; } @@ -193,9 +191,7 @@ __xfs_btree_check_sblock( cur->bc_ops->get_maxrecs(cur, level)) return __this_address; - if (bp) - agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp)); - + agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp)); fa = xfs_btree_check_sblock_siblings(pag, agbno, block->bb_u.s.bb_leftsib); if (!fa) -- cgit v1.2.3 From bd45019d9aa942d1c2457d96a7dbf2ad3051754b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 22 Feb 2024 12:40:56 -0800 Subject: xfs: remove the crc variable in __xfs_btree_check_lblock crc is only used once, just use the xfs_has_crc check directly. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. 
Wong --- fs/xfs/libxfs/xfs_btree.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 255d5437b6e2..4a6bc66c14d0 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -106,11 +106,10 @@ __xfs_btree_check_lblock( struct xfs_buf *bp) { struct xfs_mount *mp = cur->bc_mp; - bool crc = xfs_has_crc(mp); xfs_failaddr_t fa; xfs_fsblock_t fsb = NULLFSBLOCK; - if (crc) { + if (xfs_has_crc(mp)) { if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; if (block->bb_u.l.bb_blkno != -- cgit v1.2.3 From d477f1749f00899c71605ea01aba0ce67e030471 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 22 Feb 2024 12:40:57 -0800 Subject: xfs: tighten up validation of root block in inode forks Check that root blocks that sit in the inode fork and thus have a NULL bp don't have siblings. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_btree.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 4a6bc66c14d0..f5d7c4236963 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -107,7 +107,7 @@ __xfs_btree_check_lblock( { struct xfs_mount *mp = cur->bc_mp; xfs_failaddr_t fa; - xfs_fsblock_t fsb = NULLFSBLOCK; + xfs_fsblock_t fsb; if (xfs_has_crc(mp)) { if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid)) @@ -127,9 +127,19 @@ __xfs_btree_check_lblock( cur->bc_ops->get_maxrecs(cur, level)) return __this_address; - if (bp) - fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp)); + /* + * For inode-rooted btrees, the root block sits in the inode fork. In + * that case bp is NULL, and the block must not have any siblings. + */ + if (!bp) { + if (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLFSBLOCK)) + return __this_address; + if (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK)) + return __this_address; + return NULL; + } + fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp)); fa = xfs_btree_check_lblock_siblings(mp, fsb, block->bb_u.l.bb_leftsib); if (!fa) fa = xfs_btree_check_lblock_siblings(mp, fsb, -- cgit v1.2.3 From 4ce0c711d9ab3a435bc605cd2f36a3f6b4e12c05 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 22 Feb 2024 12:40:57 -0800 Subject: xfs: consolidate btree block verification Add a __xfs_btree_check_block helper that can be called by the scrub code to validate a btree block of any form, and move the duplicate error handling code from xfs_btree_check_sblock and xfs_btree_check_lblock into xfs_btree_check_block and thus remove these two helpers. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_btree.c | 72 ++++++++++++++++++++--------------------------- fs/xfs/libxfs/xfs_btree.h | 9 +----- fs/xfs/scrub/btree.c | 9 +----- 3 files changed, 32 insertions(+), 58 deletions(-) diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index f5d7c4236963..c7bd05ffc4ae 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -98,7 +98,7 @@ xfs_btree_check_sblock_siblings( * Check a long btree block header. Return the address of the failing check, * or NULL if everything is ok. */ -xfs_failaddr_t +static xfs_failaddr_t __xfs_btree_check_lblock( struct xfs_btree_cur *cur, struct xfs_btree_block *block, @@ -147,33 +147,11 @@ __xfs_btree_check_lblock( return fa; } -/* Check a long btree block header. 
*/ -static int -xfs_btree_check_lblock( - struct xfs_btree_cur *cur, - struct xfs_btree_block *block, - int level, - struct xfs_buf *bp) -{ - struct xfs_mount *mp = cur->bc_mp; - xfs_failaddr_t fa; - - fa = __xfs_btree_check_lblock(cur, block, level, bp); - if (XFS_IS_CORRUPT(mp, fa != NULL) || - XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BTREE_CHECK_LBLOCK)) { - if (bp) - trace_xfs_btree_corrupt(bp, _RET_IP_); - xfs_btree_mark_sick(cur); - return -EFSCORRUPTED; - } - return 0; -} - /* * Check a short btree block header. Return the address of the failing check, * or NULL if everything is ok. */ -xfs_failaddr_t +static xfs_failaddr_t __xfs_btree_check_sblock( struct xfs_btree_cur *cur, struct xfs_btree_block *block, @@ -209,26 +187,28 @@ __xfs_btree_check_sblock( return fa; } -/* Check a short btree block header. */ -STATIC int -xfs_btree_check_sblock( +/* + * Internal btree block check. + * + * Return NULL if the block is ok or the address of the failed check otherwise. + */ +xfs_failaddr_t +__xfs_btree_check_block( struct xfs_btree_cur *cur, struct xfs_btree_block *block, int level, struct xfs_buf *bp) { - struct xfs_mount *mp = cur->bc_mp; - xfs_failaddr_t fa; + if (cur->bc_ops->ptr_len == XFS_BTREE_SHORT_PTR_LEN) + return __xfs_btree_check_sblock(cur, block, level, bp); + return __xfs_btree_check_lblock(cur, block, level, bp); +} - fa = __xfs_btree_check_sblock(cur, block, level, bp); - if (XFS_IS_CORRUPT(mp, fa != NULL) || - XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BTREE_CHECK_SBLOCK)) { - if (bp) - trace_xfs_btree_corrupt(bp, _RET_IP_); - xfs_btree_mark_sick(cur); - return -EFSCORRUPTED; - } - return 0; +static inline unsigned int xfs_btree_block_errtag(struct xfs_btree_cur *cur) +{ + if (cur->bc_ops->ptr_len == XFS_BTREE_SHORT_PTR_LEN) + return XFS_ERRTAG_BTREE_CHECK_SBLOCK; + return XFS_ERRTAG_BTREE_CHECK_LBLOCK; } /* @@ -241,10 +221,18 @@ xfs_btree_check_block( int level, /* level of the btree block */ struct xfs_buf *bp) /* buffer containing block, if any */ { - if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) - return xfs_btree_check_lblock(cur, block, level, bp); - else - return xfs_btree_check_sblock(cur, block, level, bp); + struct xfs_mount *mp = cur->bc_mp; + xfs_failaddr_t fa; + + fa = __xfs_btree_check_block(cur, block, level, bp); + if (XFS_IS_CORRUPT(mp, fa != NULL) || + XFS_TEST_ERROR(false, mp, xfs_btree_block_errtag(cur))) { + if (bp) + trace_xfs_btree_corrupt(bp, _RET_IP_); + xfs_btree_mark_sick(cur); + return -EFSCORRUPTED; + } + return 0; } int diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index ca4a305eb071..d3afa6209ff8 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -334,15 +334,8 @@ xfs_btree_cur_sizeof(unsigned int nlevels) */ #define XFS_BUF_TO_BLOCK(bp) ((struct xfs_btree_block *)((bp)->b_addr)) -/* - * Internal long and short btree block checks. They return NULL if the - * block is ok or the address of the failed check otherwise. 
- */ -xfs_failaddr_t __xfs_btree_check_lblock(struct xfs_btree_cur *cur, +xfs_failaddr_t __xfs_btree_check_block(struct xfs_btree_cur *cur, struct xfs_btree_block *block, int level, struct xfs_buf *bp); -xfs_failaddr_t __xfs_btree_check_sblock(struct xfs_btree_cur *cur, - struct xfs_btree_block *block, int level, struct xfs_buf *bp); - int __xfs_btree_check_ptr(struct xfs_btree_cur *cur, const union xfs_btree_ptr *ptr, int index, int level); diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c index 6fe5dae06f23..fe678a0438bc 100644 --- a/fs/xfs/scrub/btree.c +++ b/fs/xfs/scrub/btree.c @@ -584,7 +584,6 @@ xchk_btree_get_block( struct xfs_btree_block **pblock, struct xfs_buf **pbp) { - xfs_failaddr_t failed_at; int error; *pblock = NULL; @@ -596,13 +595,7 @@ xchk_btree_get_block( return error; xfs_btree_get_block(bs->cur, level, pbp); - if (bs->cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) - failed_at = __xfs_btree_check_lblock(bs->cur, *pblock, - level, *pbp); - else - failed_at = __xfs_btree_check_sblock(bs->cur, *pblock, - level, *pbp); - if (failed_at) { + if (__xfs_btree_check_block(bs->cur, *pblock, level, *pbp)) { xchk_btree_set_corrupt(bs->sc, bs->cur, level); return 0; } -- cgit v1.2.3 From 5ef819c34f954fccfc42f79b9b0bea9b40cef9a1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 22 Feb 2024 12:40:58 -0800 Subject: xfs: rename btree helpers that depends on the block number representation All these helpers hardcode fsblocks or agblocks and not just the pointer size. Rename them so that the names are still fitting when we add the long format in-memory blocks and adjust the checks when calling them to check the btree types and not just pointer length. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_alloc_btree.c | 8 ++--- fs/xfs/libxfs/xfs_bmap_btree.c | 8 ++--- fs/xfs/libxfs/xfs_btree.c | 64 ++++++++++++++++++++------------------ fs/xfs/libxfs/xfs_btree.h | 16 +++++----- fs/xfs/libxfs/xfs_ialloc_btree.c | 8 ++--- fs/xfs/libxfs/xfs_refcount_btree.c | 8 ++--- fs/xfs/libxfs/xfs_rmap_btree.c | 8 ++--- 7 files changed, 61 insertions(+), 59 deletions(-) diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index 885c7db5d6e7..6ef5ddd89600 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -322,7 +322,7 @@ xfs_allocbt_verify( return __this_address; if (xfs_has_crc(mp)) { - fa = xfs_btree_sblock_v5hdr_verify(bp); + fa = xfs_btree_agblock_v5hdr_verify(bp); if (fa) return fa; } @@ -362,7 +362,7 @@ xfs_allocbt_verify( } else if (level >= mp->m_alloc_maxlevels) return __this_address; - return xfs_btree_sblock_verify(bp, mp->m_alloc_mxr[level != 0]); + return xfs_btree_agblock_verify(bp, mp->m_alloc_mxr[level != 0]); } static void @@ -371,7 +371,7 @@ xfs_allocbt_read_verify( { xfs_failaddr_t fa; - if (!xfs_btree_sblock_verify_crc(bp)) + if (!xfs_btree_agblock_verify_crc(bp)) xfs_verifier_error(bp, -EFSBADCRC, __this_address); else { fa = xfs_allocbt_verify(bp); @@ -395,7 +395,7 @@ xfs_allocbt_write_verify( xfs_verifier_error(bp, -EFSCORRUPTED, fa); return; } - xfs_btree_sblock_calc_crc(bp); + xfs_btree_agblock_calc_crc(bp); } diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index 54fdf0df8ec3..f5d84dcb58da 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -420,7 +420,7 @@ xfs_bmbt_verify( * XXX: need a better way of verifying the owner here. Right now * just make sure there has been one set. 
*/ - fa = xfs_btree_lblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN); + fa = xfs_btree_fsblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN); if (fa) return fa; } @@ -436,7 +436,7 @@ xfs_bmbt_verify( if (level > max(mp->m_bm_maxlevels[0], mp->m_bm_maxlevels[1])) return __this_address; - return xfs_btree_lblock_verify(bp, mp->m_bmap_dmxr[level != 0]); + return xfs_btree_fsblock_verify(bp, mp->m_bmap_dmxr[level != 0]); } static void @@ -445,7 +445,7 @@ xfs_bmbt_read_verify( { xfs_failaddr_t fa; - if (!xfs_btree_lblock_verify_crc(bp)) + if (!xfs_btree_fsblock_verify_crc(bp)) xfs_verifier_error(bp, -EFSBADCRC, __this_address); else { fa = xfs_bmbt_verify(bp); @@ -469,7 +469,7 @@ xfs_bmbt_write_verify( xfs_verifier_error(bp, -EFSCORRUPTED, fa); return; } - xfs_btree_lblock_calc_crc(bp); + xfs_btree_fsblock_calc_crc(bp); } const struct xfs_buf_ops xfs_bmbt_buf_ops = { diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index c7bd05ffc4ae..96c1c6968988 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -57,7 +57,7 @@ xfs_btree_magic( * bytes. */ static inline xfs_failaddr_t -xfs_btree_check_lblock_siblings( +xfs_btree_check_fsblock_siblings( struct xfs_mount *mp, xfs_fsblock_t fsb, __be64 dsibling) @@ -76,7 +76,7 @@ xfs_btree_check_lblock_siblings( } static inline xfs_failaddr_t -xfs_btree_check_sblock_siblings( +xfs_btree_check_agblock_siblings( struct xfs_perag *pag, xfs_agblock_t agbno, __be32 dsibling) @@ -99,7 +99,7 @@ xfs_btree_check_sblock_siblings( * or NULL if everything is ok. */ static xfs_failaddr_t -__xfs_btree_check_lblock( +__xfs_btree_check_fsblock( struct xfs_btree_cur *cur, struct xfs_btree_block *block, int level, @@ -140,9 +140,10 @@ __xfs_btree_check_lblock( } fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp)); - fa = xfs_btree_check_lblock_siblings(mp, fsb, block->bb_u.l.bb_leftsib); + fa = xfs_btree_check_fsblock_siblings(mp, fsb, + block->bb_u.l.bb_leftsib); if (!fa) - fa = xfs_btree_check_lblock_siblings(mp, fsb, + fa = xfs_btree_check_fsblock_siblings(mp, fsb, block->bb_u.l.bb_rightsib); return fa; } @@ -152,7 +153,7 @@ __xfs_btree_check_lblock( * or NULL if everything is ok. 
*/ static xfs_failaddr_t -__xfs_btree_check_sblock( +__xfs_btree_check_agblock( struct xfs_btree_cur *cur, struct xfs_btree_block *block, int level, @@ -179,10 +180,10 @@ __xfs_btree_check_sblock( return __this_address; agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp)); - fa = xfs_btree_check_sblock_siblings(pag, agbno, + fa = xfs_btree_check_agblock_siblings(pag, agbno, block->bb_u.s.bb_leftsib); if (!fa) - fa = xfs_btree_check_sblock_siblings(pag, agbno, + fa = xfs_btree_check_agblock_siblings(pag, agbno, block->bb_u.s.bb_rightsib); return fa; } @@ -199,9 +200,9 @@ __xfs_btree_check_block( int level, struct xfs_buf *bp) { - if (cur->bc_ops->ptr_len == XFS_BTREE_SHORT_PTR_LEN) - return __xfs_btree_check_sblock(cur, block, level, bp); - return __xfs_btree_check_lblock(cur, block, level, bp); + if (cur->bc_ops->type == XFS_BTREE_TYPE_AG) + return __xfs_btree_check_agblock(cur, block, level, bp); + return __xfs_btree_check_fsblock(cur, block, level, bp); } static inline unsigned int xfs_btree_block_errtag(struct xfs_btree_cur *cur) @@ -245,7 +246,7 @@ __xfs_btree_check_ptr( if (level <= 0) return -EFSCORRUPTED; - if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) { + if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) { if (!xfs_verify_fsbno(cur->bc_mp, be64_to_cpu((&ptr->l)[index]))) return -EFSCORRUPTED; @@ -273,7 +274,7 @@ xfs_btree_check_ptr( error = __xfs_btree_check_ptr(cur, ptr, index, level); if (error) { - if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) { + if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) { xfs_err(cur->bc_mp, "Inode %llu fork %d: Corrupt %sbt pointer at level %d index %d.", cur->bc_ino.ip->i_ino, @@ -306,7 +307,7 @@ xfs_btree_check_ptr( * it to disk. */ void -xfs_btree_lblock_calc_crc( +xfs_btree_fsblock_calc_crc( struct xfs_buf *bp) { struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); @@ -320,7 +321,7 @@ xfs_btree_lblock_calc_crc( } bool -xfs_btree_lblock_verify_crc( +xfs_btree_fsblock_verify_crc( struct xfs_buf *bp) { struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); @@ -344,7 +345,7 @@ xfs_btree_lblock_verify_crc( * it to disk. */ void -xfs_btree_sblock_calc_crc( +xfs_btree_agblock_calc_crc( struct xfs_buf *bp) { struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); @@ -358,7 +359,7 @@ xfs_btree_sblock_calc_crc( } bool -xfs_btree_sblock_verify_crc( +xfs_btree_agblock_verify_crc( struct xfs_buf *bp) { struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); @@ -913,7 +914,7 @@ xfs_btree_reada_bufs( } STATIC int -xfs_btree_readahead_lblock( +xfs_btree_readahead_fsblock( struct xfs_btree_cur *cur, int lr, struct xfs_btree_block *block) @@ -938,7 +939,7 @@ xfs_btree_readahead_lblock( } STATIC int -xfs_btree_readahead_sblock( +xfs_btree_readahead_agblock( struct xfs_btree_cur *cur, int lr, struct xfs_btree_block *block) @@ -989,8 +990,8 @@ xfs_btree_readahead( block = XFS_BUF_TO_BLOCK(cur->bc_levels[lev].bp); if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) - return xfs_btree_readahead_lblock(cur, lr, block); - return xfs_btree_readahead_sblock(cur, lr, block); + return xfs_btree_readahead_fsblock(cur, lr, block); + return xfs_btree_readahead_agblock(cur, lr, block); } STATIC int @@ -4597,7 +4598,7 @@ xfs_btree_change_owner( /* Verify the v5 fields of a long-format btree block. */ xfs_failaddr_t -xfs_btree_lblock_v5hdr_verify( +xfs_btree_fsblock_v5hdr_verify( struct xfs_buf *bp, uint64_t owner) { @@ -4618,7 +4619,7 @@ xfs_btree_lblock_v5hdr_verify( /* Verify a long-format btree block. 
*/ xfs_failaddr_t -xfs_btree_lblock_verify( +xfs_btree_fsblock_verify( struct xfs_buf *bp, unsigned int max_recs) { @@ -4633,21 +4634,22 @@ xfs_btree_lblock_verify( /* sibling pointer verification */ fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp)); - fa = xfs_btree_check_lblock_siblings(mp, fsb, block->bb_u.l.bb_leftsib); + fa = xfs_btree_check_fsblock_siblings(mp, fsb, + block->bb_u.l.bb_leftsib); if (!fa) - fa = xfs_btree_check_lblock_siblings(mp, fsb, + fa = xfs_btree_check_fsblock_siblings(mp, fsb, block->bb_u.l.bb_rightsib); return fa; } /** - * xfs_btree_sblock_v5hdr_verify() -- verify the v5 fields of a short-format + * xfs_btree_agblock_v5hdr_verify() -- verify the v5 fields of a short-format * btree block * * @bp: buffer containing the btree block */ xfs_failaddr_t -xfs_btree_sblock_v5hdr_verify( +xfs_btree_agblock_v5hdr_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_mount; @@ -4666,13 +4668,13 @@ xfs_btree_sblock_v5hdr_verify( } /** - * xfs_btree_sblock_verify() -- verify a short-format btree block + * xfs_btree_agblock_verify() -- verify a short-format btree block * * @bp: buffer containing the btree block * @max_recs: maximum records allowed in this btree node */ xfs_failaddr_t -xfs_btree_sblock_verify( +xfs_btree_agblock_verify( struct xfs_buf *bp, unsigned int max_recs) { @@ -4687,10 +4689,10 @@ xfs_btree_sblock_verify( /* sibling pointer verification */ agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp)); - fa = xfs_btree_check_sblock_siblings(bp->b_pag, agbno, + fa = xfs_btree_check_agblock_siblings(bp->b_pag, agbno, block->bb_u.s.bb_leftsib); if (!fa) - fa = xfs_btree_check_sblock_siblings(bp->b_pag, agbno, + fa = xfs_btree_check_agblock_siblings(bp->b_pag, agbno, block->bb_u.s.bb_rightsib); return fa; } diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index d3afa6209ff8..b9b46a573e64 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -441,10 +441,10 @@ int xfs_btree_change_owner(struct xfs_btree_cur *cur, uint64_t new_owner, /* * btree block CRC helpers */ -void xfs_btree_lblock_calc_crc(struct xfs_buf *); -bool xfs_btree_lblock_verify_crc(struct xfs_buf *); -void xfs_btree_sblock_calc_crc(struct xfs_buf *); -bool xfs_btree_sblock_verify_crc(struct xfs_buf *); +void xfs_btree_fsblock_calc_crc(struct xfs_buf *); +bool xfs_btree_fsblock_verify_crc(struct xfs_buf *); +void xfs_btree_agblock_calc_crc(struct xfs_buf *); +bool xfs_btree_agblock_verify_crc(struct xfs_buf *); /* * Internal btree helpers also used by xfs_bmap.c. 
@@ -484,12 +484,12 @@ static inline int xfs_btree_get_level(const struct xfs_btree_block *block) #define XFS_FILBLKS_MIN(a,b) min_t(xfs_filblks_t, (a), (b)) #define XFS_FILBLKS_MAX(a,b) max_t(xfs_filblks_t, (a), (b)) -xfs_failaddr_t xfs_btree_sblock_v5hdr_verify(struct xfs_buf *bp); -xfs_failaddr_t xfs_btree_sblock_verify(struct xfs_buf *bp, +xfs_failaddr_t xfs_btree_agblock_v5hdr_verify(struct xfs_buf *bp); +xfs_failaddr_t xfs_btree_agblock_verify(struct xfs_buf *bp, unsigned int max_recs); -xfs_failaddr_t xfs_btree_lblock_v5hdr_verify(struct xfs_buf *bp, +xfs_failaddr_t xfs_btree_fsblock_v5hdr_verify(struct xfs_buf *bp, uint64_t owner); -xfs_failaddr_t xfs_btree_lblock_verify(struct xfs_buf *bp, +xfs_failaddr_t xfs_btree_fsblock_verify(struct xfs_buf *bp, unsigned int max_recs); unsigned int xfs_btree_compute_maxlevels(const unsigned int *limits, diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index 74f144b2db68..cc661fca6ff5 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -309,7 +309,7 @@ xfs_inobt_verify( * xfs_perag_initialised_agi(pag)) if we ever do. */ if (xfs_has_crc(mp)) { - fa = xfs_btree_sblock_v5hdr_verify(bp); + fa = xfs_btree_agblock_v5hdr_verify(bp); if (fa) return fa; } @@ -319,7 +319,7 @@ xfs_inobt_verify( if (level >= M_IGEO(mp)->inobt_maxlevels) return __this_address; - return xfs_btree_sblock_verify(bp, + return xfs_btree_agblock_verify(bp, M_IGEO(mp)->inobt_mxr[level != 0]); } @@ -329,7 +329,7 @@ xfs_inobt_read_verify( { xfs_failaddr_t fa; - if (!xfs_btree_sblock_verify_crc(bp)) + if (!xfs_btree_agblock_verify_crc(bp)) xfs_verifier_error(bp, -EFSBADCRC, __this_address); else { fa = xfs_inobt_verify(bp); @@ -353,7 +353,7 @@ xfs_inobt_write_verify( xfs_verifier_error(bp, -EFSCORRUPTED, fa); return; } - xfs_btree_sblock_calc_crc(bp); + xfs_btree_agblock_calc_crc(bp); } diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c index f93dae3db701..ca59f6c89f3e 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.c +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -217,7 +217,7 @@ xfs_refcountbt_verify( if (!xfs_has_reflink(mp)) return __this_address; - fa = xfs_btree_sblock_v5hdr_verify(bp); + fa = xfs_btree_agblock_v5hdr_verify(bp); if (fa) return fa; @@ -239,7 +239,7 @@ xfs_refcountbt_verify( } else if (level >= mp->m_refc_maxlevels) return __this_address; - return xfs_btree_sblock_verify(bp, mp->m_refc_mxr[level != 0]); + return xfs_btree_agblock_verify(bp, mp->m_refc_mxr[level != 0]); } STATIC void @@ -248,7 +248,7 @@ xfs_refcountbt_read_verify( { xfs_failaddr_t fa; - if (!xfs_btree_sblock_verify_crc(bp)) + if (!xfs_btree_agblock_verify_crc(bp)) xfs_verifier_error(bp, -EFSBADCRC, __this_address); else { fa = xfs_refcountbt_verify(bp); @@ -272,7 +272,7 @@ xfs_refcountbt_write_verify( xfs_verifier_error(bp, -EFSCORRUPTED, fa); return; } - xfs_btree_sblock_calc_crc(bp); + xfs_btree_agblock_calc_crc(bp); } diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index b1ecc061fdc9..0751268c102c 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -336,7 +336,7 @@ xfs_rmapbt_verify( if (!xfs_has_rmapbt(mp)) return __this_address; - fa = xfs_btree_sblock_v5hdr_verify(bp); + fa = xfs_btree_agblock_v5hdr_verify(bp); if (fa) return fa; @@ -347,7 +347,7 @@ xfs_rmapbt_verify( } else if (level >= mp->m_rmap_maxlevels) return __this_address; - return xfs_btree_sblock_verify(bp, mp->m_rmap_mxr[level != 0]); + return xfs_btree_agblock_verify(bp, 
mp->m_rmap_mxr[level != 0]); } static void @@ -356,7 +356,7 @@ xfs_rmapbt_read_verify( { xfs_failaddr_t fa; - if (!xfs_btree_sblock_verify_crc(bp)) + if (!xfs_btree_agblock_verify_crc(bp)) xfs_verifier_error(bp, -EFSBADCRC, __this_address); else { fa = xfs_rmapbt_verify(bp); @@ -380,7 +380,7 @@ xfs_rmapbt_write_verify( xfs_verifier_error(bp, -EFSCORRUPTED, fa); return; } - xfs_btree_sblock_calc_crc(bp); + xfs_btree_agblock_calc_crc(bp); } -- cgit v1.2.3 From 79e72304dcba471e5c0dea2f3c67fe1a0558c140 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 22 Feb 2024 12:40:59 -0800 Subject: xfs: factor out a __xfs_btree_check_lblock_hdr helper This will allow sharing code with the in-memory block checking helper. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_btree.c | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 96c1c6968988..b228b22893fa 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -94,20 +94,14 @@ xfs_btree_check_agblock_siblings( return NULL; } -/* - * Check a long btree block header. Return the address of the failing check, - * or NULL if everything is ok. - */ static xfs_failaddr_t -__xfs_btree_check_fsblock( +__xfs_btree_check_lblock_hdr( struct xfs_btree_cur *cur, struct xfs_btree_block *block, int level, struct xfs_buf *bp) { struct xfs_mount *mp = cur->bc_mp; - xfs_failaddr_t fa; - xfs_fsblock_t fsb; if (xfs_has_crc(mp)) { if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid)) @@ -127,6 +121,28 @@ __xfs_btree_check_fsblock( cur->bc_ops->get_maxrecs(cur, level)) return __this_address; + return NULL; +} + +/* + * Check a long btree block header. Return the address of the failing check, + * or NULL if everything is ok. + */ +static xfs_failaddr_t +__xfs_btree_check_fsblock( + struct xfs_btree_cur *cur, + struct xfs_btree_block *block, + int level, + struct xfs_buf *bp) +{ + struct xfs_mount *mp = cur->bc_mp; + xfs_failaddr_t fa; + xfs_fsblock_t fsb; + + fa = __xfs_btree_check_lblock_hdr(cur, block, level, bp); + if (fa) + return fa; + /* * For inode-rooted btrees, the root block sits in the inode fork. In * that case bp is NULL, and the block must not have any siblings. -- cgit v1.2.3 From 5eec8fa30dfa548d07332756101053f47f6ba26c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 22 Feb 2024 12:41:00 -0800 Subject: xfs: remove xfs_btree_reada_bufl xfs_btree_reada_bufl just wraps xfs_buf_readahead and a fsblock to daddr conversion. Just open code its two callsites in the only caller. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_btree.c | 30 ++++++------------------------ fs/xfs/libxfs/xfs_btree.h | 11 ----------- 2 files changed, 6 insertions(+), 35 deletions(-) diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index b228b22893fa..2689cad9b25a 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -889,25 +889,6 @@ xfs_btree_read_bufl( return 0; } -/* - * Read-ahead the block, don't wait for it, don't return a buffer. - * Long-form addressing. 
- */ -/* ARGSUSED */ -void -xfs_btree_reada_bufl( - struct xfs_mount *mp, /* file system mount point */ - xfs_fsblock_t fsbno, /* file system block number */ - xfs_extlen_t count, /* count of filesystem blocks */ - const struct xfs_buf_ops *ops) -{ - xfs_daddr_t d; - - ASSERT(fsbno != NULLFSBLOCK); - d = XFS_FSB_TO_DADDR(mp, fsbno); - xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops); -} - /* * Read-ahead the block, don't wait for it, don't return a buffer. * Short-form addressing. @@ -935,19 +916,20 @@ xfs_btree_readahead_fsblock( int lr, struct xfs_btree_block *block) { - int rval = 0; + struct xfs_mount *mp = cur->bc_mp; xfs_fsblock_t left = be64_to_cpu(block->bb_u.l.bb_leftsib); xfs_fsblock_t right = be64_to_cpu(block->bb_u.l.bb_rightsib); + int rval = 0; if ((lr & XFS_BTCUR_LEFTRA) && left != NULLFSBLOCK) { - xfs_btree_reada_bufl(cur->bc_mp, left, 1, - cur->bc_ops->buf_ops); + xfs_buf_readahead(mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, left), + mp->m_bsize, cur->bc_ops->buf_ops); rval++; } if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLFSBLOCK) { - xfs_btree_reada_bufl(cur->bc_mp, right, 1, - cur->bc_ops->buf_ops); + xfs_buf_readahead(mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, right), + mp->m_bsize, cur->bc_ops->buf_ops); rval++; } diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index b9b46a573e64..001ff9392804 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -391,17 +391,6 @@ xfs_btree_read_bufl( int refval, /* ref count value for buffer */ const struct xfs_buf_ops *ops); -/* - * Read-ahead the block, don't wait for it, don't return a buffer. - * Long-form addressing. - */ -void /* error */ -xfs_btree_reada_bufl( - struct xfs_mount *mp, /* file system mount point */ - xfs_fsblock_t fsbno, /* file system block number */ - xfs_extlen_t count, /* count of filesystem blocks */ - const struct xfs_buf_ops *ops); - /* * Read-ahead the block, don't wait for it, don't return a buffer. * Short-form addressing. -- cgit v1.2.3 From 6324b00c9ecb8d11a157d2a4bc3e5a495534bdf1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 22 Feb 2024 12:41:01 -0800 Subject: xfs: remove xfs_btree_reada_bufs xfs_btree_reada_bufs just wraps xfs_buf_readahead and an agblock to daddr conversion. Just open code its three callsites in the two callers (one of which isn't even btree related). Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_btree.c | 38 ++++++++++----------------------------- fs/xfs/libxfs/xfs_btree.h | 12 ------------ fs/xfs/xfs_iwalk.c | 6 ++++-- 3 files changed, 14 insertions(+), 42 deletions(-) diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 2689cad9b25a..0f06ee026d9a 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -889,27 +889,6 @@ xfs_btree_read_bufl( return 0; } -/* - * Read-ahead the block, don't wait for it, don't return a buffer. - * Short-form addressing. 
- */ -/* ARGSUSED */ -void -xfs_btree_reada_bufs( - struct xfs_mount *mp, /* file system mount point */ - xfs_agnumber_t agno, /* allocation group number */ - xfs_agblock_t agbno, /* allocation group block number */ - xfs_extlen_t count, /* count of filesystem blocks */ - const struct xfs_buf_ops *ops) -{ - xfs_daddr_t d; - - ASSERT(agno != NULLAGNUMBER); - ASSERT(agbno != NULLAGBLOCK); - d = XFS_AGB_TO_DADDR(mp, agno, agbno); - xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops); -} - STATIC int xfs_btree_readahead_fsblock( struct xfs_btree_cur *cur, @@ -940,22 +919,25 @@ STATIC int xfs_btree_readahead_agblock( struct xfs_btree_cur *cur, int lr, - struct xfs_btree_block *block) + struct xfs_btree_block *block) { - int rval = 0; + struct xfs_mount *mp = cur->bc_mp; + xfs_agnumber_t agno = cur->bc_ag.pag->pag_agno; xfs_agblock_t left = be32_to_cpu(block->bb_u.s.bb_leftsib); xfs_agblock_t right = be32_to_cpu(block->bb_u.s.bb_rightsib); - + int rval = 0; if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) { - xfs_btree_reada_bufs(cur->bc_mp, cur->bc_ag.pag->pag_agno, - left, 1, cur->bc_ops->buf_ops); + xfs_buf_readahead(mp->m_ddev_targp, + XFS_AGB_TO_DADDR(mp, agno, left), + mp->m_bsize, cur->bc_ops->buf_ops); rval++; } if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) { - xfs_btree_reada_bufs(cur->bc_mp, cur->bc_ag.pag->pag_agno, - right, 1, cur->bc_ops->buf_ops); + xfs_buf_readahead(mp->m_ddev_targp, + XFS_AGB_TO_DADDR(mp, agno, right), + mp->m_bsize, cur->bc_ops->buf_ops); rval++; } diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 001ff9392804..c48b4fdebafa 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -391,18 +391,6 @@ xfs_btree_read_bufl( int refval, /* ref count value for buffer */ const struct xfs_buf_ops *ops); -/* - * Read-ahead the block, don't wait for it, don't return a buffer. - * Short-form addressing. - */ -void /* error */ -xfs_btree_reada_bufs( - struct xfs_mount *mp, /* file system mount point */ - xfs_agnumber_t agno, /* allocation group number */ - xfs_agblock_t agbno, /* allocation group block number */ - xfs_extlen_t count, /* count of filesystem blocks */ - const struct xfs_buf_ops *ops); - /* * Initialise a new btree block header */ diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c index b6a7751e7c36..01b55f03a102 100644 --- a/fs/xfs/xfs_iwalk.c +++ b/fs/xfs/xfs_iwalk.c @@ -100,6 +100,7 @@ xfs_iwalk_ichunk_ra( struct xfs_inobt_rec_incore *irec) { struct xfs_ino_geometry *igeo = M_IGEO(mp); + xfs_agnumber_t agno = pag->pag_agno; xfs_agblock_t agbno; struct blk_plug plug; int i; /* inode chunk index */ @@ -112,8 +113,9 @@ xfs_iwalk_ichunk_ra( imask = xfs_inobt_maskn(i, igeo->inodes_per_cluster); if (imask & ~irec->ir_free) { - xfs_btree_reada_bufs(mp, pag->pag_agno, agbno, - igeo->blocks_per_cluster, + xfs_buf_readahead(mp->m_ddev_targp, + XFS_AGB_TO_DADDR(mp, agno, agbno), + igeo->blocks_per_cluster * mp->m_bsize, &xfs_inode_buf_ops); } agbno += igeo->blocks_per_cluster; -- cgit v1.2.3 From 6a701eb8fbbb5f500684947883fd77ed0475fa82 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 22 Feb 2024 12:41:01 -0800 Subject: xfs: move and rename xfs_btree_read_bufl Despite its name, xfs_btree_read_bufl doesn't contain any btree-related functionality and isn't used by the btree code. Move it to xfs_bmap.c, hard code the refval and ops arguments and rename it to xfs_bmap_read_buf. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. 
Wong --- fs/xfs/libxfs/xfs_bmap.c | 33 +++++++++++++++++++++++++-------- fs/xfs/libxfs/xfs_btree.c | 30 ------------------------------ fs/xfs/libxfs/xfs_btree.h | 13 ------------- 3 files changed, 25 insertions(+), 51 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 5a39fdb5178b..2e0546eb8010 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -226,6 +226,28 @@ xfs_bmap_forkoff_reset( } } +static int +xfs_bmap_read_buf( + struct xfs_mount *mp, /* file system mount point */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_fsblock_t fsbno, /* file system block number */ + struct xfs_buf **bpp) /* buffer for fsbno */ +{ + struct xfs_buf *bp; /* return value */ + int error; + + if (!xfs_verify_fsbno(mp, fsbno)) + return -EFSCORRUPTED; + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, + XFS_FSB_TO_DADDR(mp, fsbno), mp->m_bsize, 0, &bp, + &xfs_bmbt_buf_ops); + if (!error) { + xfs_buf_set_ref(bp, XFS_BMAP_BTREE_REF); + *bpp = bp; + } + return error; +} + #ifdef DEBUG STATIC struct xfs_buf * xfs_bmap_get_bp( @@ -365,9 +387,7 @@ xfs_bmap_check_leaf_extents( bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno)); if (!bp) { bp_release = 1; - error = xfs_btree_read_bufl(mp, NULL, bno, &bp, - XFS_BMAP_BTREE_REF, - &xfs_bmbt_buf_ops); + error = xfs_bmap_read_buf(mp, NULL, bno, &bp); if (xfs_metadata_is_sick(error)) xfs_btree_mark_sick(cur); if (error) @@ -454,9 +474,7 @@ xfs_bmap_check_leaf_extents( bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno)); if (!bp) { bp_release = 1; - error = xfs_btree_read_bufl(mp, NULL, bno, &bp, - XFS_BMAP_BTREE_REF, - &xfs_bmbt_buf_ops); + error = xfs_bmap_read_buf(mp, NULL, bno, &bp); if (xfs_metadata_is_sick(error)) xfs_btree_mark_sick(cur); if (error) @@ -573,8 +591,7 @@ xfs_bmap_btree_to_extents( return -EFSCORRUPTED; } #endif - error = xfs_btree_read_bufl(mp, tp, cbno, &cbp, XFS_BMAP_BTREE_REF, - &xfs_bmbt_buf_ops); + error = xfs_bmap_read_buf(mp, tp, cbno, &cbp); if (xfs_metadata_is_sick(error)) xfs_btree_mark_sick(cur); if (error) diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 0f06ee026d9a..ecdd5ea27a03 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -859,36 +859,6 @@ xfs_btree_offsets( } } -/* - * Get a buffer for the block, return it read in. - * Long-form addressing. - */ -int -xfs_btree_read_bufl( - struct xfs_mount *mp, /* file system mount point */ - struct xfs_trans *tp, /* transaction pointer */ - xfs_fsblock_t fsbno, /* file system block number */ - struct xfs_buf **bpp, /* buffer for fsbno */ - int refval, /* ref count value for buffer */ - const struct xfs_buf_ops *ops) -{ - struct xfs_buf *bp; /* return value */ - xfs_daddr_t d; /* real disk block address */ - int error; - - if (!xfs_verify_fsbno(mp, fsbno)) - return -EFSCORRUPTED; - d = XFS_FSB_TO_DADDR(mp, fsbno); - error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d, - mp->m_bsize, 0, &bp, ops); - if (error) - return error; - if (bp) - xfs_buf_set_ref(bp, refval); - *bpp = bp; - return 0; -} - STATIC int xfs_btree_readahead_fsblock( struct xfs_btree_cur *cur, diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index c48b4fdebafa..bacd67cc8ced 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -378,19 +378,6 @@ xfs_btree_offsets( int *first, /* output: first byte offset */ int *last); /* output: last byte offset */ -/* - * Get a buffer for the block, return it read in. - * Long-form addressing. 
- */ -int /* error */ -xfs_btree_read_bufl( - struct xfs_mount *mp, /* file system mount point */ - struct xfs_trans *tp, /* transaction pointer */ - xfs_fsblock_t fsbno, /* file system block number */ - struct xfs_buf **bpp, /* buffer for fsbno */ - int refval, /* ref count value for buffer */ - const struct xfs_buf_ops *ops); - /* * Initialise a new btree block header */ -- cgit v1.2.3 From 24f755e4854e0fddb78d18f610bf1b5cb61db520 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 22 Feb 2024 12:41:02 -0800 Subject: xfs: split xfs_buf_rele for cached vs uncached buffers xfs_buf_rele is a bit confusing because it mixes up handling of normal cached and the special uncached buffers without much explanation. Split the handling into two different helpers, and use a clearly named helper that checks the hash key to distinguish the two cases instead of checking the pag pointer. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_buf.c | 46 +++++++++++++++++++++++++++++++--------------- 1 file changed, 31 insertions(+), 15 deletions(-) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index fb80cae0b2e5..1e85780411d5 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -60,6 +60,11 @@ xfs_buf_submit( return __xfs_buf_submit(bp, !(bp->b_flags & XBF_ASYNC)); } +static inline bool xfs_buf_is_uncached(struct xfs_buf *bp) +{ + return bp->b_rhash_key == XFS_BUF_DADDR_NULL; +} + static inline int xfs_buf_is_vmapped( struct xfs_buf *bp) @@ -996,12 +1001,19 @@ xfs_buf_hold( atomic_inc(&bp->b_hold); } -/* - * Release a hold on the specified buffer. If the hold count is 1, the buffer is - * placed on LRU or freed (depending on b_lru_ref). - */ -void -xfs_buf_rele( +static void +xfs_buf_rele_uncached( + struct xfs_buf *bp) +{ + ASSERT(list_empty(&bp->b_lru)); + if (atomic_dec_and_test(&bp->b_hold)) { + xfs_buf_ioacct_dec(bp); + xfs_buf_free(bp); + } +} + +static void +xfs_buf_rele_cached( struct xfs_buf *bp) { struct xfs_perag *pag = bp->b_pag; @@ -1010,15 +1022,6 @@ xfs_buf_rele( trace_xfs_buf_rele(bp, _RET_IP_); - if (!pag) { - ASSERT(list_empty(&bp->b_lru)); - if (atomic_dec_and_test(&bp->b_hold)) { - xfs_buf_ioacct_dec(bp); - xfs_buf_free(bp); - } - return; - } - ASSERT(atomic_read(&bp->b_hold) > 0); /* @@ -1086,6 +1089,19 @@ out_unlock: xfs_buf_free(bp); } +/* + * Release a hold on the specified buffer. + */ +void +xfs_buf_rele( + struct xfs_buf *bp) +{ + trace_xfs_buf_rele(bp, _RET_IP_); + if (xfs_buf_is_uncached(bp)) + xfs_buf_rele_uncached(bp); + else + xfs_buf_rele_cached(bp); +} /* * Lock a buffer object, if it is not already locked. -- cgit v1.2.3 From 21e308e6485542a9795144aa6a39a1edc83a0d31 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 22 Feb 2024 12:42:44 -0800 Subject: xfs: remove the xfs_buftarg_t typedef Switch the few remaining holdouts to the struct version. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. 
Wong --- fs/xfs/xfs_buf.c | 6 +++--- fs/xfs/xfs_buf.h | 4 ++-- fs/xfs/xfs_log.c | 16 ++++++++-------- fs/xfs/xfs_mount.h | 6 +++--- 4 files changed, 16 insertions(+), 16 deletions(-) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 1e85780411d5..1777d834fd54 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1980,7 +1980,7 @@ xfs_free_buftarg( int xfs_setsize_buftarg( - xfs_buftarg_t *btp, + struct xfs_buftarg *btp, unsigned int sectorsize) { /* Set up metadata sector size info */ @@ -2008,7 +2008,7 @@ xfs_setsize_buftarg( */ STATIC int xfs_setsize_buftarg_early( - xfs_buftarg_t *btp) + struct xfs_buftarg *btp) { return xfs_setsize_buftarg(btp, bdev_logical_block_size(btp->bt_bdev)); } @@ -2018,7 +2018,7 @@ xfs_alloc_buftarg( struct xfs_mount *mp, struct bdev_handle *bdev_handle) { - xfs_buftarg_t *btp; + struct xfs_buftarg *btp; const struct dax_holder_operations *ops = NULL; #if defined(CONFIG_FS_DAX) && defined(CONFIG_MEMORY_FAILURE) diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index b470de08a46c..b9216dee7810 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -96,7 +96,7 @@ typedef unsigned int xfs_buf_flags_t; * The latter is derived from the underlying device, and controls direct IO * alignment constraints. */ -typedef struct xfs_buftarg { +struct xfs_buftarg { dev_t bt_dev; struct bdev_handle *bt_bdev_handle; struct block_device *bt_bdev; @@ -114,7 +114,7 @@ typedef struct xfs_buftarg { struct percpu_counter bt_io_count; struct ratelimit_state bt_ioerror_rl; -} xfs_buftarg_t; +}; #define XB_PAGES 2 diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 1f68569e62ca..5004f23d344e 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -633,14 +633,14 @@ xlog_state_release_iclog( */ int xfs_log_mount( - xfs_mount_t *mp, - xfs_buftarg_t *log_target, - xfs_daddr_t blk_offset, - int num_bblks) -{ - struct xlog *log; - int error = 0; - int min_logfsbs; + xfs_mount_t *mp, + struct xfs_buftarg *log_target, + xfs_daddr_t blk_offset, + int num_bblks) +{ + struct xlog *log; + int error = 0; + int min_logfsbs; if (!xfs_has_norecovery(mp)) { xfs_notice(mp, "Mounting V%d Filesystem %pU", diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index e86dfe67894f..6c44e6db4d86 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -94,9 +94,9 @@ typedef struct xfs_mount { struct xfs_inode *m_rsumip; /* pointer to summary inode */ struct xfs_inode *m_rootip; /* pointer to root directory */ struct xfs_quotainfo *m_quotainfo; /* disk quota information */ - xfs_buftarg_t *m_ddev_targp; /* saves taking the address */ - xfs_buftarg_t *m_logdev_targp;/* ptr to log device */ - xfs_buftarg_t *m_rtdev_targp; /* ptr to rt device */ + struct xfs_buftarg *m_ddev_targp; /* data device */ + struct xfs_buftarg *m_logdev_targp;/* log device */ + struct xfs_buftarg *m_rtdev_targp; /* rt device */ void __percpu *m_inodegc; /* percpu inodegc structures */ /* -- cgit v1.2.3 From 60335cc0fb5c7a8adfc84ba7dc976a00b6133499 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 22 Feb 2024 12:42:45 -0800 Subject: xfs: remove xfs_setsize_buftarg_early Open code the logic in the only caller, and improve the comment explaining what is being done here. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. 
Wong --- fs/xfs/xfs_buf.c | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 1777d834fd54..e83cadea239f 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -2001,18 +2001,6 @@ xfs_setsize_buftarg( return 0; } -/* - * When allocating the initial buffer target we have not yet - * read in the superblock, so don't know what sized sectors - * are being used at this early stage. Play safe. - */ -STATIC int -xfs_setsize_buftarg_early( - struct xfs_buftarg *btp) -{ - return xfs_setsize_buftarg(btp, bdev_logical_block_size(btp->bt_bdev)); -} - struct xfs_buftarg * xfs_alloc_buftarg( struct xfs_mount *mp, @@ -2033,6 +2021,13 @@ xfs_alloc_buftarg( btp->bt_daxdev = fs_dax_get_by_bdev(btp->bt_bdev, &btp->bt_dax_part_off, mp, ops); + /* + * When allocating the buftargs we have not yet read the super block and + * thus don't know the file system sector size yet. + */ + if (xfs_setsize_buftarg(btp, bdev_logical_block_size(btp->bt_bdev))) + goto error_free; + /* * Buffer IO error rate limiting. Limit it to no more than 10 messages * per 30 seconds so as to not spam logs too much on repeated errors. @@ -2040,9 +2035,6 @@ xfs_alloc_buftarg( ratelimit_state_init(&btp->bt_ioerror_rl, 30 * HZ, DEFAULT_RATELIMIT_BURST); - if (xfs_setsize_buftarg_early(btp)) - goto error_free; - if (list_lru_init(&btp->bt_lru)) goto error_free; -- cgit v1.2.3 From 1c51ac0998ed9baaca3ac75c0083b4c3b4d993ef Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 22 Feb 2024 12:42:45 -0800 Subject: xfs: move setting bt_logical_sectorsize out of xfs_setsize_buftarg bt_logical_sectorsize and the associated mask are set based on the constant logical block size in the block_device structure and thus don't need to be updated in xfs_setsize_buftarg. Move the setup into xfs_alloc_buftarg so that it is only done once per buftarg. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_buf.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index e83cadea239f..ff243e3176a5 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1994,10 +1994,6 @@ xfs_setsize_buftarg( return -EINVAL; } - /* Set up device logical sector size mask */ - btp->bt_logical_sectorsize = bdev_logical_block_size(btp->bt_bdev); - btp->bt_logical_sectormask = bdev_logical_block_size(btp->bt_bdev) - 1; - return 0; } @@ -2028,6 +2024,10 @@ xfs_alloc_buftarg( if (xfs_setsize_buftarg(btp, bdev_logical_block_size(btp->bt_bdev))) goto error_free; + /* Set up device logical sector size mask */ + btp->bt_logical_sectorsize = bdev_logical_block_size(btp->bt_bdev); + btp->bt_logical_sectormask = bdev_logical_block_size(btp->bt_bdev) - 1; + /* * Buffer IO error rate limiting. Limit it to no more than 10 messages * per 30 seconds so as to not spam logs too much on repeated errors. -- cgit v1.2.3 From e7b58f7c1be20550d4f51cec6307b811e7555f52 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 22 Feb 2024 12:42:58 -0800 Subject: xfs: teach buftargs to maintain their own buffer hashtable Currently, cached buffers are indexed by per-AG hashtables. This works great for the data device, but won't work for in-memory btrees. To handle that use case, buftargs will need to be able to index buffers independently of other data structures. 
We accomplish this by hoisting the rhashtable and its lock into a separate xfs_buf_cache structure, making the buftarg point to the _buf_cache structure, and reworking various functions to use it. This will enable the in-memory buftarg to come up with its own _buf_cache. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_ag.c | 6 ++-- fs/xfs/libxfs/xfs_ag.h | 4 +-- fs/xfs/xfs_buf.c | 84 +++++++++++++++++++++++++++++++++----------------- fs/xfs/xfs_buf.h | 8 +++++ fs/xfs/xfs_mount.h | 3 -- 5 files changed, 67 insertions(+), 38 deletions(-) diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index e598e14320be..32d80a76440c 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -264,7 +264,7 @@ xfs_free_perag( xfs_defer_drain_free(&pag->pag_intents_drain); cancel_delayed_work_sync(&pag->pag_blockgc_work); - xfs_buf_hash_destroy(pag); + xfs_buf_cache_destroy(&pag->pag_bcache); /* drop the mount's active reference */ xfs_perag_rele(pag); @@ -352,7 +352,7 @@ xfs_free_unused_perag_range( spin_unlock(&mp->m_perag_lock); if (!pag) break; - xfs_buf_hash_destroy(pag); + xfs_buf_cache_destroy(&pag->pag_bcache); xfs_defer_drain_free(&pag->pag_intents_drain); kfree(pag); } @@ -419,7 +419,7 @@ xfs_initialize_perag( pag->pagb_tree = RB_ROOT; #endif /* __KERNEL__ */ - error = xfs_buf_hash_init(pag); + error = xfs_buf_cache_init(&pag->pag_bcache); if (error) goto out_remove_pag; diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h index 19eddba09894..29bfa6273dec 100644 --- a/fs/xfs/libxfs/xfs_ag.h +++ b/fs/xfs/libxfs/xfs_ag.h @@ -106,9 +106,7 @@ struct xfs_perag { int pag_ici_reclaimable; /* reclaimable inodes */ unsigned long pag_ici_reclaim_cursor; /* reclaim restart point */ - /* buffer cache index */ - spinlock_t pag_buf_lock; /* lock for pag_buf_hash */ - struct rhashtable pag_buf_hash; + struct xfs_buf_cache pag_bcache; /* background prealloc block trimming */ struct delayed_work pag_blockgc_work; diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index ff243e3176a5..6b979dfa339a 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -510,18 +510,18 @@ static const struct rhashtable_params xfs_buf_hash_params = { }; int -xfs_buf_hash_init( - struct xfs_perag *pag) +xfs_buf_cache_init( + struct xfs_buf_cache *bch) { - spin_lock_init(&pag->pag_buf_lock); - return rhashtable_init(&pag->pag_buf_hash, &xfs_buf_hash_params); + spin_lock_init(&bch->bc_lock); + return rhashtable_init(&bch->bc_hash, &xfs_buf_hash_params); } void -xfs_buf_hash_destroy( - struct xfs_perag *pag) +xfs_buf_cache_destroy( + struct xfs_buf_cache *bch) { - rhashtable_destroy(&pag->pag_buf_hash); + rhashtable_destroy(&bch->bc_hash); } static int @@ -584,7 +584,7 @@ xfs_buf_find_lock( static inline int xfs_buf_lookup( - struct xfs_perag *pag, + struct xfs_buf_cache *bch, struct xfs_buf_map *map, xfs_buf_flags_t flags, struct xfs_buf **bpp) @@ -593,7 +593,7 @@ xfs_buf_lookup( int error; rcu_read_lock(); - bp = rhashtable_lookup(&pag->pag_buf_hash, map, xfs_buf_hash_params); + bp = rhashtable_lookup(&bch->bc_hash, map, xfs_buf_hash_params); if (!bp || !atomic_inc_not_zero(&bp->b_hold)) { rcu_read_unlock(); return -ENOENT; @@ -618,6 +618,7 @@ xfs_buf_lookup( static int xfs_buf_find_insert( struct xfs_buftarg *btp, + struct xfs_buf_cache *bch, struct xfs_perag *pag, struct xfs_buf_map *cmap, struct xfs_buf_map *map, @@ -646,18 +647,18 @@ xfs_buf_find_insert( goto out_free_buf; } - spin_lock(&pag->pag_buf_lock); - bp = rhashtable_lookup_get_insert_fast(&pag->pag_buf_hash, + 
spin_lock(&bch->bc_lock); + bp = rhashtable_lookup_get_insert_fast(&bch->bc_hash, &new_bp->b_rhash_head, xfs_buf_hash_params); if (IS_ERR(bp)) { error = PTR_ERR(bp); - spin_unlock(&pag->pag_buf_lock); + spin_unlock(&bch->bc_lock); goto out_free_buf; } if (bp) { /* found an existing buffer */ atomic_inc(&bp->b_hold); - spin_unlock(&pag->pag_buf_lock); + spin_unlock(&bch->bc_lock); error = xfs_buf_find_lock(bp, flags); if (error) xfs_buf_rele(bp); @@ -668,17 +669,36 @@ xfs_buf_find_insert( /* The new buffer keeps the perag reference until it is freed. */ new_bp->b_pag = pag; - spin_unlock(&pag->pag_buf_lock); + spin_unlock(&bch->bc_lock); *bpp = new_bp; return 0; out_free_buf: xfs_buf_free(new_bp); out_drop_pag: - xfs_perag_put(pag); + if (pag) + xfs_perag_put(pag); return error; } +static inline struct xfs_perag * +xfs_buftarg_get_pag( + struct xfs_buftarg *btp, + const struct xfs_buf_map *map) +{ + struct xfs_mount *mp = btp->bt_mount; + + return xfs_perag_get(mp, xfs_daddr_to_agno(mp, map->bm_bn)); +} + +static inline struct xfs_buf_cache * +xfs_buftarg_buf_cache( + struct xfs_buftarg *btp, + struct xfs_perag *pag) +{ + return &pag->pag_bcache; +} + /* * Assembles a buffer covering the specified range. The code is optimised for * cache hits, as metadata intensive workloads will see 3 orders of magnitude @@ -692,6 +712,7 @@ xfs_buf_get_map( xfs_buf_flags_t flags, struct xfs_buf **bpp) { + struct xfs_buf_cache *bch; struct xfs_perag *pag; struct xfs_buf *bp = NULL; struct xfs_buf_map cmap = { .bm_bn = map[0].bm_bn }; @@ -707,10 +728,10 @@ xfs_buf_get_map( if (error) return error; - pag = xfs_perag_get(btp->bt_mount, - xfs_daddr_to_agno(btp->bt_mount, cmap.bm_bn)); + pag = xfs_buftarg_get_pag(btp, &cmap); + bch = xfs_buftarg_buf_cache(btp, pag); - error = xfs_buf_lookup(pag, &cmap, flags, &bp); + error = xfs_buf_lookup(bch, &cmap, flags, &bp); if (error && error != -ENOENT) goto out_put_perag; @@ -722,13 +743,14 @@ xfs_buf_get_map( goto out_put_perag; /* xfs_buf_find_insert() consumes the perag reference. */ - error = xfs_buf_find_insert(btp, pag, &cmap, map, nmaps, + error = xfs_buf_find_insert(btp, bch, pag, &cmap, map, nmaps, flags, &bp); if (error) return error; } else { XFS_STATS_INC(btp->bt_mount, xb_get_locked); - xfs_perag_put(pag); + if (pag) + xfs_perag_put(pag); } /* We do not hold a perag reference anymore. */ @@ -756,7 +778,8 @@ xfs_buf_get_map( return 0; out_put_perag: - xfs_perag_put(pag); + if (pag) + xfs_perag_put(pag); return error; } @@ -1016,7 +1039,9 @@ static void xfs_buf_rele_cached( struct xfs_buf *bp) { + struct xfs_buftarg *btp = bp->b_target; struct xfs_perag *pag = bp->b_pag; + struct xfs_buf_cache *bch = xfs_buftarg_buf_cache(btp, pag); bool release; bool freebuf = false; @@ -1035,7 +1060,7 @@ xfs_buf_rele_cached( * leading to a use-after-free scenario. 
 */ spin_lock(&bp->b_lock); - release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock); + release = atomic_dec_and_lock(&bp->b_hold, &bch->bc_lock); if (!release) { /* * Drop the in-flight state if the buffer is already on the LRU @@ -1056,11 +1081,11 @@ * buffer for the LRU and clear the (now stale) dispose list * state flag */ - if (list_lru_add_obj(&bp->b_target->bt_lru, &bp->b_lru)) { + if (list_lru_add_obj(&btp->bt_lru, &bp->b_lru)) { bp->b_state &= ~XFS_BSTATE_DISPOSE; atomic_inc(&bp->b_hold); } - spin_unlock(&pag->pag_buf_lock); + spin_unlock(&bch->bc_lock); } else { /* * most of the time buffers will already be removed from the @@ -1069,16 +1094,17 @@ * was on was the disposal list */ if (!(bp->b_state & XFS_BSTATE_DISPOSE)) { - list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru); + list_lru_del_obj(&btp->bt_lru, &bp->b_lru); } else { ASSERT(list_empty(&bp->b_lru)); } ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); - rhashtable_remove_fast(&pag->pag_buf_hash, &bp->b_rhash_head, xfs_buf_hash_params); - spin_unlock(&pag->pag_buf_lock); - xfs_perag_put(pag); + rhashtable_remove_fast(&bch->bc_hash, &bp->b_rhash_head, + xfs_buf_hash_params); + spin_unlock(&bch->bc_lock); + if (pag) + xfs_perag_put(pag); freebuf = true; } diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index b9216dee7810..7b01df6dcd50 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -83,6 +83,14 @@ typedef unsigned int xfs_buf_flags_t; #define XFS_BSTATE_DISPOSE (1 << 0) /* buffer being discarded */ #define XFS_BSTATE_IN_FLIGHT (1 << 1) /* I/O in flight */ +struct xfs_buf_cache { + spinlock_t bc_lock; + struct rhashtable bc_hash; +}; + +int xfs_buf_cache_init(struct xfs_buf_cache *bch); +void xfs_buf_cache_destroy(struct xfs_buf_cache *bch); + /* * The xfs_buftarg contains 2 notions of "sector size" - * diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 6c44e6db4d86..e880aa48de68 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -505,9 +505,6 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d) return (xfs_agblock_t) do_div(ld, mp->m_sb.sb_agblocks); } -int xfs_buf_hash_init(struct xfs_perag *pag); -void xfs_buf_hash_destroy(struct xfs_perag *pag); - extern void xfs_uuid_table_free(void); extern uint64_t xfs_default_resblks(xfs_mount_t *mp); extern int xfs_mountfs(xfs_mount_t *mp); -- cgit v1.2.3 From 5076a6040ca1613e616d84aecfaac5f932db84e0 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 22 Feb 2024 12:43:21 -0800 Subject: xfs: support in-memory buffer cache targets Allow the buffer cache to target in-memory files by making it possible to have a buftarg that maps pages from private shmem files. As the previous patch alludes, the in-memory buftarg contains its own cache, points to a shmem file, and does not point to a block_device. The next few patches will make it possible to construct an xfs_btree in pageable memory by using this buftarg. Signed-off-by: Darrick J. 
Wong Reviewed-by: Christoph Hellwig --- fs/xfs/Kconfig | 4 ++ fs/xfs/Makefile | 1 + fs/xfs/xfs_buf.c | 132 ++++++++++++++++++++++------------- fs/xfs/xfs_buf.h | 9 +++ fs/xfs/xfs_buf_mem.c | 189 +++++++++++++++++++++++++++++++++++++++++++++++++++ fs/xfs/xfs_buf_mem.h | 30 ++++++++ fs/xfs/xfs_trace.c | 1 + fs/xfs/xfs_trace.h | 49 +++++++++++++ 8 files changed, 369 insertions(+), 46 deletions(-) create mode 100644 fs/xfs/xfs_buf_mem.c create mode 100644 fs/xfs/xfs_buf_mem.h diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig index fa7eb3e2a248..7017ea0fb4cd 100644 --- a/fs/xfs/Kconfig +++ b/fs/xfs/Kconfig @@ -128,6 +128,9 @@ config XFS_LIVE_HOOKS bool select JUMP_LABEL if HAVE_ARCH_JUMP_LABEL +config XFS_MEMORY_BUFS + bool + config XFS_ONLINE_SCRUB bool "XFS online metadata check support" default n @@ -135,6 +138,7 @@ config XFS_ONLINE_SCRUB depends on TMPFS && SHMEM select XFS_LIVE_HOOKS select XFS_DRAIN_INTENTS + select XFS_MEMORY_BUFS help If you say Y here you will be able to check metadata on a mounted XFS filesystem. This feature is intended to reduce diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index ba8608f469ac..045874235b82 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -137,6 +137,7 @@ endif xfs-$(CONFIG_XFS_DRAIN_INTENTS) += xfs_drain.o xfs-$(CONFIG_XFS_LIVE_HOOKS) += xfs_hooks.o +xfs-$(CONFIG_XFS_MEMORY_BUFS) += xfs_buf_mem.o # online scrub/repair ifeq ($(CONFIG_XFS_ONLINE_SCRUB),y) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 6b979dfa339a..7fc26e64368d 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -21,6 +21,7 @@ #include "xfs_errortag.h" #include "xfs_error.h" #include "xfs_ag.h" +#include "xfs_buf_mem.h" struct kmem_cache *xfs_buf_cache; @@ -318,7 +319,9 @@ xfs_buf_free( ASSERT(list_empty(&bp->b_lru)); - if (bp->b_flags & _XBF_PAGES) + if (xfs_buftarg_is_mem(bp->b_target)) + xmbuf_unmap_page(bp); + else if (bp->b_flags & _XBF_PAGES) xfs_buf_free_pages(bp); else if (bp->b_flags & _XBF_KMEM) kfree(bp->b_addr); @@ -634,18 +637,20 @@ xfs_buf_find_insert( if (error) goto out_drop_pag; - /* - * For buffers that fit entirely within a single page, first attempt to - * allocate the memory from the heap to minimise memory usage. If we - * can't get heap memory for these small buffers, we fall back to using - * the page allocator. - */ - if (BBTOB(new_bp->b_length) >= PAGE_SIZE || - xfs_buf_alloc_kmem(new_bp, flags) < 0) { + if (xfs_buftarg_is_mem(new_bp->b_target)) { + error = xmbuf_map_page(new_bp); + } else if (BBTOB(new_bp->b_length) >= PAGE_SIZE || + xfs_buf_alloc_kmem(new_bp, flags) < 0) { + /* + * For buffers that fit entirely within a single page, first + * attempt to allocate the memory from the heap to minimise + * memory usage. If we can't get heap memory for these small + * buffers, we fall back to using the page allocator. 
+ */ error = xfs_buf_alloc_pages(new_bp, flags); - if (error) - goto out_free_buf; } + if (error) + goto out_free_buf; spin_lock(&bch->bc_lock); bp = rhashtable_lookup_get_insert_fast(&bch->bc_hash, @@ -688,6 +693,8 @@ xfs_buftarg_get_pag( { struct xfs_mount *mp = btp->bt_mount; + if (xfs_buftarg_is_mem(btp)) + return NULL; return xfs_perag_get(mp, xfs_daddr_to_agno(mp, map->bm_bn)); } @@ -696,7 +703,9 @@ xfs_buftarg_buf_cache( struct xfs_buftarg *btp, struct xfs_perag *pag) { - return &pag->pag_bcache; + if (pag) + return &pag->pag_bcache; + return btp->bt_cache; } /* @@ -926,6 +935,13 @@ xfs_buf_readahead_map( { struct xfs_buf *bp; + /* + * Currently we don't have a good means or justification for performing + * xmbuf_map_page asynchronously, so we don't do readahead. + */ + if (xfs_buftarg_is_mem(target)) + return; + xfs_buf_read_map(target, map, nmaps, XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD, &bp, ops, __this_address); @@ -991,7 +1007,10 @@ xfs_buf_get_uncached( if (error) return error; - error = xfs_buf_alloc_pages(bp, flags); + if (xfs_buftarg_is_mem(bp->b_target)) + error = xmbuf_map_page(bp); + else + error = xfs_buf_alloc_pages(bp, flags); if (error) goto fail_free_buf; @@ -1633,6 +1652,12 @@ _xfs_buf_ioapply( /* we only use the buffer cache for meta-data */ op |= REQ_META; + /* in-memory targets are directly mapped, no IO required. */ + if (xfs_buftarg_is_mem(bp->b_target)) { + xfs_buf_ioend(bp); + return; + } + /* * Walk all the vectors issuing IO on them. Set up the initial offset * into the buffer and the desired IO size before we start - @@ -1988,19 +2013,24 @@ xfs_buftarg_shrink_count( } void -xfs_free_buftarg( +xfs_destroy_buftarg( struct xfs_buftarg *btp) { shrinker_free(btp->bt_shrinker); ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0); percpu_counter_destroy(&btp->bt_io_count); list_lru_destroy(&btp->bt_lru); +} +void +xfs_free_buftarg( + struct xfs_buftarg *btp) +{ + xfs_destroy_buftarg(btp); fs_put_dax(btp->bt_daxdev, btp->bt_mount); /* the main block device is closed by kill_block_super */ if (btp->bt_bdev != btp->bt_mount->m_super->s_bdev) bdev_release(btp->bt_bdev_handle); - kfree(btp); } @@ -2023,6 +2053,45 @@ xfs_setsize_buftarg( return 0; } +int +xfs_init_buftarg( + struct xfs_buftarg *btp, + size_t logical_sectorsize, + const char *descr) +{ + /* Set up device logical sector size mask */ + btp->bt_logical_sectorsize = logical_sectorsize; + btp->bt_logical_sectormask = logical_sectorsize - 1; + + /* + * Buffer IO error rate limiting. Limit it to no more than 10 messages + * per 30 seconds so as to not spam logs too much on repeated errors. 
+ */ + ratelimit_state_init(&btp->bt_ioerror_rl, 30 * HZ, + DEFAULT_RATELIMIT_BURST); + + if (list_lru_init(&btp->bt_lru)) + return -ENOMEM; + if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL)) + goto out_destroy_lru; + + btp->bt_shrinker = + shrinker_alloc(SHRINKER_NUMA_AWARE, "xfs-buf:%s", descr); + if (!btp->bt_shrinker) + goto out_destroy_io_count; + btp->bt_shrinker->count_objects = xfs_buftarg_shrink_count; + btp->bt_shrinker->scan_objects = xfs_buftarg_shrink_scan; + btp->bt_shrinker->private_data = btp; + shrinker_register(btp->bt_shrinker); + return 0; + +out_destroy_io_count: + percpu_counter_destroy(&btp->bt_io_count); +out_destroy_lru: + list_lru_destroy(&btp->bt_lru); + return -ENOMEM; +} + struct xfs_buftarg * xfs_alloc_buftarg( struct xfs_mount *mp, @@ -2049,41 +2118,12 @@ xfs_alloc_buftarg( */ if (xfs_setsize_buftarg(btp, bdev_logical_block_size(btp->bt_bdev))) goto error_free; - - /* Set up device logical sector size mask */ - btp->bt_logical_sectorsize = bdev_logical_block_size(btp->bt_bdev); - btp->bt_logical_sectormask = bdev_logical_block_size(btp->bt_bdev) - 1; - - /* - * Buffer IO error rate limiting. Limit it to no more than 10 messages - * per 30 seconds so as to not spam logs too much on repeated errors. - */ - ratelimit_state_init(&btp->bt_ioerror_rl, 30 * HZ, - DEFAULT_RATELIMIT_BURST); - - if (list_lru_init(&btp->bt_lru)) + if (xfs_init_buftarg(btp, bdev_logical_block_size(btp->bt_bdev), + mp->m_super->s_id)) goto error_free; - if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL)) - goto error_lru; - - btp->bt_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE, "xfs-buf:%s", - mp->m_super->s_id); - if (!btp->bt_shrinker) - goto error_pcpu; - - btp->bt_shrinker->count_objects = xfs_buftarg_shrink_count; - btp->bt_shrinker->scan_objects = xfs_buftarg_shrink_scan; - btp->bt_shrinker->private_data = btp; - - shrinker_register(btp->bt_shrinker); - return btp; -error_pcpu: - percpu_counter_destroy(&btp->bt_io_count); -error_lru: - list_lru_destroy(&btp->bt_lru); error_free: kfree(btp); return NULL; diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 7b01df6dcd50..73249abca968 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -109,6 +109,7 @@ struct xfs_buftarg { struct bdev_handle *bt_bdev_handle; struct block_device *bt_bdev; struct dax_device *bt_daxdev; + struct file *bt_file; u64 bt_dax_part_off; struct xfs_mount *bt_mount; unsigned int bt_meta_sectorsize; @@ -122,6 +123,9 @@ struct xfs_buftarg { struct percpu_counter bt_io_count; struct ratelimit_state bt_ioerror_rl; + + /* built-in cache, if we're not using the perag one */ + struct xfs_buf_cache bt_cache[]; }; #define XB_PAGES 2 @@ -387,4 +391,9 @@ int xfs_buf_reverify(struct xfs_buf *bp, const struct xfs_buf_ops *ops); bool xfs_verify_magic(struct xfs_buf *bp, __be32 dmagic); bool xfs_verify_magic16(struct xfs_buf *bp, __be16 dmagic); +/* for xfs_buf_mem.c only: */ +int xfs_init_buftarg(struct xfs_buftarg *btp, size_t logical_sectorsize, + const char *descr); +void xfs_destroy_buftarg(struct xfs_buftarg *btp); + #endif /* __XFS_BUF_H__ */ diff --git a/fs/xfs/xfs_buf_mem.c b/fs/xfs/xfs_buf_mem.c new file mode 100644 index 000000000000..be71ba1a3d7b --- /dev/null +++ b/fs/xfs/xfs_buf_mem.c @@ -0,0 +1,189 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2023-2024 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_buf.h" +#include "xfs_buf_mem.h" +#include "xfs_trace.h" +#include <linux/shmem_fs.h> + +/* + * Buffer Cache for In-Memory Files + * ================================ + * + * Online fsck wants to create ephemeral ordered recordsets. The existing + * btree infrastructure can do this, but we need the buffer cache to target + * memory instead of block devices. + * + * When CONFIG_TMPFS=y, shmemfs is enough of a filesystem to meet those + * requirements. Therefore, the xmbuf mechanism uses an unlinked shmem file to + * store our staging data. This file is not installed in the file descriptor + * table so that user programs cannot access the data, which means that the + * xmbuf must be freed with xmbuf_free. + * + * xmbufs assume that the caller will handle all required concurrency + * management; standard vfs locks (freezer and inode) are not taken. Reads + * and writes are satisfied directly from the page cache. + * + * The only supported block size is PAGE_SIZE, and we cannot use highmem. + */ + +/* + * shmem files used to back an in-memory buffer cache must not be exposed to + * userspace. Upper layers must coordinate access to the one handle returned + * by the constructor, so establish a separate lock class for xmbufs to avoid + * confusing lockdep. + */ +static struct lock_class_key xmbuf_i_mutex_key; + +/* + * Allocate a buffer cache target for a memory-backed file and set up the + * buffer target. + */ +int +xmbuf_alloc( + struct xfs_mount *mp, + const char *descr, + struct xfs_buftarg **btpp) +{ + struct file *file; + struct inode *inode; + struct xfs_buftarg *btp; + int error; + + btp = kzalloc(struct_size(btp, bt_cache, 1), GFP_KERNEL); + if (!btp) + return -ENOMEM; + + file = shmem_kernel_file_setup(descr, 0, 0); + if (IS_ERR(file)) { + error = PTR_ERR(file); + goto out_free_btp; + } + inode = file_inode(file); + + /* private file, private locking */ + lockdep_set_class(&inode->i_rwsem, &xmbuf_i_mutex_key); + + /* + * We don't want to bother with kmapping data during repair, so don't + * allow highmem pages to back this mapping. + */ + mapping_set_gfp_mask(inode->i_mapping, GFP_KERNEL); + + /* ensure all writes are below EOF to avoid pagecache zeroing */ + i_size_write(inode, inode->i_sb->s_maxbytes); + + trace_xmbuf_create(btp); + + error = xfs_buf_cache_init(btp->bt_cache); + if (error) + goto out_file; + + /* Initialize buffer target */ + btp->bt_mount = mp; + btp->bt_dev = (dev_t)-1U; + btp->bt_bdev = NULL; /* in-memory buftargs have no bdev */ + btp->bt_file = file; + btp->bt_meta_sectorsize = XMBUF_BLOCKSIZE; + btp->bt_meta_sectormask = XMBUF_BLOCKSIZE - 1; + + error = xfs_init_buftarg(btp, XMBUF_BLOCKSIZE, descr); + if (error) + goto out_bcache; + + *btpp = btp; + return 0; + +out_bcache: + xfs_buf_cache_destroy(btp->bt_cache); +out_file: + fput(file); +out_free_btp: + kfree(btp); + return error; +} + +/* Free a buffer cache target for a memory-backed buffer cache. */ +void +xmbuf_free( + struct xfs_buftarg *btp) +{ + ASSERT(xfs_buftarg_is_mem(btp)); + ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0); + + trace_xmbuf_free(btp); + + xfs_destroy_buftarg(btp); + xfs_buf_cache_destroy(btp->bt_cache); + fput(btp->bt_file); + kfree(btp); +} + +/* Directly map a shmem page into the buffer cache. 
*/ +int +xmbuf_map_page( + struct xfs_buf *bp) +{ + struct inode *inode = file_inode(bp->b_target->bt_file); + struct folio *folio = NULL; + struct page *page; + loff_t pos = BBTOB(xfs_buf_daddr(bp)); + int error; + + ASSERT(xfs_buftarg_is_mem(bp->b_target)); + + if (bp->b_map_count != 1) + return -ENOMEM; + if (BBTOB(bp->b_length) != XMBUF_BLOCKSIZE) + return -ENOMEM; + if (offset_in_page(pos) != 0) { + ASSERT(offset_in_page(pos)); + return -ENOMEM; + } + + error = shmem_get_folio(inode, pos >> PAGE_SHIFT, &folio, SGP_CACHE); + if (error) + return error; + + if (filemap_check_wb_err(inode->i_mapping, 0)) { + folio_unlock(folio); + folio_put(folio); + return -EIO; + } + + page = folio_file_page(folio, pos >> PAGE_SHIFT); + + /* + * Mark the page dirty so that it won't be reclaimed once we drop the + * (potentially last) reference in xmbuf_unmap_page. + */ + set_page_dirty(page); + unlock_page(page); + + bp->b_addr = page_address(page); + bp->b_pages = bp->b_page_array; + bp->b_pages[0] = page; + bp->b_page_count = 1; + return 0; +} + +/* Unmap a shmem page that was mapped into the buffer cache. */ +void +xmbuf_unmap_page( + struct xfs_buf *bp) +{ + struct page *page = bp->b_pages[0]; + + ASSERT(xfs_buftarg_is_mem(bp->b_target)); + + put_page(page); + + bp->b_addr = NULL; + bp->b_pages[0] = NULL; + bp->b_pages = NULL; + bp->b_page_count = 0; +} diff --git a/fs/xfs/xfs_buf_mem.h b/fs/xfs/xfs_buf_mem.h new file mode 100644 index 000000000000..945f4b610998 --- /dev/null +++ b/fs/xfs/xfs_buf_mem.h @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2023-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong + */ +#ifndef __XFS_BUF_MEM_H__ +#define __XFS_BUF_MEM_H__ + +#define XMBUF_BLOCKSIZE (PAGE_SIZE) +#define XMBUF_BLOCKSHIFT (PAGE_SHIFT) + +#ifdef CONFIG_XFS_MEMORY_BUFS +static inline bool xfs_buftarg_is_mem(const struct xfs_buftarg *btp) +{ + return btp->bt_bdev == NULL; +} + +int xmbuf_alloc(struct xfs_mount *mp, const char *descr, + struct xfs_buftarg **btpp); +void xmbuf_free(struct xfs_buftarg *btp); + +int xmbuf_map_page(struct xfs_buf *bp); +void xmbuf_unmap_page(struct xfs_buf *bp); +#else +# define xfs_buftarg_is_mem(...) (false) +# define xmbuf_map_page(...) (-ENOMEM) +# define xmbuf_unmap_page(...) 
((void)0) +#endif /* CONFIG_XFS_MEMORY_BUFS */ + +#endif /* __XFS_BUF_MEM_H__ */ diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c index 8a5dc1538aa8..ae5be6b589f0 100644 --- a/fs/xfs/xfs_trace.c +++ b/fs/xfs/xfs_trace.c @@ -36,6 +36,7 @@ #include "xfs_error.h" #include #include "xfs_iomap.h" +#include "xfs_buf_mem.h" /* * We include this last to have the helpers above available for the trace diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index e876a47f1427..14cb8752e3d3 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -4514,6 +4514,55 @@ DEFINE_PERAG_INTENTS_EVENT(xfs_perag_wait_intents); #endif /* CONFIG_XFS_DRAIN_INTENTS */ +#ifdef CONFIG_XFS_MEMORY_BUFS +TRACE_EVENT(xmbuf_create, + TP_PROTO(struct xfs_buftarg *btp), + TP_ARGS(btp), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned long, ino) + __array(char, pathname, 256) + ), + TP_fast_assign( + char pathname[257]; + char *path; + struct file *file = btp->bt_file; + + __entry->ino = file_inode(file)->i_ino; + memset(pathname, 0, sizeof(pathname)); + path = file_path(file, pathname, sizeof(pathname) - 1); + if (IS_ERR(path)) + path = "(unknown)"; + strncpy(__entry->pathname, path, sizeof(__entry->pathname)); + ), + TP_printk("xmino 0x%lx path '%s'", + __entry->ino, + __entry->pathname) +); + +TRACE_EVENT(xmbuf_free, + TP_PROTO(struct xfs_buftarg *btp), + TP_ARGS(btp), + TP_STRUCT__entry( + __field(unsigned long, ino) + __field(unsigned long long, bytes) + __field(loff_t, size) + ), + TP_fast_assign( + struct file *file = btp->bt_file; + struct inode *inode = file_inode(file); + + __entry->size = i_size_read(inode); + __entry->bytes = (inode->i_blocks << SECTOR_SHIFT) + inode->i_bytes; + __entry->ino = inode->i_ino; + ), + TP_printk("xmino 0x%lx mem_bytes 0x%llx isize 0x%llx", + __entry->ino, + __entry->bytes, + __entry->size) +); +#endif /* CONFIG_XFS_MEMORY_BUFS */ + #endif /* _TRACE_XFS_H */ #undef TRACE_INCLUDE_PATH -- cgit v1.2.3 From 8c1771c45dfa9dddd4569727c48204b66073d2c2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 22 Feb 2024 12:43:34 -0800 Subject: xfs: add a xfs_btree_ptrs_equal helper This only has a single caller and thus might be a bit questionable, but I think it really improves the readability of xfs_btree_visit_block. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_btree.c | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index ecdd5ea27a03..aa60a46b8eec 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -1043,6 +1043,17 @@ xfs_btree_set_ptr_null( ptr->s = cpu_to_be32(NULLAGBLOCK); } +static inline bool +xfs_btree_ptrs_equal( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr1, + union xfs_btree_ptr *ptr2) +{ + if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) + return ptr1->l == ptr2->l; + return ptr1->s == ptr2->s; +} + /* * Get/set/init sibling pointers */ @@ -4365,7 +4376,7 @@ xfs_btree_visit_block( { struct xfs_btree_block *block; struct xfs_buf *bp; - union xfs_btree_ptr rptr; + union xfs_btree_ptr rptr, bufptr; int error; /* do right sibling readahead */ @@ -4388,19 +4399,12 @@ xfs_btree_visit_block( * return the same block without checking if the right sibling points * back to us and creates a cyclic reference in the btree. 
*/ - if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) { - if (be64_to_cpu(rptr.l) == XFS_DADDR_TO_FSB(cur->bc_mp, - xfs_buf_daddr(bp))) { - xfs_btree_mark_sick(cur); - return -EFSCORRUPTED; - } - } else { - if (be32_to_cpu(rptr.s) == xfs_daddr_to_agbno(cur->bc_mp, - xfs_buf_daddr(bp))) { - xfs_btree_mark_sick(cur); - return -EFSCORRUPTED; - } + xfs_btree_buf_to_ptr(cur, bp, &bufptr); + if (xfs_btree_ptrs_equal(cur, &rptr, &bufptr)) { + xfs_btree_mark_sick(cur); + return -EFSCORRUPTED; } + return xfs_btree_lookup_get_block(cur, level, &rptr, &block); } -- cgit v1.2.3 From a095686a2383526d7315197e2419d84ee8470217 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 22 Feb 2024 12:43:35 -0800 Subject: xfs: support in-memory btrees Adapt the generic btree cursor code to be able to create a btree whose buffers come from a (presumably in-memory) buftarg with a header block that's specific to in-memory btrees. We'll connect this to other parts of online scrub in the next patches. Note that in-memory btrees always have a block size matching the system memory page size for efficiency reasons. There are also a few things we need to do to finalize a btree update; that's covered in the next patch. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- .../filesystems/xfs/xfs-online-fsck-design.rst | 5 +- fs/xfs/Kconfig | 4 + fs/xfs/Makefile | 1 + fs/xfs/libxfs/xfs_btree.c | 256 ++++++++++++++++++--- fs/xfs/libxfs/xfs_btree.h | 7 + fs/xfs/libxfs/xfs_btree_mem.c | 228 ++++++++++++++++++ fs/xfs/libxfs/xfs_btree_mem.h | 72 ++++++ fs/xfs/scrub/scrub.c | 5 + fs/xfs/scrub/scrub.h | 3 + fs/xfs/xfs_buf_mem.c | 13 ++ fs/xfs/xfs_buf_mem.h | 2 + fs/xfs/xfs_health.c | 3 + fs/xfs/xfs_trace.c | 1 + fs/xfs/xfs_trace.h | 117 +++++++++- 14 files changed, 675 insertions(+), 42 deletions(-) create mode 100644 fs/xfs/libxfs/xfs_btree_mem.c create mode 100644 fs/xfs/libxfs/xfs_btree_mem.h diff --git a/Documentation/filesystems/xfs/xfs-online-fsck-design.rst b/Documentation/filesystems/xfs/xfs-online-fsck-design.rst index d03480266e9b..6333697ba3e8 100644 --- a/Documentation/filesystems/xfs/xfs-online-fsck-design.rst +++ b/Documentation/filesystems/xfs/xfs-online-fsck-design.rst @@ -2270,13 +2270,12 @@ follows: pointing to the xfile. 3. Pass the buffer cache target, buffer ops, and other information to - ``xfbtree_create`` to write an initial tree header and root block to the - xfile. + ``xfbtree_init`` to initialize the passed in ``struct xfbtree`` and write an + initial root block to the xfile. Each btree type should define a wrapper that passes necessary arguments to the creation function. For example, rmap btrees define ``xfs_rmapbt_mem_create`` to take care of all the necessary details for callers. - A ``struct xfbtree`` object will be returned. 4. Pass the xfbtree object to the btree cursor creation function for the btree type. diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig index 7017ea0fb4cd..d41edd30388b 100644 --- a/fs/xfs/Kconfig +++ b/fs/xfs/Kconfig @@ -131,6 +131,9 @@ config XFS_LIVE_HOOKS config XFS_MEMORY_BUFS bool +config XFS_BTREE_IN_MEM + bool + config XFS_ONLINE_SCRUB bool "XFS online metadata check support" default n @@ -173,6 +176,7 @@ config XFS_ONLINE_REPAIR bool "XFS online metadata repair support" default n depends on XFS_FS && XFS_ONLINE_SCRUB + select XFS_BTREE_IN_MEM help If you say Y here you will be able to repair metadata on a mounted XFS filesystem. 
This feature is intended to reduce diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 045874235b82..9a6574da8de5 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -138,6 +138,7 @@ endif xfs-$(CONFIG_XFS_DRAIN_INTENTS) += xfs_drain.o xfs-$(CONFIG_XFS_LIVE_HOOKS) += xfs_hooks.o xfs-$(CONFIG_XFS_MEMORY_BUFS) += xfs_buf_mem.o +xfs-$(CONFIG_XFS_BTREE_IN_MEM) += libxfs/xfs_btree_mem.o # online scrub/repair ifeq ($(CONFIG_XFS_ONLINE_SCRUB),y) diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index aa60a46b8eec..d29547572a68 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -28,6 +28,8 @@ #include "xfs_rmap_btree.h" #include "xfs_refcount_btree.h" #include "xfs_health.h" +#include "xfs_buf_mem.h" +#include "xfs_btree_mem.h" /* * Btree magic numbers. @@ -75,6 +77,25 @@ xfs_btree_check_fsblock_siblings( return NULL; } +static inline xfs_failaddr_t +xfs_btree_check_memblock_siblings( + struct xfs_buftarg *btp, + xfbno_t bno, + __be64 dsibling) +{ + xfbno_t sibling; + + if (dsibling == cpu_to_be64(NULLFSBLOCK)) + return NULL; + + sibling = be64_to_cpu(dsibling); + if (sibling == bno) + return __this_address; + if (!xmbuf_verify_daddr(btp, xfbno_to_daddr(sibling))) + return __this_address; + return NULL; +} + static inline xfs_failaddr_t xfs_btree_check_agblock_siblings( struct xfs_perag *pag, @@ -164,6 +185,34 @@ __xfs_btree_check_fsblock( return fa; } +/* + * Check an in-memory btree block header. Return the address of the failing + * check, or NULL if everything is ok. + */ +static xfs_failaddr_t +__xfs_btree_check_memblock( + struct xfs_btree_cur *cur, + struct xfs_btree_block *block, + int level, + struct xfs_buf *bp) +{ + struct xfs_buftarg *btp = cur->bc_mem.xfbtree->target; + xfs_failaddr_t fa; + xfbno_t bno; + + fa = __xfs_btree_check_lblock_hdr(cur, block, level, bp); + if (fa) + return fa; + + bno = xfs_daddr_to_xfbno(xfs_buf_daddr(bp)); + fa = xfs_btree_check_memblock_siblings(btp, bno, + block->bb_u.l.bb_leftsib); + if (!fa) + fa = xfs_btree_check_memblock_siblings(btp, bno, + block->bb_u.l.bb_rightsib); + return fa; +} + /* * Check a short btree block header. Return the address of the failing check, * or NULL if everything is ok. 
@@ -216,9 +265,17 @@ __xfs_btree_check_block( int level, struct xfs_buf *bp) { - if (cur->bc_ops->type == XFS_BTREE_TYPE_AG) + switch (cur->bc_ops->type) { + case XFS_BTREE_TYPE_MEM: + return __xfs_btree_check_memblock(cur, block, level, bp); + case XFS_BTREE_TYPE_AG: return __xfs_btree_check_agblock(cur, block, level, bp); - return __xfs_btree_check_fsblock(cur, block, level, bp); + case XFS_BTREE_TYPE_INODE: + return __xfs_btree_check_fsblock(cur, block, level, bp); + default: + ASSERT(0); + return __this_address; + } } static inline unsigned int xfs_btree_block_errtag(struct xfs_btree_cur *cur) @@ -262,14 +319,22 @@ __xfs_btree_check_ptr( if (level <= 0) return -EFSCORRUPTED; - if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) { + switch (cur->bc_ops->type) { + case XFS_BTREE_TYPE_MEM: + if (!xfbtree_verify_bno(cur->bc_mem.xfbtree, + be64_to_cpu((&ptr->l)[index]))) + return -EFSCORRUPTED; + break; + case XFS_BTREE_TYPE_INODE: if (!xfs_verify_fsbno(cur->bc_mp, be64_to_cpu((&ptr->l)[index]))) return -EFSCORRUPTED; - } else { + break; + case XFS_BTREE_TYPE_AG: if (!xfs_verify_agbno(cur->bc_ag.pag, be32_to_cpu((&ptr->s)[index]))) return -EFSCORRUPTED; + break; } return 0; @@ -290,17 +355,26 @@ xfs_btree_check_ptr( error = __xfs_btree_check_ptr(cur, ptr, index, level); if (error) { - if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) { + switch (cur->bc_ops->type) { + case XFS_BTREE_TYPE_MEM: + xfs_err(cur->bc_mp, +"In-memory: Corrupt %sbt flags 0x%x pointer at level %d index %d fa %pS.", + cur->bc_ops->name, cur->bc_flags, level, index, + __this_address); + break; + case XFS_BTREE_TYPE_INODE: xfs_err(cur->bc_mp, "Inode %llu fork %d: Corrupt %sbt pointer at level %d index %d.", cur->bc_ino.ip->i_ino, cur->bc_ino.whichfork, cur->bc_ops->name, level, index); - } else { + break; + case XFS_BTREE_TYPE_AG: xfs_err(cur->bc_mp, "AG %u: Corrupt %sbt pointer at level %d index %d.", cur->bc_ag.pag->pag_agno, cur->bc_ops->name, level, index); + break; } xfs_btree_mark_sick(cur); } @@ -457,11 +531,35 @@ xfs_btree_del_cursor( case XFS_BTREE_TYPE_INODE: /* nothing to do */ break; + case XFS_BTREE_TYPE_MEM: + if (cur->bc_mem.pag) + xfs_perag_put(cur->bc_mem.pag); + break; } kmem_cache_free(cur->bc_cache, cur); } +/* Return the buffer target for this btree's buffer. */ +static inline struct xfs_buftarg * +xfs_btree_buftarg( + struct xfs_btree_cur *cur) +{ + if (cur->bc_ops->type == XFS_BTREE_TYPE_MEM) + return cur->bc_mem.xfbtree->target; + return cur->bc_mp->m_ddev_targp; +} + +/* Return the block size (in units of 512b sectors) for this btree. */ +static inline unsigned int +xfs_btree_bbsize( + struct xfs_btree_cur *cur) +{ + if (cur->bc_ops->type == XFS_BTREE_TYPE_MEM) + return XFBNO_BBSIZE; + return cur->bc_mp->m_bsize; +} + /* * Duplicate the btree cursor. * Allocate a new one, copy the record, re-get the buffers. 
@@ -505,10 +603,11 @@ xfs_btree_dup_cursor( new->bc_levels[i].ra = cur->bc_levels[i].ra; bp = cur->bc_levels[i].bp; if (bp) { - error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, - xfs_buf_daddr(bp), mp->m_bsize, - 0, &bp, - cur->bc_ops->buf_ops); + error = xfs_trans_read_buf(mp, tp, + xfs_btree_buftarg(cur), + xfs_buf_daddr(bp), + xfs_btree_bbsize(cur), 0, &bp, + cur->bc_ops->buf_ops); if (xfs_metadata_is_sick(error)) xfs_btree_mark_sick(new); if (error) { @@ -885,6 +984,32 @@ xfs_btree_readahead_fsblock( return rval; } +STATIC int +xfs_btree_readahead_memblock( + struct xfs_btree_cur *cur, + int lr, + struct xfs_btree_block *block) +{ + struct xfs_buftarg *btp = cur->bc_mem.xfbtree->target; + xfbno_t left = be64_to_cpu(block->bb_u.l.bb_leftsib); + xfbno_t right = be64_to_cpu(block->bb_u.l.bb_rightsib); + int rval = 0; + + if ((lr & XFS_BTCUR_LEFTRA) && left != NULLFSBLOCK) { + xfs_buf_readahead(btp, xfbno_to_daddr(left), XFBNO_BBSIZE, + cur->bc_ops->buf_ops); + rval++; + } + + if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLFSBLOCK) { + xfs_buf_readahead(btp, xfbno_to_daddr(right), XFBNO_BBSIZE, + cur->bc_ops->buf_ops); + rval++; + } + + return rval; +} + STATIC int xfs_btree_readahead_agblock( struct xfs_btree_cur *cur, @@ -939,9 +1064,17 @@ xfs_btree_readahead( cur->bc_levels[lev].ra |= lr; block = XFS_BUF_TO_BLOCK(cur->bc_levels[lev].bp); - if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) + switch (cur->bc_ops->type) { + case XFS_BTREE_TYPE_AG: + return xfs_btree_readahead_agblock(cur, lr, block); + case XFS_BTREE_TYPE_INODE: return xfs_btree_readahead_fsblock(cur, lr, block); - return xfs_btree_readahead_agblock(cur, lr, block); + case XFS_BTREE_TYPE_MEM: + return xfs_btree_readahead_memblock(cur, lr, block); + default: + ASSERT(0); + return 0; + } } STATIC int @@ -950,23 +1083,24 @@ xfs_btree_ptr_to_daddr( const union xfs_btree_ptr *ptr, xfs_daddr_t *daddr) { - xfs_fsblock_t fsbno; - xfs_agblock_t agbno; int error; error = xfs_btree_check_ptr(cur, ptr, 0, 1); if (error) return error; - if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) { - fsbno = be64_to_cpu(ptr->l); - *daddr = XFS_FSB_TO_DADDR(cur->bc_mp, fsbno); - } else { - agbno = be32_to_cpu(ptr->s); + switch (cur->bc_ops->type) { + case XFS_BTREE_TYPE_AG: *daddr = XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_ag.pag->pag_agno, - agbno); + be32_to_cpu(ptr->s)); + break; + case XFS_BTREE_TYPE_INODE: + *daddr = XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l)); + break; + case XFS_BTREE_TYPE_MEM: + *daddr = xfbno_to_daddr(be64_to_cpu(ptr->l)); + break; } - return 0; } @@ -986,8 +1120,9 @@ xfs_btree_readahead_ptr( if (xfs_btree_ptr_to_daddr(cur, ptr, &daddr)) return; - xfs_buf_readahead(cur->bc_mp->m_ddev_targp, daddr, - cur->bc_mp->m_bsize * count, cur->bc_ops->buf_ops); + xfs_buf_readahead(xfs_btree_buftarg(cur), daddr, + xfs_btree_bbsize(cur) * count, + cur->bc_ops->buf_ops); } /* @@ -1172,9 +1307,17 @@ static inline __u64 xfs_btree_owner( struct xfs_btree_cur *cur) { - if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) + switch (cur->bc_ops->type) { + case XFS_BTREE_TYPE_MEM: + return cur->bc_mem.xfbtree->owner; + case XFS_BTREE_TYPE_INODE: return cur->bc_ino.ip->i_ino; - return cur->bc_ag.pag->pag_agno; + case XFS_BTREE_TYPE_AG: + return cur->bc_ag.pag->pag_agno; + default: + ASSERT(0); + return 0; + } } void @@ -1218,12 +1361,18 @@ xfs_btree_buf_to_ptr( struct xfs_buf *bp, union xfs_btree_ptr *ptr) { - if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) - ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp, - xfs_buf_daddr(bp))); - else 
{ + switch (cur->bc_ops->type) { + case XFS_BTREE_TYPE_AG: ptr->s = cpu_to_be32(xfs_daddr_to_agbno(cur->bc_mp, xfs_buf_daddr(bp))); + break; + case XFS_BTREE_TYPE_INODE: + ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp, + xfs_buf_daddr(bp))); + break; + case XFS_BTREE_TYPE_MEM: + ptr->l = cpu_to_be64(xfs_daddr_to_xfbno(xfs_buf_daddr(bp))); + break; } } @@ -1242,15 +1391,14 @@ xfs_btree_get_buf_block( struct xfs_btree_block **block, struct xfs_buf **bpp) { - struct xfs_mount *mp = cur->bc_mp; - xfs_daddr_t d; - int error; + xfs_daddr_t d; + int error; error = xfs_btree_ptr_to_daddr(cur, ptr, &d); if (error) return error; - error = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d, mp->m_bsize, - 0, bpp); + error = xfs_trans_get_buf(cur->bc_tp, xfs_btree_buftarg(cur), d, + xfs_btree_bbsize(cur), 0, bpp); if (error) return error; @@ -1281,9 +1429,9 @@ xfs_btree_read_buf_block( error = xfs_btree_ptr_to_daddr(cur, ptr, &d); if (error) return error; - error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d, - mp->m_bsize, flags, bpp, - cur->bc_ops->buf_ops); + error = xfs_trans_read_buf(mp, cur->bc_tp, xfs_btree_buftarg(cur), d, + xfs_btree_bbsize(cur), flags, bpp, + cur->bc_ops->buf_ops); if (xfs_metadata_is_sick(error)) xfs_btree_mark_sick(cur); if (error) @@ -4582,6 +4730,8 @@ xfs_btree_fsblock_verify( xfs_fsblock_t fsb; xfs_failaddr_t fa; + ASSERT(!xfs_buftarg_is_mem(bp->b_target)); + /* numrecs verification */ if (be16_to_cpu(block->bb_numrecs) > max_recs) return __this_address; @@ -4596,6 +4746,36 @@ xfs_btree_fsblock_verify( return fa; } +/* Verify an in-memory btree block. */ +xfs_failaddr_t +xfs_btree_memblock_verify( + struct xfs_buf *bp, + unsigned int max_recs) +{ + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + struct xfs_buftarg *btp = bp->b_target; + xfs_failaddr_t fa; + xfbno_t bno; + + ASSERT(xfs_buftarg_is_mem(bp->b_target)); + + /* numrecs verification */ + if (be16_to_cpu(block->bb_numrecs) > max_recs) + return __this_address; + + /* sibling pointer verification */ + bno = xfs_daddr_to_xfbno(xfs_buf_daddr(bp)); + fa = xfs_btree_check_memblock_siblings(btp, bno, + block->bb_u.l.bb_leftsib); + if (fa) + return fa; + fa = xfs_btree_check_memblock_siblings(btp, bno, + block->bb_u.l.bb_rightsib); + if (fa) + return fa; + + return NULL; +} /** * xfs_btree_agblock_v5hdr_verify() -- verify the v5 fields of a short-format * btree block @@ -4637,6 +4817,8 @@ xfs_btree_agblock_verify( xfs_agblock_t agbno; xfs_failaddr_t fa; + ASSERT(!xfs_buftarg_is_mem(bp->b_target)); + /* numrecs verification */ if (be16_to_cpu(block->bb_numrecs) > max_recs) return __this_address; diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index bacd67cc8ced..f93374278aa1 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -112,6 +112,7 @@ static inline enum xbtree_key_contig xbtree_key_contig(uint64_t x, uint64_t y) enum xfs_btree_type { XFS_BTREE_TYPE_AG, XFS_BTREE_TYPE_INODE, + XFS_BTREE_TYPE_MEM, }; struct xfs_btree_ops { @@ -281,6 +282,10 @@ struct xfs_btree_cur struct xfs_buf *agbp; struct xbtree_afakeroot *afake; /* for staging cursor */ } bc_ag; + struct { + struct xfbtree *xfbtree; + struct xfs_perag *pag; + } bc_mem; }; /* per-format private data */ @@ -455,6 +460,8 @@ xfs_failaddr_t xfs_btree_fsblock_v5hdr_verify(struct xfs_buf *bp, uint64_t owner); xfs_failaddr_t xfs_btree_fsblock_verify(struct xfs_buf *bp, unsigned int max_recs); +xfs_failaddr_t xfs_btree_memblock_verify(struct xfs_buf *bp, + unsigned int max_recs); unsigned int 
xfs_btree_compute_maxlevels(const unsigned int *limits, unsigned long long records); diff --git a/fs/xfs/libxfs/xfs_btree_mem.c b/fs/xfs/libxfs/xfs_btree_mem.c new file mode 100644 index 000000000000..cb156e9363a5 --- /dev/null +++ b/fs/xfs/libxfs/xfs_btree_mem.c @@ -0,0 +1,228 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2021-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_trans.h" +#include "xfs_btree.h" +#include "xfs_error.h" +#include "xfs_buf_mem.h" +#include "xfs_btree_mem.h" +#include "xfs_ag.h" +#include "xfs_buf_item.h" +#include "xfs_trace.h" + +/* Set the root of an in-memory btree. */ +void +xfbtree_set_root( + struct xfs_btree_cur *cur, + const union xfs_btree_ptr *ptr, + int inc) +{ + ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_MEM); + + cur->bc_mem.xfbtree->root = *ptr; + cur->bc_mem.xfbtree->nlevels += inc; +} + +/* Initialize a pointer from the in-memory btree header. */ +void +xfbtree_init_ptr_from_cur( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr) +{ + ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_MEM); + + *ptr = cur->bc_mem.xfbtree->root; +} + +/* Duplicate an in-memory btree cursor. */ +struct xfs_btree_cur * +xfbtree_dup_cursor( + struct xfs_btree_cur *cur) +{ + struct xfs_btree_cur *ncur; + + ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_MEM); + + ncur = xfs_btree_alloc_cursor(cur->bc_mp, cur->bc_tp, cur->bc_ops, + cur->bc_maxlevels, cur->bc_cache); + ncur->bc_flags = cur->bc_flags; + ncur->bc_nlevels = cur->bc_nlevels; + ncur->bc_mem.xfbtree = cur->bc_mem.xfbtree; + + if (cur->bc_mem.pag) + ncur->bc_mem.pag = xfs_perag_hold(cur->bc_mem.pag); + + return ncur; +} + +/* Close the btree xfile and release all resources. */ +void +xfbtree_destroy( + struct xfbtree *xfbt) +{ + xfs_buftarg_drain(xfbt->target); +} + +/* Compute the number of bytes available for records. */ +static inline unsigned int +xfbtree_rec_bytes( + struct xfs_mount *mp, + const struct xfs_btree_ops *ops) +{ + return XMBUF_BLOCKSIZE - XFS_BTREE_LBLOCK_CRC_LEN; +} + +/* Initialize an empty leaf block as the btree root. */ +STATIC int +xfbtree_init_leaf_block( + struct xfs_mount *mp, + struct xfbtree *xfbt, + const struct xfs_btree_ops *ops) +{ + struct xfs_buf *bp; + xfbno_t bno = xfbt->highest_bno++; + int error; + + error = xfs_buf_get(xfbt->target, xfbno_to_daddr(bno), XFBNO_BBSIZE, + &bp); + if (error) + return error; + + trace_xfbtree_create_root_buf(xfbt, bp); + + bp->b_ops = ops->buf_ops; + xfs_btree_init_buf(mp, bp, ops, 0, 0, xfbt->owner); + xfs_buf_relse(bp); + + xfbt->root.l = cpu_to_be64(bno); + return 0; +} + +/* + * Create an in-memory btree root that can be used with the given xmbuf. + * Callers must set xfbt->owner. + */ +int +xfbtree_init( + struct xfs_mount *mp, + struct xfbtree *xfbt, + struct xfs_buftarg *btp, + const struct xfs_btree_ops *ops) +{ + unsigned int blocklen = xfbtree_rec_bytes(mp, ops); + unsigned int keyptr_len; + int error; + + /* Requires a long-format CRC-format btree */ + if (!xfs_has_crc(mp)) { + ASSERT(xfs_has_crc(mp)); + return -EINVAL; + } + if (ops->ptr_len != XFS_BTREE_LONG_PTR_LEN) { + ASSERT(ops->ptr_len == XFS_BTREE_LONG_PTR_LEN); + return -EINVAL; + } + + memset(xfbt, 0, sizeof(*xfbt)); + xfbt->target = btp; + + /* Set up min/maxrecs for this btree. 
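+ * Leaf blocks hold records; node blocks hold key/pointer pairs, where
+ * each pointer is a 64-bit block number (hence the sizeof(__be64)).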
*/ + keyptr_len = ops->key_len + sizeof(__be64); + xfbt->maxrecs[0] = blocklen / ops->rec_len; + xfbt->maxrecs[1] = blocklen / keyptr_len; + xfbt->minrecs[0] = xfbt->maxrecs[0] / 2; + xfbt->minrecs[1] = xfbt->maxrecs[1] / 2; + xfbt->highest_bno = 0; + xfbt->nlevels = 1; + + /* Initialize the empty btree. */ + error = xfbtree_init_leaf_block(mp, xfbt, ops); + if (error) + goto err_freesp; + + trace_xfbtree_init(mp, xfbt, ops); + + return 0; + +err_freesp: + xfs_buftarg_drain(xfbt->target); + return error; +} + +/* Allocate a block to our in-memory btree. */ +int +xfbtree_alloc_block( + struct xfs_btree_cur *cur, + const union xfs_btree_ptr *start, + union xfs_btree_ptr *new, + int *stat) +{ + struct xfbtree *xfbt = cur->bc_mem.xfbtree; + xfbno_t bno = xfbt->highest_bno++; + + ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_MEM); + + trace_xfbtree_alloc_block(xfbt, cur, bno); + + /* Fail if the block address exceeds the maximum for the buftarg. */ + if (!xfbtree_verify_bno(xfbt, bno)) { + ASSERT(xfbtree_verify_bno(xfbt, bno)); + *stat = 0; + return 0; + } + + new->l = cpu_to_be64(bno); + *stat = 1; + return 0; +} + +/* Free a block from our in-memory btree. */ +int +xfbtree_free_block( + struct xfs_btree_cur *cur, + struct xfs_buf *bp) +{ + struct xfbtree *xfbt = cur->bc_mem.xfbtree; + xfs_daddr_t daddr = xfs_buf_daddr(bp); + xfbno_t bno = xfs_daddr_to_xfbno(daddr); + + ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_MEM); + + trace_xfbtree_free_block(xfbt, cur, bno); + + if (bno + 1 == xfbt->highest_bno) + xfbt->highest_bno--; + + return 0; +} + +/* Return the minimum number of records for a btree block. */ +int +xfbtree_get_minrecs( + struct xfs_btree_cur *cur, + int level) +{ + struct xfbtree *xfbt = cur->bc_mem.xfbtree; + + return xfbt->minrecs[level != 0]; +} + +/* Return the maximum number of records for a btree block. */ +int +xfbtree_get_maxrecs( + struct xfs_btree_cur *cur, + int level) +{ + struct xfbtree *xfbt = cur->bc_mem.xfbtree; + + return xfbt->maxrecs[level != 0]; +} diff --git a/fs/xfs/libxfs/xfs_btree_mem.h b/fs/xfs/libxfs/xfs_btree_mem.h new file mode 100644 index 000000000000..ecc2ceac3ed4 --- /dev/null +++ b/fs/xfs/libxfs/xfs_btree_mem.h @@ -0,0 +1,72 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2021-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong + */ +#ifndef __XFS_BTREE_MEM_H__ +#define __XFS_BTREE_MEM_H__ + +typedef uint64_t xfbno_t; + +#define XFBNO_BLOCKSIZE (XMBUF_BLOCKSIZE) +#define XFBNO_BBSHIFT (XMBUF_BLOCKSHIFT - BBSHIFT) +#define XFBNO_BBSIZE (XFBNO_BLOCKSIZE >> BBSHIFT) + +static inline xfs_daddr_t xfbno_to_daddr(xfbno_t blkno) +{ + return blkno << XFBNO_BBSHIFT; +} + +static inline xfbno_t xfs_daddr_to_xfbno(xfs_daddr_t daddr) +{ + return daddr >> XFBNO_BBSHIFT; +} + +struct xfbtree { + /* buffer cache target for this in-memory btree */ + struct xfs_buftarg *target; + + /* Highest block number that has been written to. */ + xfbno_t highest_bno; + + /* Owner of this btree. */ + unsigned long long owner; + + /* Btree header */ + union xfs_btree_ptr root; + unsigned int nlevels; + + /* Minimum and maximum records per block. 
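+ * Index 0 describes leaf blocks; index 1 describes node blocks.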
*/ + unsigned int maxrecs[2]; + unsigned int minrecs[2]; +}; + +#ifdef CONFIG_XFS_BTREE_IN_MEM +static inline bool xfbtree_verify_bno(struct xfbtree *xfbt, xfbno_t bno) +{ + return xmbuf_verify_daddr(xfbt->target, xfbno_to_daddr(bno)); +} + +void xfbtree_set_root(struct xfs_btree_cur *cur, + const union xfs_btree_ptr *ptr, int inc); +void xfbtree_init_ptr_from_cur(struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr); +struct xfs_btree_cur *xfbtree_dup_cursor(struct xfs_btree_cur *cur); + +int xfbtree_get_minrecs(struct xfs_btree_cur *cur, int level); +int xfbtree_get_maxrecs(struct xfs_btree_cur *cur, int level); + +int xfbtree_alloc_block(struct xfs_btree_cur *cur, + const union xfs_btree_ptr *start, union xfs_btree_ptr *ptr, + int *stat); +int xfbtree_free_block(struct xfs_btree_cur *cur, struct xfs_buf *bp); + +/* Callers must set xfbt->target and xfbt->owner before calling this */ +int xfbtree_init(struct xfs_mount *mp, struct xfbtree *xfbt, + struct xfs_buftarg *btp, const struct xfs_btree_ops *ops); +void xfbtree_destroy(struct xfbtree *xfbt); +#else +# define xfbtree_verify_bno(...) (false) +#endif /* CONFIG_XFS_BTREE_IN_MEM */ + +#endif /* __XFS_BTREE_MEM_H__ */ diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index aeac9cae4ad4..6828e72824fb 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -15,6 +15,7 @@ #include "xfs_quota.h" #include "xfs_qm.h" #include "xfs_scrub.h" +#include "xfs_buf_mem.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" @@ -190,6 +191,10 @@ xchk_teardown( sc->flags &= ~XCHK_HAVE_FREEZE_PROT; mnt_drop_write_file(sc->file); } + if (sc->xmbtp) { + xmbuf_free(sc->xmbtp); + sc->xmbtp = NULL; + } if (sc->xfile) { xfile_destroy(sc->xfile); sc->xfile = NULL; diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h index f99a3c21d02e..1247284c17a0 100644 --- a/fs/xfs/scrub/scrub.h +++ b/fs/xfs/scrub/scrub.h @@ -99,6 +99,9 @@ struct xfs_scrub { /* xfile used by the scrubbers; freed at teardown. */ struct xfile *xfile; + /* buffer target for in-memory btrees; also freed at teardown. */ + struct xfs_buftarg *xmbtp; + /* Lock flags for @ip. */ uint ilock_flags; diff --git a/fs/xfs/xfs_buf_mem.c b/fs/xfs/xfs_buf_mem.c index be71ba1a3d7b..73caa2ea8b18 100644 --- a/fs/xfs/xfs_buf_mem.c +++ b/fs/xfs/xfs_buf_mem.c @@ -187,3 +187,16 @@ xmbuf_unmap_page( bp->b_pages = NULL; bp->b_page_count = 0; } + +/* Is this a valid daddr within the buftarg? */ +bool +xmbuf_verify_daddr( + struct xfs_buftarg *btp, + xfs_daddr_t daddr) +{ + struct inode *inode = file_inode(btp->bt_file); + + ASSERT(xfs_buftarg_is_mem(btp)); + + return daddr < (inode->i_sb->s_maxbytes >> BBSHIFT); +} diff --git a/fs/xfs/xfs_buf_mem.h b/fs/xfs/xfs_buf_mem.h index 945f4b610998..dfb3029113ff 100644 --- a/fs/xfs/xfs_buf_mem.h +++ b/fs/xfs/xfs_buf_mem.h @@ -21,10 +21,12 @@ void xmbuf_free(struct xfs_buftarg *btp); int xmbuf_map_page(struct xfs_buf *bp); void xmbuf_unmap_page(struct xfs_buf *bp); +bool xmbuf_verify_daddr(struct xfs_buftarg *btp, xfs_daddr_t daddr); #else # define xfs_buftarg_is_mem(...) (false) # define xmbuf_map_page(...) (-ENOMEM) # define xmbuf_unmap_page(...) ((void)0) +# define xmbuf_verify_daddr(...) 
(false) #endif /* CONFIG_XFS_MEMORY_BUFS */ #endif /* __XFS_BUF_MEM_H__ */ diff --git a/fs/xfs/xfs_health.c b/fs/xfs/xfs_health.c index b28546b6fe34..b39f959146bc 100644 --- a/fs/xfs/xfs_health.c +++ b/fs/xfs/xfs_health.c @@ -527,6 +527,9 @@ xfs_btree_mark_sick( struct xfs_btree_cur *cur) { switch (cur->bc_ops->type) { + case XFS_BTREE_TYPE_MEM: + /* no health state tracking for ephemeral btrees */ + return; case XFS_BTREE_TYPE_AG: ASSERT(cur->bc_ops->sick_mask); xfs_ag_mark_sick(cur->bc_ag.pag, cur->bc_ops->sick_mask); diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c index ae5be6b589f0..3f253884fe5b 100644 --- a/fs/xfs/xfs_trace.c +++ b/fs/xfs/xfs_trace.c @@ -37,6 +37,7 @@ #include #include "xfs_iomap.h" #include "xfs_buf_mem.h" +#include "xfs_btree_mem.h" /* * We include this last to have the helpers above available for the trace diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 14cb8752e3d3..4971d650bd6b 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -79,6 +79,8 @@ union xfs_btree_ptr; struct xfs_dqtrx; struct xfs_icwalk; struct xfs_perag; +struct xfbtree; +struct xfs_btree_ops; #define XFS_ATTR_FILTER_FLAGS \ { XFS_ATTR_ROOT, "ROOT" }, \ @@ -2499,12 +2501,19 @@ TRACE_EVENT(xfs_btree_alloc_block, ), TP_fast_assign( __entry->dev = cur->bc_mp->m_super->s_dev; - if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) { + switch (cur->bc_ops->type) { + case XFS_BTREE_TYPE_INODE: __entry->agno = 0; __entry->ino = cur->bc_ino.ip->i_ino; - } else { + break; + case XFS_BTREE_TYPE_AG: __entry->agno = cur->bc_ag.pag->pag_agno; __entry->ino = 0; + break; + case XFS_BTREE_TYPE_MEM: + __entry->agno = 0; + __entry->ino = 0; + break; } __assign_str(name, cur->bc_ops->name); __entry->error = error; @@ -4563,6 +4572,110 @@ TRACE_EVENT(xmbuf_free, ); #endif /* CONFIG_XFS_MEMORY_BUFS */ +#ifdef CONFIG_XFS_BTREE_IN_MEM +TRACE_EVENT(xfbtree_init, + TP_PROTO(struct xfs_mount *mp, struct xfbtree *xfbt, + const struct xfs_btree_ops *ops), + TP_ARGS(mp, xfbt, ops), + TP_STRUCT__entry( + __field(const void *, btree_ops) + __field(unsigned long, xfino) + __field(unsigned int, leaf_mxr) + __field(unsigned int, leaf_mnr) + __field(unsigned int, node_mxr) + __field(unsigned int, node_mnr) + __field(unsigned long long, owner) + ), + TP_fast_assign( + __entry->btree_ops = ops; + __entry->xfino = file_inode(xfbt->target->bt_file)->i_ino; + __entry->leaf_mxr = xfbt->maxrecs[0]; + __entry->node_mxr = xfbt->maxrecs[1]; + __entry->leaf_mnr = xfbt->minrecs[0]; + __entry->node_mnr = xfbt->minrecs[1]; + __entry->owner = xfbt->owner; + ), + TP_printk("xfino 0x%lx btree_ops %pS owner 0x%llx leaf_mxr %u leaf_mnr %u node_mxr %u node_mnr %u", + __entry->xfino, + __entry->btree_ops, + __entry->owner, + __entry->leaf_mxr, + __entry->leaf_mnr, + __entry->node_mxr, + __entry->node_mnr) +); + +DECLARE_EVENT_CLASS(xfbtree_buf_class, + TP_PROTO(struct xfbtree *xfbt, struct xfs_buf *bp), + TP_ARGS(xfbt, bp), + TP_STRUCT__entry( + __field(unsigned long, xfino) + __field(xfs_daddr_t, bno) + __field(int, nblks) + __field(int, hold) + __field(int, pincount) + __field(unsigned int, lockval) + __field(unsigned int, flags) + ), + TP_fast_assign( + __entry->xfino = file_inode(xfbt->target->bt_file)->i_ino; + __entry->bno = xfs_buf_daddr(bp); + __entry->nblks = bp->b_length; + __entry->hold = atomic_read(&bp->b_hold); + __entry->pincount = atomic_read(&bp->b_pin_count); + __entry->lockval = bp->b_sema.count; + __entry->flags = bp->b_flags; + ), + TP_printk("xfino 0x%lx daddr 0x%llx bbcount 0x%x hold %d pincount %d lock %d flags 
%s", + __entry->xfino, + (unsigned long long)__entry->bno, + __entry->nblks, + __entry->hold, + __entry->pincount, + __entry->lockval, + __print_flags(__entry->flags, "|", XFS_BUF_FLAGS)) +) + +#define DEFINE_XFBTREE_BUF_EVENT(name) \ +DEFINE_EVENT(xfbtree_buf_class, name, \ + TP_PROTO(struct xfbtree *xfbt, struct xfs_buf *bp), \ + TP_ARGS(xfbt, bp)) +DEFINE_XFBTREE_BUF_EVENT(xfbtree_create_root_buf); +DEFINE_XFBTREE_BUF_EVENT(xfbtree_trans_commit_buf); +DEFINE_XFBTREE_BUF_EVENT(xfbtree_trans_cancel_buf); + +DECLARE_EVENT_CLASS(xfbtree_freesp_class, + TP_PROTO(struct xfbtree *xfbt, struct xfs_btree_cur *cur, + xfs_fileoff_t fileoff), + TP_ARGS(xfbt, cur, fileoff), + TP_STRUCT__entry( + __field(unsigned long, xfino) + __string(btname, cur->bc_ops->name) + __field(int, nlevels) + __field(xfs_fileoff_t, fileoff) + ), + TP_fast_assign( + __entry->xfino = file_inode(xfbt->target->bt_file)->i_ino; + __assign_str(btname, cur->bc_ops->name); + __entry->nlevels = cur->bc_nlevels; + __entry->fileoff = fileoff; + ), + TP_printk("xfino 0x%lx %sbt nlevels %d fileoff 0x%llx", + __entry->xfino, + __get_str(btname), + __entry->nlevels, + (unsigned long long)__entry->fileoff) +) + +#define DEFINE_XFBTREE_FREESP_EVENT(name) \ +DEFINE_EVENT(xfbtree_freesp_class, name, \ + TP_PROTO(struct xfbtree *xfbt, struct xfs_btree_cur *cur, \ + xfs_fileoff_t fileoff), \ + TP_ARGS(xfbt, cur, fileoff)) +DEFINE_XFBTREE_FREESP_EVENT(xfbtree_alloc_block); +DEFINE_XFBTREE_FREESP_EVENT(xfbtree_free_block); +#endif /* CONFIG_XFS_BTREE_IN_MEM */ + #endif /* _TRACE_XFS_H */ #undef TRACE_INCLUDE_PATH -- cgit v1.2.3 From 5049ff4d140c8f6545464811409302cab017321a Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 22 Feb 2024 12:43:36 -0800 Subject: xfs: create a helper to decide if a file mapping targets the rt volume Create a helper so that we can stop open-coding this decision everywhere. Signed-off-by: Darrick J. 
Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_bmap.c | 6 +++--- fs/xfs/libxfs/xfs_inode_fork.c | 9 +++++++++ fs/xfs/libxfs/xfs_inode_fork.h | 1 + fs/xfs/scrub/bmap.c | 2 +- 4 files changed, 14 insertions(+), 4 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 2e0546eb8010..b96807a53905 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -4913,7 +4913,7 @@ xfs_bmap_del_extent_delay( XFS_STATS_INC(mp, xs_del_exlist); - isrt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip); + isrt = xfs_ifork_is_realtime(ip, whichfork); del_endoff = del->br_startoff + del->br_blockcount; got_endoff = got->br_startoff + got->br_blockcount; da_old = startblockval(got->br_startblock); @@ -5149,7 +5149,7 @@ xfs_bmap_del_extent_real( return -ENOSPC; *logflagsp = XFS_ILOG_CORE; - if (whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip)) { + if (xfs_ifork_is_realtime(ip, whichfork)) { if (!(bflags & XFS_BMAPI_REMAP)) { error = xfs_rtfree_blocks(tp, del->br_startblock, del->br_blockcount); @@ -5396,7 +5396,7 @@ __xfs_bunmapi( return 0; } XFS_STATS_INC(mp, xs_blk_unmap); - isrt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip); + isrt = xfs_ifork_is_realtime(ip, whichfork); end = start + len; if (!xfs_iext_lookup_extent_before(ip, ifp, &end, &icur, &got)) { diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 83194edcb0ba..3ab0ea133557 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -813,3 +813,12 @@ xfs_iext_count_upgrade( return 0; } + +/* Decide if a file mapping is on the realtime device or not. */ +bool +xfs_ifork_is_realtime( + struct xfs_inode *ip, + int whichfork) +{ + return XFS_IS_REALTIME_INODE(ip) && whichfork != XFS_ATTR_FORK; +} diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index 96303249d28a..bd53eb951b65 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h @@ -260,6 +260,7 @@ int xfs_iext_count_may_overflow(struct xfs_inode *ip, int whichfork, int nr_to_add); int xfs_iext_count_upgrade(struct xfs_trans *tp, struct xfs_inode *ip, uint nr_to_add); +bool xfs_ifork_is_realtime(struct xfs_inode *ip, int whichfork); /* returns true if the fork has extents but they are not read in yet. */ static inline bool xfs_need_iread_extents(const struct xfs_ifork *ifp) diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index b169cddde6da..24a15bf784f1 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -924,7 +924,7 @@ xchk_bmap( if (!ifp) return -ENOENT; - info.is_rt = whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip); + info.is_rt = xfs_ifork_is_realtime(ip, whichfork); info.whichfork = whichfork; info.is_shared = whichfork == XFS_DATA_FORK && xfs_is_reflink_inode(ip); info.sc = sc; -- cgit v1.2.3 From 0dc63c8a1ce39c1ac7da536ee9174cdc714afae2 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 22 Feb 2024 12:43:36 -0800 Subject: xfs: launder in-memory btree buffers before transaction commit As we've noted in various places, all current users of in-memory btrees are online fsck. Online fsck only stages a btree long enough to rebuild an ondisk data structure, which means that the in-memory btree is ephemeral. Furthermore, if we encounter /any/ errors while updating an in-memory btree, all we do is tear down all the staged data and return an errno to userspace. 
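To make that lifecycle concrete, a consumer of these helpers ends up doing roughly the following (a minimal sketch, not code from this series: mp, tp, xmbtp, pag, and ops come from the caller, and xfs_rmapbt_mem_cursor is a hypothetical stand-in for the per-btree-type cursor constructor that the design document describes alongside xfs_rmapbt_mem_create):

	struct xfbtree		xfbt;
	struct xfs_btree_cur	*cur;
	int			stat, error;

	/* Stage an empty btree in the in-memory buftarg (owner setup elided). */
	error = xfbtree_init(mp, &xfbt, xmbtp, ops);
	if (error)
		return error;

	/* Apply ordinary btree updates through an in-memory cursor. */
	cur = xfs_rmapbt_mem_cursor(pag, tp, &xfbt);	/* hypothetical */
	/* cur->bc_rec would be filled in by per-type helpers here. */
	error = xfs_btree_insert(cur, &stat);
	xfs_btree_del_cursor(cur, error);

	/* Launder the xfbtree buffers back out of the transaction. */
	if (error)
		xfbtree_trans_cancel(&xfbt, tp);
	else
		error = xfbtree_trans_commit(&xfbt, tp);

	/* The btree is ephemeral; throw it all away when finished. */
	xfbtree_destroy(&xfbt);
	return error;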
In-memory btrees need not be transactional, so their buffers should not be committed to the ondisk log, nor should they be checkpointed by the AIL. That's just as well since the ephemeral nature of the btree means that the buftarg and the buffers may disappear quickly anyway. Therefore, we need a way to launder the btree buffers that get attached to the transaction by the generic btree code. Because the buffers are directly mapped to backing file pages, there's no need to bwrite them back to the tmpfs file. All we need to do is clean enough of the buffer log item state so that the bli can be detached from the buffer, remove the bli from the transaction's log item list, and reset the transaction dirty state as if the laundered items had never been there. For simplicity, create xfbtree transaction commit and cancel helpers that launder the in-memory btree buffers for callers. Once laundered, call the write verifier on non-stale buffers to avoid integrity issues, or punch a hole in the backing file for stale buffers. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_btree_mem.c | 119 ++++++++++++++++++++++++++++++++++++++++++ fs/xfs/libxfs/xfs_btree_mem.h | 3 ++ fs/xfs/xfs_buf_mem.c | 68 ++++++++++++++++++++++++ fs/xfs/xfs_buf_mem.h | 2 + fs/xfs/xfs_trace.h | 1 + fs/xfs/xfs_trans.h | 1 + fs/xfs/xfs_trans_buf.c | 42 +++++++++++++++ 7 files changed, 236 insertions(+) diff --git a/fs/xfs/libxfs/xfs_btree_mem.c b/fs/xfs/libxfs/xfs_btree_mem.c index cb156e9363a5..036061fe32cc 100644 --- a/fs/xfs/libxfs/xfs_btree_mem.c +++ b/fs/xfs/libxfs/xfs_btree_mem.c @@ -226,3 +226,122 @@ xfbtree_get_maxrecs( return xfbt->maxrecs[level != 0]; } + +/* If this log item is a buffer item that came from the xfbtree, return it. */ +static inline struct xfs_buf * +xfbtree_buf_match( + struct xfbtree *xfbt, + const struct xfs_log_item *lip) +{ + const struct xfs_buf_log_item *bli; + struct xfs_buf *bp; + + if (lip->li_type != XFS_LI_BUF) + return NULL; + + bli = container_of(lip, struct xfs_buf_log_item, bli_item); + bp = bli->bli_buf; + if (bp->b_target != xfbt->target) + return NULL; + + return bp; +} + +/* + * Commit changes to the incore btree immediately by writing all dirty xfbtree + * buffers to the backing xfile. This detaches all xfbtree buffers from the + * transaction, even on failure. The buffer locks are dropped between the + * delwri queue and submit, so the caller must synchronize btree access. + * + * Normally we'd let the buffers commit with the transaction and get written to + * the xfile via the log, but online repair stages ephemeral btrees in memory + * and uses the btree_staging functions to write new btrees to disk atomically. + * The in-memory btree (and its backing store) are discarded at the end of the + * repair phase, which means that xfbtree buffers cannot commit with the rest + * of a transaction. + * + * In other words, online repair only needs the transaction to collect buffer + * pointers and to avoid buffer deadlocks, not to guarantee consistency of + * updates. + */ +int +xfbtree_trans_commit( + struct xfbtree *xfbt, + struct xfs_trans *tp) +{ + struct xfs_log_item *lip, *n; + bool tp_dirty = false; + int error = 0; + + /* + * For each xfbtree buffer attached to the transaction, write the dirty + * buffers to the xfile and release them. 
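+ * The buffers are direct-mapped to pages of the backing file, so
+ * "writing" amounts to verifying the contents and detaching the bli;
+ * no I/O is issued.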
+ */ + list_for_each_entry_safe(lip, n, &tp->t_items, li_trans) { + struct xfs_buf *bp = xfbtree_buf_match(xfbt, lip); + + if (!bp) { + if (test_bit(XFS_LI_DIRTY, &lip->li_flags)) + tp_dirty |= true; + continue; + } + + trace_xfbtree_trans_commit_buf(xfbt, bp); + + xmbuf_trans_bdetach(tp, bp); + + /* + * If the buffer fails verification, note the failure but + * continue walking the transaction items so that we remove all + * ephemeral btree buffers. + */ + if (!error) + error = xmbuf_finalize(bp); + + xfs_buf_relse(bp); + } + + /* + * Reset the transaction's dirty flag to reflect the dirty state of the + * log items that are still attached. + */ + tp->t_flags = (tp->t_flags & ~XFS_TRANS_DIRTY) | + (tp_dirty ? XFS_TRANS_DIRTY : 0); + + return error; +} + +/* + * Cancel changes to the incore btree by detaching all the xfbtree buffers. + * Changes are not undone, so callers must not access the btree ever again. + */ +void +xfbtree_trans_cancel( + struct xfbtree *xfbt, + struct xfs_trans *tp) +{ + struct xfs_log_item *lip, *n; + bool tp_dirty = false; + + list_for_each_entry_safe(lip, n, &tp->t_items, li_trans) { + struct xfs_buf *bp = xfbtree_buf_match(xfbt, lip); + + if (!bp) { + if (test_bit(XFS_LI_DIRTY, &lip->li_flags)) + tp_dirty |= true; + continue; + } + + trace_xfbtree_trans_cancel_buf(xfbt, bp); + + xmbuf_trans_bdetach(tp, bp); + xfs_buf_relse(bp); + } + + /* + * Reset the transaction's dirty flag to reflect the dirty state of the + * log items that are still attached. + */ + tp->t_flags = (tp->t_flags & ~XFS_TRANS_DIRTY) | + (tp_dirty ? XFS_TRANS_DIRTY : 0); +} diff --git a/fs/xfs/libxfs/xfs_btree_mem.h b/fs/xfs/libxfs/xfs_btree_mem.h index ecc2ceac3ed4..1c3825786ec8 100644 --- a/fs/xfs/libxfs/xfs_btree_mem.h +++ b/fs/xfs/libxfs/xfs_btree_mem.h @@ -65,6 +65,9 @@ int xfbtree_free_block(struct xfs_btree_cur *cur, struct xfs_buf *bp); int xfbtree_init(struct xfs_mount *mp, struct xfbtree *xfbt, struct xfs_buftarg *btp, const struct xfs_btree_ops *ops); void xfbtree_destroy(struct xfbtree *xfbt); + +int xfbtree_trans_commit(struct xfbtree *xfbt, struct xfs_trans *tp); +void xfbtree_trans_cancel(struct xfbtree *xfbt, struct xfs_trans *tp); #else # define xfbtree_verify_bno(...) (false) #endif /* CONFIG_XFS_BTREE_IN_MEM */ diff --git a/fs/xfs/xfs_buf_mem.c b/fs/xfs/xfs_buf_mem.c index 73caa2ea8b18..8ad38c64708e 100644 --- a/fs/xfs/xfs_buf_mem.c +++ b/fs/xfs/xfs_buf_mem.c @@ -9,6 +9,10 @@ #include "xfs_buf_mem.h" #include "xfs_trace.h" #include +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_buf_item.h" +#include "xfs_error.h" /* * Buffer Cache for In-Memory Files @@ -200,3 +204,67 @@ xmbuf_verify_daddr( return daddr < (inode->i_sb->s_maxbytes >> BBSHIFT); } + +/* Discard the page backing this buffer. */ +static void +xmbuf_stale( + struct xfs_buf *bp) +{ + struct inode *inode = file_inode(bp->b_target->bt_file); + loff_t pos; + + ASSERT(xfs_buftarg_is_mem(bp->b_target)); + + pos = BBTOB(xfs_buf_daddr(bp)); + shmem_truncate_range(inode, pos, pos + BBTOB(bp->b_length) - 1); +} + +/* + * Finalize a buffer -- discard the backing page if it's stale, or run the + * write verifier to detect problems. + */ +int +xmbuf_finalize( + struct xfs_buf *bp) +{ + xfs_failaddr_t fa; + int error = 0; + + if (bp->b_flags & XBF_STALE) { + xmbuf_stale(bp); + return 0; + } + + /* + * Although this btree is ephemeral, validate the buffer structure so + * that we can detect memory corruption errors and software bugs. 
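+ * (Stale buffers had their backing pages punched out above, so only
+ * live buffer contents reach the verifier.)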
+ */ + fa = bp->b_ops->verify_struct(bp); + if (fa) { + error = -EFSCORRUPTED; + xfs_verifier_error(bp, error, fa); + } + + return error; +} + +/* + * Detach this xmbuf buffer from the transaction by any means necessary. + * All buffers are direct-mapped, so they do not need bwrite. + */ +void +xmbuf_trans_bdetach( + struct xfs_trans *tp, + struct xfs_buf *bp) +{ + struct xfs_buf_log_item *bli = bp->b_log_item; + + ASSERT(bli != NULL); + + bli->bli_flags &= ~(XFS_BLI_DIRTY | XFS_BLI_ORDERED | + XFS_BLI_LOGGED | XFS_BLI_STALE); + clear_bit(XFS_LI_DIRTY, &bli->bli_item.li_flags); + + while (bp->b_log_item != NULL) + xfs_trans_bdetach(tp, bp); +} diff --git a/fs/xfs/xfs_buf_mem.h b/fs/xfs/xfs_buf_mem.h index dfb3029113ff..eed4a7b63232 100644 --- a/fs/xfs/xfs_buf_mem.h +++ b/fs/xfs/xfs_buf_mem.h @@ -22,6 +22,8 @@ void xmbuf_free(struct xfs_buftarg *btp); int xmbuf_map_page(struct xfs_buf *bp); void xmbuf_unmap_page(struct xfs_buf *bp); bool xmbuf_verify_daddr(struct xfs_buftarg *btp, xfs_daddr_t daddr); +void xmbuf_trans_bdetach(struct xfs_trans *tp, struct xfs_buf *bp); +int xmbuf_finalize(struct xfs_buf *bp); #else # define xfs_buftarg_is_mem(...) (false) # define xmbuf_map_page(...) (-ENOMEM) diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 4971d650bd6b..e85ea1fdb9c1 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -642,6 +642,7 @@ DEFINE_BUF_ITEM_EVENT(xfs_trans_read_buf); DEFINE_BUF_ITEM_EVENT(xfs_trans_read_buf_recur); DEFINE_BUF_ITEM_EVENT(xfs_trans_log_buf); DEFINE_BUF_ITEM_EVENT(xfs_trans_brelse); +DEFINE_BUF_ITEM_EVENT(xfs_trans_bdetach); DEFINE_BUF_ITEM_EVENT(xfs_trans_bjoin); DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold); DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold_release); diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 08ce757c7454..3f7e3a09a49f 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -215,6 +215,7 @@ struct xfs_buf *xfs_trans_getsb(struct xfs_trans *); void xfs_trans_brelse(xfs_trans_t *, struct xfs_buf *); void xfs_trans_bjoin(xfs_trans_t *, struct xfs_buf *); +void xfs_trans_bdetach(struct xfs_trans *tp, struct xfs_buf *bp); void xfs_trans_bhold(xfs_trans_t *, struct xfs_buf *); void xfs_trans_bhold_release(xfs_trans_t *, struct xfs_buf *); void xfs_trans_binval(xfs_trans_t *, struct xfs_buf *); diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 6549e50d852c..e28ab74af4f0 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -392,6 +392,48 @@ xfs_trans_brelse( xfs_buf_relse(bp); } +/* + * Forcibly detach a buffer previously joined to the transaction. The caller + * will retain its locked reference to the buffer after this function returns. + * The buffer must be completely clean and must not be held to the transaction. + */ +void +xfs_trans_bdetach( + struct xfs_trans *tp, + struct xfs_buf *bp) +{ + struct xfs_buf_log_item *bip = bp->b_log_item; + + ASSERT(tp != NULL); + ASSERT(bp->b_transp == tp); + ASSERT(bip->bli_item.li_type == XFS_LI_BUF); + ASSERT(atomic_read(&bip->bli_refcount) > 0); + + trace_xfs_trans_bdetach(bip); + + /* + * Erase all recursion count, since we're removing this buffer from the + * transaction. + */ + bip->bli_recur = 0; + + /* + * The buffer must be completely clean. Specifically, it had better + * not be dirty, stale, logged, ordered, or held to the transaction. 
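+ * (Each of those states would oblige the transaction to log or write
+ * the buffer, which is exactly what detaching promises not to do.)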
+ */ + ASSERT(!test_bit(XFS_LI_DIRTY, &bip->bli_item.li_flags)); + ASSERT(!(bip->bli_flags & XFS_BLI_DIRTY)); + ASSERT(!(bip->bli_flags & XFS_BLI_HOLD)); + ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED)); + ASSERT(!(bip->bli_flags & XFS_BLI_ORDERED)); + ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); + + /* Unlink the log item from the transaction and drop the log item. */ + xfs_trans_del_item(&bip->bli_item); + xfs_buf_item_put(bip); + bp->b_transp = NULL; +} + /* * Mark the buffer as not needing to be unlocked when the buf item's * iop_committing() routine is called. The buffer must already be locked -- cgit v1.2.3 From e4fd1def30987bbf0fb00867d22cc2634b8dacf0 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 22 Feb 2024 12:43:37 -0800 Subject: xfs: create agblock bitmap helper to count the number of set regions In the next patch, the rmap btree repair code will need to estimate the size of the new ondisk rmapbt. The size is a function of the number of records that will be written to disk, and the size of the recordset is the number of observations made while scanning the filesystem plus the number of OWN_AG records that will be injected into the rmap btree. OWN_AG rmap records track the free space btrees, the AGFL, and the new rmap btree itself. The repair tool uses a bitmap to record the space used for all four structures, which is why we need a function to count the number of set regions. A reviewer requested that this be pulled into a separate patch with its own justification, so here it is. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/agb_bitmap.h | 5 +++++ fs/xfs/scrub/bitmap.c | 14 ++++++++++++++ fs/xfs/scrub/bitmap.h | 2 ++ 3 files changed, 21 insertions(+) diff --git a/fs/xfs/scrub/agb_bitmap.h b/fs/xfs/scrub/agb_bitmap.h index ed08f76ff4f3..e488e1f4f63d 100644 --- a/fs/xfs/scrub/agb_bitmap.h +++ b/fs/xfs/scrub/agb_bitmap.h @@ -65,4 +65,9 @@ int xagb_bitmap_set_btblocks(struct xagb_bitmap *bitmap, int xagb_bitmap_set_btcur_path(struct xagb_bitmap *bitmap, struct xfs_btree_cur *cur); +static inline uint32_t xagb_bitmap_count_set_regions(struct xagb_bitmap *b) +{ + return xbitmap32_count_set_regions(&b->agbitmap); +} + #endif /* __XFS_SCRUB_AGB_BITMAP_H__ */ diff --git a/fs/xfs/scrub/bitmap.c b/fs/xfs/scrub/bitmap.c index 1449bb5262d9..0cb8d43912a8 100644 --- a/fs/xfs/scrub/bitmap.c +++ b/fs/xfs/scrub/bitmap.c @@ -566,3 +566,17 @@ xbitmap32_test( *len = bn->bn_start - start; return false; } + +/* Count the number of set regions in this bitmap. */ +uint32_t +xbitmap32_count_set_regions( + struct xbitmap32 *bitmap) +{ + struct xbitmap32_node *bn; + uint32_t nr = 0; + + for_each_xbitmap32_extent(bn, bitmap) + nr++; + + return nr; +} diff --git a/fs/xfs/scrub/bitmap.h b/fs/xfs/scrub/bitmap.h index 2df8911606d6..710c1ac5e323 100644 --- a/fs/xfs/scrub/bitmap.h +++ b/fs/xfs/scrub/bitmap.h @@ -62,4 +62,6 @@ int xbitmap32_walk(struct xbitmap32 *bitmap, xbitmap32_walk_fn fn, bool xbitmap32_empty(struct xbitmap32 *bitmap); bool xbitmap32_test(struct xbitmap32 *bitmap, uint32_t start, uint32_t *len); +uint32_t xbitmap32_count_set_regions(struct xbitmap32 *bitmap); + #endif /* __XFS_SCRUB_BITMAP_H__ */ -- cgit v1.2.3 From 32080a9b9b2ef8f4089e8e28a2c307334431757e Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 22 Feb 2024 12:43:38 -0800 Subject: xfs: repair the rmapbt Rebuild the reverse mapping btree from all primary metadata. 
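One of the supporting pieces added here is xfs_bmap_query_all(), which walks every record in a file's block mapping btree and invokes a caller-supplied function on each one. Its shape in use is roughly this (illustrative sketch only; the counting callback is invented for the example and is not part of this patch):

	/* Illustrative callback: tally the mappings in one fork. */
	STATIC int
	xrep_rmap_count_bmbt_recs(
		struct xfs_btree_cur	*cur,
		struct xfs_bmbt_irec	*rec,
		void			*priv)
	{
		uint64_t		*nr = priv;

		(*nr)++;
		return 0;
	}

	/* The cursor comes from xfs_bmbt_init_cursor() against the fork. */
	error = xfs_bmap_query_all(cur, xrep_rmap_count_bmbt_recs, &nr);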
This first patch establishes the bare mechanics of finding records and putting together a new ondisk tree; more complex pieces are needed to make it work properly. Link: Documentation/filesystems/xfs-online-fsck-design.rst Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/Makefile | 1 + fs/xfs/libxfs/xfs_ag.h | 1 + fs/xfs/libxfs/xfs_bmap.c | 43 ++ fs/xfs/libxfs/xfs_bmap.h | 8 + fs/xfs/libxfs/xfs_rmap.c | 12 +- fs/xfs/libxfs/xfs_rmap.h | 2 +- fs/xfs/libxfs/xfs_rmap_btree.c | 13 +- fs/xfs/scrub/common.c | 2 +- fs/xfs/scrub/common.h | 1 + fs/xfs/scrub/newbt.c | 12 +- fs/xfs/scrub/newbt.h | 7 + fs/xfs/scrub/reap.c | 2 +- fs/xfs/scrub/repair.c | 5 +- fs/xfs/scrub/repair.h | 6 +- fs/xfs/scrub/rmap.c | 11 +- fs/xfs/scrub/rmap_repair.c | 1466 ++++++++++++++++++++++++++++++++++++++++ fs/xfs/scrub/scrub.c | 2 +- fs/xfs/scrub/trace.h | 33 +- 18 files changed, 1608 insertions(+), 19 deletions(-) create mode 100644 fs/xfs/scrub/rmap_repair.c diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 9a6574da8de5..6de02b2573c3 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -201,6 +201,7 @@ xfs-y += $(addprefix scrub/, \ reap.o \ refcount_repair.o \ repair.o \ + rmap_repair.o \ ) xfs-$(CONFIG_XFS_RT) += $(addprefix scrub/, \ diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h index 29bfa6273dec..e019b79dbbe3 100644 --- a/fs/xfs/libxfs/xfs_ag.h +++ b/fs/xfs/libxfs/xfs_ag.h @@ -90,6 +90,7 @@ struct xfs_perag { uint8_t pagf_repair_bno_level; uint8_t pagf_repair_cnt_level; uint8_t pagf_repair_refcount_level; + uint8_t pagf_repair_rmap_level; #endif spinlock_t pag_state_lock; diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index b96807a53905..06e329221cd5 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -6379,3 +6379,46 @@ xfs_bunmapi_range( out: return error; } + +struct xfs_bmap_query_range { + xfs_bmap_query_range_fn fn; + void *priv; +}; + +/* Format btree record and pass to our callback. */ +STATIC int +xfs_bmap_query_range_helper( + struct xfs_btree_cur *cur, + const union xfs_btree_rec *rec, + void *priv) +{ + struct xfs_bmap_query_range *query = priv; + struct xfs_bmbt_irec irec; + xfs_failaddr_t fa; + + xfs_bmbt_disk_get_all(&rec->bmbt, &irec); + fa = xfs_bmap_validate_extent(cur->bc_ino.ip, cur->bc_ino.whichfork, + &irec); + if (fa) { + xfs_btree_mark_sick(cur); + return xfs_bmap_complain_bad_rec(cur->bc_ino.ip, + cur->bc_ino.whichfork, fa, &irec); + } + + return query->fn(cur, &irec, query->priv); +} + +/* Find all bmaps. 
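+ * Walk the whole bmbt, validating each record before passing it to fn.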
*/ +int +xfs_bmap_query_all( + struct xfs_btree_cur *cur, + xfs_bmap_query_range_fn fn, + void *priv) +{ + struct xfs_bmap_query_range query = { + .priv = priv, + .fn = fn, + }; + + return xfs_btree_query_all(cur, xfs_bmap_query_range_helper, &query); +} diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index f6b73f1bad5f..10b85865204d 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -280,4 +280,12 @@ extern struct kmem_cache *xfs_bmap_intent_cache; int __init xfs_bmap_intent_init_cache(void); void xfs_bmap_intent_destroy_cache(void); +typedef int (*xfs_bmap_query_range_fn)( + struct xfs_btree_cur *cur, + struct xfs_bmbt_irec *rec, + void *priv); + +int xfs_bmap_query_all(struct xfs_btree_cur *cur, xfs_bmap_query_range_fn fn, + void *priv); + #endif /* __XFS_BMAP_H__ */ diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index c9e12533c813..83bc84a87421 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -215,10 +215,10 @@ xfs_rmap_btrec_to_irec( /* Simple checks for rmap records. */ xfs_failaddr_t xfs_rmap_check_irec( - struct xfs_btree_cur *cur, + struct xfs_perag *pag, const struct xfs_rmap_irec *irec) { - struct xfs_mount *mp = cur->bc_mp; + struct xfs_mount *mp = pag->pag_mount; bool is_inode; bool is_unwritten; bool is_bmbt; @@ -233,8 +233,8 @@ xfs_rmap_check_irec( return __this_address; } else { /* check for valid extent range, including overflow */ - if (!xfs_verify_agbext(cur->bc_ag.pag, irec->rm_startblock, - irec->rm_blockcount)) + if (!xfs_verify_agbext(pag, irec->rm_startblock, + irec->rm_blockcount)) return __this_address; } @@ -307,7 +307,7 @@ xfs_rmap_get_rec( fa = xfs_rmap_btrec_to_irec(rec, irec); if (!fa) - fa = xfs_rmap_check_irec(cur, irec); + fa = xfs_rmap_check_irec(cur->bc_ag.pag, irec); if (fa) return xfs_rmap_complain_bad_rec(cur, fa, irec); @@ -2442,7 +2442,7 @@ xfs_rmap_query_range_helper( fa = xfs_rmap_btrec_to_irec(rec, &irec); if (!fa) - fa = xfs_rmap_check_irec(cur, &irec); + fa = xfs_rmap_check_irec(cur->bc_ag.pag, &irec); if (fa) return xfs_rmap_complain_bad_rec(cur, fa, &irec); diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h index 3c98d9d50afb..58c67896d12c 100644 --- a/fs/xfs/libxfs/xfs_rmap.h +++ b/fs/xfs/libxfs/xfs_rmap.h @@ -195,7 +195,7 @@ int xfs_rmap_compare(const struct xfs_rmap_irec *a, union xfs_btree_rec; xfs_failaddr_t xfs_rmap_btrec_to_irec(const union xfs_btree_rec *rec, struct xfs_rmap_irec *irec); -xfs_failaddr_t xfs_rmap_check_irec(struct xfs_btree_cur *cur, +xfs_failaddr_t xfs_rmap_check_irec(struct xfs_perag *pag, const struct xfs_rmap_irec *irec); int xfs_rmap_has_records(struct xfs_btree_cur *cur, xfs_agblock_t bno, diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index 0751268c102c..815e34e295dd 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -342,7 +342,18 @@ xfs_rmapbt_verify( level = be16_to_cpu(block->bb_level); if (pag && xfs_perag_initialised_agf(pag)) { - if (level >= pag->pagf_rmap_level) + unsigned int maxlevel = pag->pagf_rmap_level; + +#ifdef CONFIG_XFS_ONLINE_REPAIR + /* + * Online repair could be rewriting the free space btrees, so + * we'll validate against the larger of either tree while this + * is going on. 
+ */ + maxlevel = max_t(unsigned int, maxlevel, + pag->pagf_repair_rmap_level); +#endif + if (level >= maxlevel) return __this_address; } else if (level >= mp->m_rmap_maxlevels) return __this_address; diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index 70746a7db954..dea883632b2d 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -460,7 +460,7 @@ xchk_perag_read_headers( * Grab the AG headers for the attached perag structure and wait for pending * intents to drain. */ -static int +int xchk_perag_drain_and_lock( struct xfs_scrub *sc) { diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h index 529a510dc76f..89f7bbec887e 100644 --- a/fs/xfs/scrub/common.h +++ b/fs/xfs/scrub/common.h @@ -134,6 +134,7 @@ int xchk_setup_nlinks(struct xfs_scrub *sc); void xchk_ag_free(struct xfs_scrub *sc, struct xchk_ag *sa); int xchk_ag_init(struct xfs_scrub *sc, xfs_agnumber_t agno, struct xchk_ag *sa); +int xchk_perag_drain_and_lock(struct xfs_scrub *sc); /* * Grab all AG resources, treating the inability to grab the perag structure as diff --git a/fs/xfs/scrub/newbt.c b/fs/xfs/scrub/newbt.c index 608d7ab01d89..4a0271123d94 100644 --- a/fs/xfs/scrub/newbt.c +++ b/fs/xfs/scrub/newbt.c @@ -239,7 +239,11 @@ xrep_newbt_alloc_ag_blocks( xrep_newbt_validate_ag_alloc_hint(xnr); - error = xfs_alloc_vextent_near_bno(&args, xnr->alloc_hint); + if (xnr->alloc_vextent) + error = xnr->alloc_vextent(sc, &args, xnr->alloc_hint); + else + error = xfs_alloc_vextent_near_bno(&args, + xnr->alloc_hint); if (error) return error; if (args.fsbno == NULLFSBLOCK) @@ -309,7 +313,11 @@ xrep_newbt_alloc_file_blocks( xrep_newbt_validate_file_alloc_hint(xnr); - error = xfs_alloc_vextent_start_ag(&args, xnr->alloc_hint); + if (xnr->alloc_vextent) + error = xnr->alloc_vextent(sc, &args, xnr->alloc_hint); + else + error = xfs_alloc_vextent_start_ag(&args, + xnr->alloc_hint); if (error) return error; if (args.fsbno == NULLFSBLOCK) diff --git a/fs/xfs/scrub/newbt.h b/fs/xfs/scrub/newbt.h index 89f8e3970b1f..3d804d31af24 100644 --- a/fs/xfs/scrub/newbt.h +++ b/fs/xfs/scrub/newbt.h @@ -6,6 +6,8 @@ #ifndef __XFS_SCRUB_NEWBT_H__ #define __XFS_SCRUB_NEWBT_H__ +struct xfs_alloc_arg; + struct xrep_newbt_resv { /* Link to list of extents that we've reserved. */ struct list_head list; @@ -28,6 +30,11 @@ struct xrep_newbt_resv { struct xrep_newbt { struct xfs_scrub *sc; + /* Custom allocation function, or NULL for xfs_alloc_vextent */ + int (*alloc_vextent)(struct xfs_scrub *sc, + struct xfs_alloc_arg *args, + xfs_fsblock_t alloc_hint); + /* List of extents that we've reserved. */ struct list_head resv_list; diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c index f99eca799809..0252a3b5b65a 100644 --- a/fs/xfs/scrub/reap.c +++ b/fs/xfs/scrub/reap.c @@ -114,7 +114,7 @@ xreap_put_freelist( int error; /* Make sure there's space on the freelist. */ - error = xrep_fix_freelist(sc, true); + error = xrep_fix_freelist(sc, 0); if (error) return error; diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index d1a21f380abe..0c56dafd9ae4 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -401,7 +401,7 @@ xrep_calc_ag_resblks( int xrep_fix_freelist( struct xfs_scrub *sc, - bool can_shrink) + int alloc_flags) { struct xfs_alloc_arg args = {0}; @@ -411,8 +411,7 @@ xrep_fix_freelist( args.alignment = 1; args.pag = sc->sa.pag; - return xfs_alloc_fix_freelist(&args, - can_shrink ? 
0 : XFS_ALLOC_FLAG_NOSHRINK); + return xfs_alloc_fix_freelist(&args, alloc_flags); } /* diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h index 2ff2bb79c540..c01e56799bd1 100644 --- a/fs/xfs/scrub/repair.h +++ b/fs/xfs/scrub/repair.h @@ -51,7 +51,7 @@ struct xbitmap; struct xagb_bitmap; struct xfsb_bitmap; -int xrep_fix_freelist(struct xfs_scrub *sc, bool can_shrink); +int xrep_fix_freelist(struct xfs_scrub *sc, int alloc_flags); struct xrep_find_ag_btree { /* in: rmap owner of the btree we're looking for */ @@ -86,6 +86,7 @@ int xrep_ino_ensure_extent_count(struct xfs_scrub *sc, int whichfork, int xrep_reset_perag_resv(struct xfs_scrub *sc); int xrep_bmap(struct xfs_scrub *sc, int whichfork, bool allow_unwritten); int xrep_metadata_inode_forks(struct xfs_scrub *sc); +int xrep_setup_ag_rmapbt(struct xfs_scrub *sc); /* Repair setup functions */ int xrep_setup_ag_allocbt(struct xfs_scrub *sc); @@ -111,6 +112,7 @@ int xrep_agfl(struct xfs_scrub *sc); int xrep_agi(struct xfs_scrub *sc); int xrep_allocbt(struct xfs_scrub *sc); int xrep_iallocbt(struct xfs_scrub *sc); +int xrep_rmapbt(struct xfs_scrub *sc); int xrep_refcountbt(struct xfs_scrub *sc); int xrep_inode(struct xfs_scrub *sc); int xrep_bmap_data(struct xfs_scrub *sc); @@ -177,6 +179,7 @@ xrep_setup_nothing( return 0; } #define xrep_setup_ag_allocbt xrep_setup_nothing +#define xrep_setup_ag_rmapbt xrep_setup_nothing #define xrep_setup_inode(sc, imap) ((void)0) @@ -190,6 +193,7 @@ xrep_setup_nothing( #define xrep_agi xrep_notsupported #define xrep_allocbt xrep_notsupported #define xrep_iallocbt xrep_notsupported +#define xrep_rmapbt xrep_notsupported #define xrep_refcountbt xrep_notsupported #define xrep_inode xrep_notsupported #define xrep_bmap_data xrep_notsupported diff --git a/fs/xfs/scrub/rmap.c b/fs/xfs/scrub/rmap.c index 5afe6650ed6c..ba5bbc3fb754 100644 --- a/fs/xfs/scrub/rmap.c +++ b/fs/xfs/scrub/rmap.c @@ -25,6 +25,7 @@ #include "scrub/btree.h" #include "scrub/bitmap.h" #include "scrub/agb_bitmap.h" +#include "scrub/repair.h" /* * Set us up to scrub reverse mapping btrees. @@ -36,6 +37,14 @@ xchk_setup_ag_rmapbt( if (xchk_need_intent_drain(sc)) xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN); + if (xchk_could_repair(sc)) { + int error; + + error = xrep_setup_ag_rmapbt(sc); + if (error) + return error; + } + return xchk_setup_ag_btree(sc, false); } @@ -349,7 +358,7 @@ xchk_rmapbt_rec( struct xfs_rmap_irec irec; if (xfs_rmap_btrec_to_irec(rec, &irec) != NULL || - xfs_rmap_check_irec(bs->cur, &irec) != NULL) { + xfs_rmap_check_irec(bs->cur->bc_ag.pag, &irec) != NULL) { xchk_btree_set_corrupt(bs->sc, bs->cur, 0); return 0; } diff --git a/fs/xfs/scrub/rmap_repair.c b/fs/xfs/scrub/rmap_repair.c new file mode 100644 index 000000000000..120efb49bbfe --- /dev/null +++ b/fs/xfs/scrub/rmap_repair.c @@ -0,0 +1,1466 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2018-2024 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_btree_staging.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_alloc.h" +#include "xfs_alloc_btree.h" +#include "xfs_ialloc.h" +#include "xfs_ialloc_btree.h" +#include "xfs_rmap.h" +#include "xfs_rmap_btree.h" +#include "xfs_inode.h" +#include "xfs_icache.h" +#include "xfs_bmap.h" +#include "xfs_bmap_btree.h" +#include "xfs_refcount.h" +#include "xfs_refcount_btree.h" +#include "xfs_ag.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/btree.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/bitmap.h" +#include "scrub/agb_bitmap.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" +#include "scrub/iscan.h" +#include "scrub/newbt.h" +#include "scrub/reap.h" + +/* + * Reverse Mapping Btree Repair + * ============================ + * + * This is the most involved of all the AG space btree rebuilds. Everywhere + * else in XFS we lock inodes and then AG data structures, but generating the + * list of rmap records requires that we be able to scan both block mapping + * btrees of every inode in the filesystem to see if it owns any extents in + * this AG. We can't tolerate any inode updates while we do this, so we + * freeze the filesystem to lock everyone else out, and grant ourselves + * special privileges to run transactions with regular background reclamation + * turned off. + * + * We also have to be very careful not to allow inode reclaim to start a + * transaction because all transactions (other than our own) will block. + * Deferred inode inactivation helps us out there. + * + * I) Reverse mappings for all non-space metadata and file data are collected + * according to the following algorithm: + * + * 1. For each fork of each inode: + * 1.1. Create a bitmap BMBIT to track bmbt blocks if necessary. + * 1.2. If the incore extent map isn't loaded, walk the bmbt to accumulate + * bmaps into rmap records (see 1.1.4). Set bits in BMBIT for each btree + * block. + * 1.3. If the incore extent map is loaded but the fork is in btree format, + * just visit the bmbt blocks to set the corresponding BMBIT areas. + * 1.4. From the incore extent map, accumulate each bmap that falls into our + * target AG. Remember, multiple bmap records can map to a single rmap + * record, so we cannot simply emit rmap records 1:1. + * 1.5. Emit rmap records for each extent in BMBIT and free it. + * 2. Create bitmaps INOBIT and ICHUNKBIT. + * 3. For each record in the inobt, set the corresponding areas in ICHUNKBIT, + * and set bits in INOBIT for each btree block. If the inobt has no records + * at all, we must be careful to record its root in INOBIT. + * 4. For each block in the finobt, set the corresponding INOBIT area. + * 5. Emit rmap records for each extent in INOBIT and ICHUNKBIT and free them. + * 6. Create bitmaps REFCBIT and COWBIT. + * 7. For each CoW staging extent in the refcountbt, set the corresponding + * areas in COWBIT. + * 8. For each block in the refcountbt, set the corresponding REFCBIT area. + * 9. Emit rmap records for each extent in REFCBIT and COWBIT and free them. + * A. Emit rmap for the AG headers. + * B. Emit rmap for the log, if there is one. + * + * II) The rmapbt shape and space metadata rmaps are computed as follows: + * + * 1. 
Count the rmaps collected in the previous step. (= NR) + * 2. Estimate the number of rmapbt blocks needed to store NR records. (= RMB) + * 3. Reserve RMB blocks through the newbt using the allocator in normap mode. + * 4. Create bitmap AGBIT. + * 5. For each reservation in the newbt, set the corresponding areas in AGBIT. + * 6. For each block in the AGFL, bnobt, and cntbt, set the bits in AGBIT. + * 7. Count the extents in AGBIT. (= AGNR) + * 8. Estimate the number of rmapbt blocks needed for NR + AGNR rmaps. (= RMB') + * 9. If RMB' >= RMB, reserve RMB' - RMB more newbt blocks, set RMB = RMB', + * and clear AGBIT. Go to step 5. + * A. Emit rmaps for each extent in AGBIT. + * + * III) The rmapbt is constructed and set in place as follows: + * + * 1. Sort the rmap records. + * 2. Bulk load the rmaps. + * + * IV) Reap the old btree blocks. + * + * 1. Create a bitmap OLDRMBIT. + * 2. For each gap in the new rmapbt, set the corresponding areas of OLDRMBIT. + * 3. For each extent in the bnobt, clear the corresponding parts of OLDRMBIT. + * 4. Reap the extents corresponding to the set areas in OLDRMBIT. These are + * the parts of the AG that the rmap didn't find during its scan of the + * primary metadata and aren't known to be in the free space, which implies + * that they were the old rmapbt blocks. + * 5. Commit. + * + * We use the 'xrep_rmap' prefix for all the rmap functions. + */ + +/* + * Packed rmap record. The ATTR/BMBT/UNWRITTEN flags are hidden in the upper + * bits of offset, just like the on-disk record. + */ +struct xrep_rmap_extent { + xfs_agblock_t startblock; + xfs_extlen_t blockcount; + uint64_t owner; + uint64_t offset; +} __packed; + +/* Context for collecting rmaps */ +struct xrep_rmap { + /* new rmapbt information */ + struct xrep_newbt new_btree; + + /* rmap records generated from primary metadata */ + struct xfarray *rmap_records; + + struct xfs_scrub *sc; + + /* get_records()'s position in the rmap record array. */ + xfarray_idx_t array_cur; + + /* inode scan cursor */ + struct xchk_iscan iscan; + + /* bnobt/cntbt contribution to btreeblks */ + xfs_agblock_t freesp_btblocks; + + /* old agf_rmap_blocks counter */ + unsigned int old_rmapbt_fsbcount; +}; + +/* Set us up to repair reverse mapping btrees. */ +int +xrep_setup_ag_rmapbt( + struct xfs_scrub *sc) +{ + struct xrep_rmap *rr; + + rr = kzalloc(sizeof(struct xrep_rmap), XCHK_GFP_FLAGS); + if (!rr) + return -ENOMEM; + + rr->sc = sc; + sc->buf = rr; + return 0; +} + +/* Make sure there's nothing funny about this mapping. */ +STATIC int +xrep_rmap_check_mapping( + struct xfs_scrub *sc, + const struct xfs_rmap_irec *rec) +{ + enum xbtree_recpacking outcome; + int error; + + if (xfs_rmap_check_irec(sc->sa.pag, rec) != NULL) + return -EFSCORRUPTED; + + /* Make sure this isn't free space. */ + error = xfs_alloc_has_records(sc->sa.bno_cur, rec->rm_startblock, + rec->rm_blockcount, &outcome); + if (error) + return error; + if (outcome != XBTREE_RECPACKING_EMPTY) + return -EFSCORRUPTED; + + return 0; +} + +/* Store a reverse-mapping record. 
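The __packed record above uses the same trick as the on-disk rmap record: the 64-bit offset field has spare high bits, so the ATTR/BMBT/UNWRITTEN flags are folded into it by xfs_rmap_irec_offset_pack(). A sketch of that encoding, assuming the usual on-disk rm_offset layout (file offset in the low bits, flags in the top bits); treat the exact bit positions as illustrative:

/* Illustrative restatement of xfs_rmap_irec_offset_pack(). */
static inline uint64_t
rmap_offset_pack_sketch(
	uint64_t	offset,
	unsigned int	flags)
{
	uint64_t	x = offset & XFS_RMAP_OFF_MASK;

	if (flags & XFS_RMAP_ATTR_FORK)
		x |= XFS_RMAP_OFF_ATTR_FORK;
	if (flags & XFS_RMAP_BMBT_BLOCK)
		x |= XFS_RMAP_OFF_BMBT_BLOCK;
	if (flags & XFS_RMAP_UNWRITTEN)
		x |= XFS_RMAP_OFF_UNWRITTEN;
	return x;
}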
*/ +static inline int +xrep_rmap_stash( + struct xrep_rmap *rr, + xfs_agblock_t startblock, + xfs_extlen_t blockcount, + uint64_t owner, + uint64_t offset, + unsigned int flags) +{ + struct xrep_rmap_extent rre = { + .startblock = startblock, + .blockcount = blockcount, + .owner = owner, + }; + struct xfs_rmap_irec rmap = { + .rm_startblock = startblock, + .rm_blockcount = blockcount, + .rm_owner = owner, + .rm_offset = offset, + .rm_flags = flags, + }; + struct xfs_scrub *sc = rr->sc; + int error = 0; + + if (xchk_should_terminate(sc, &error)) + return error; + + trace_xrep_rmap_found(sc->mp, sc->sa.pag->pag_agno, &rmap); + + rre.offset = xfs_rmap_irec_offset_pack(&rmap); + return xfarray_append(rr->rmap_records, &rre); +} + +struct xrep_rmap_stash_run { + struct xrep_rmap *rr; + uint64_t owner; + unsigned int rmap_flags; +}; + +static int +xrep_rmap_stash_run( + uint32_t start, + uint32_t len, + void *priv) +{ + struct xrep_rmap_stash_run *rsr = priv; + struct xrep_rmap *rr = rsr->rr; + + return xrep_rmap_stash(rr, start, len, rsr->owner, 0, rsr->rmap_flags); +} + +/* + * Emit rmaps for every extent of bits set in the bitmap. Caller must ensure + * that the ranges are in units of FS blocks. + */ +STATIC int +xrep_rmap_stash_bitmap( + struct xrep_rmap *rr, + struct xagb_bitmap *bitmap, + const struct xfs_owner_info *oinfo) +{ + struct xrep_rmap_stash_run rsr = { + .rr = rr, + .owner = oinfo->oi_owner, + .rmap_flags = 0, + }; + + if (oinfo->oi_flags & XFS_OWNER_INFO_ATTR_FORK) + rsr.rmap_flags |= XFS_RMAP_ATTR_FORK; + if (oinfo->oi_flags & XFS_OWNER_INFO_BMBT_BLOCK) + rsr.rmap_flags |= XFS_RMAP_BMBT_BLOCK; + + return xagb_bitmap_walk(bitmap, xrep_rmap_stash_run, &rsr); +} + +/* Section (I): Finding all file and bmbt extents. */ + +/* Context for accumulating rmaps for an inode fork. */ +struct xrep_rmap_ifork { + /* + * Accumulate rmap data here to turn multiple adjacent bmaps into a + * single rmap. + */ + struct xfs_rmap_irec accum; + + /* Bitmap of bmbt blocks in this AG. */ + struct xagb_bitmap bmbt_blocks; + + struct xrep_rmap *rr; + + /* Which inode fork? */ + int whichfork; +}; + +/* Stash an rmap that we accumulated while walking an inode fork. */ +STATIC int +xrep_rmap_stash_accumulated( + struct xrep_rmap_ifork *rf) +{ + if (rf->accum.rm_blockcount == 0) + return 0; + + return xrep_rmap_stash(rf->rr, rf->accum.rm_startblock, + rf->accum.rm_blockcount, rf->accum.rm_owner, + rf->accum.rm_offset, rf->accum.rm_flags); +} + +/* Accumulate a bmbt record. */ +STATIC int +xrep_rmap_visit_bmbt( + struct xfs_btree_cur *cur, + struct xfs_bmbt_irec *rec, + void *priv) +{ + struct xrep_rmap_ifork *rf = priv; + struct xfs_mount *mp = rf->rr->sc->mp; + struct xfs_rmap_irec *accum = &rf->accum; + xfs_agblock_t agbno; + unsigned int rmap_flags = 0; + int error; + + if (XFS_FSB_TO_AGNO(mp, rec->br_startblock) != + rf->rr->sc->sa.pag->pag_agno) + return 0; + + agbno = XFS_FSB_TO_AGBNO(mp, rec->br_startblock); + if (rf->whichfork == XFS_ATTR_FORK) + rmap_flags |= XFS_RMAP_ATTR_FORK; + if (rec->br_state == XFS_EXT_UNWRITTEN) + rmap_flags |= XFS_RMAP_UNWRITTEN; + + /* If this bmap is adjacent to the previous one, just add it. */ + if (accum->rm_blockcount > 0 && + rec->br_startoff == accum->rm_offset + accum->rm_blockcount && + agbno == accum->rm_startblock + accum->rm_blockcount && + rmap_flags == accum->rm_flags) { + accum->rm_blockcount += rec->br_blockcount; + return 0; + } + + /* Otherwise stash the old rmap and start accumulating a new one. 
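The merge test in xrep_rmap_visit_bmbt() requires logical, physical, and flag contiguity all at once; failing any one of the three flushes the accumulated rmap first. The predicate pulled out for clarity (a restatement of the checks in the function, not a helper the patch adds):

static inline bool
rmap_accum_can_extend(
	const struct xfs_rmap_irec	*accum,
	xfs_fileoff_t			startoff,
	xfs_agblock_t			agbno,
	unsigned int			flags)
{
	return accum->rm_blockcount > 0 &&
	       startoff == accum->rm_offset + accum->rm_blockcount &&
	       agbno == accum->rm_startblock + accum->rm_blockcount &&
	       flags == accum->rm_flags;
}

/*
 * Example: bmaps (startoff 100, agbno 500, len 8) and (108, 508, 4)
 * merge into one rmap (offset 100, agbno 500, len 12).  A third bmap
 * at (112, 600, 4) breaks physical contiguity, so the 12-block rmap
 * is stashed and accumulation restarts at agbno 600.
 */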
*/ + error = xrep_rmap_stash_accumulated(rf); + if (error) + return error; + + accum->rm_startblock = agbno; + accum->rm_blockcount = rec->br_blockcount; + accum->rm_offset = rec->br_startoff; + accum->rm_flags = rmap_flags; + return 0; +} + +/* Add a btree block to the bitmap. */ +STATIC int +xrep_rmap_visit_iroot_btree_block( + struct xfs_btree_cur *cur, + int level, + void *priv) +{ + struct xrep_rmap_ifork *rf = priv; + struct xfs_buf *bp; + xfs_fsblock_t fsbno; + xfs_agblock_t agbno; + + xfs_btree_get_block(cur, level, &bp); + if (!bp) + return 0; + + fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp)); + if (XFS_FSB_TO_AGNO(cur->bc_mp, fsbno) != rf->rr->sc->sa.pag->pag_agno) + return 0; + + agbno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno); + return xagb_bitmap_set(&rf->bmbt_blocks, agbno, 1); +} + +/* + * Iterate a metadata btree rooted in an inode to collect rmap records for + * anything in this fork that matches the AG. + */ +STATIC int +xrep_rmap_scan_iroot_btree( + struct xrep_rmap_ifork *rf, + struct xfs_btree_cur *cur) +{ + struct xfs_owner_info oinfo; + struct xrep_rmap *rr = rf->rr; + int error; + + xagb_bitmap_init(&rf->bmbt_blocks); + + /* Record all the blocks in the btree itself. */ + error = xfs_btree_visit_blocks(cur, xrep_rmap_visit_iroot_btree_block, + XFS_BTREE_VISIT_ALL, rf); + if (error) + goto out; + + /* Emit rmaps for the btree blocks. */ + xfs_rmap_ino_bmbt_owner(&oinfo, rf->accum.rm_owner, rf->whichfork); + error = xrep_rmap_stash_bitmap(rr, &rf->bmbt_blocks, &oinfo); + if (error) + goto out; + + /* Stash any remaining accumulated rmaps. */ + error = xrep_rmap_stash_accumulated(rf); +out: + xagb_bitmap_destroy(&rf->bmbt_blocks); + return error; +} + +static inline bool +is_rt_data_fork( + struct xfs_inode *ip, + int whichfork) +{ + return XFS_IS_REALTIME_INODE(ip) && whichfork == XFS_DATA_FORK; +} + +/* + * Iterate the block mapping btree to collect rmap records for anything in this + * fork that matches the AG. Sets @mappings_done to true if we've scanned the + * block mappings in this fork. + */ +STATIC int +xrep_rmap_scan_bmbt( + struct xrep_rmap_ifork *rf, + struct xfs_inode *ip, + bool *mappings_done) +{ + struct xrep_rmap *rr = rf->rr; + struct xfs_btree_cur *cur; + struct xfs_ifork *ifp; + int error; + + *mappings_done = false; + ifp = xfs_ifork_ptr(ip, rf->whichfork); + cur = xfs_bmbt_init_cursor(rr->sc->mp, rr->sc->tp, ip, rf->whichfork); + + if (!xfs_ifork_is_realtime(ip, rf->whichfork) && + xfs_need_iread_extents(ifp)) { + /* + * If the incore extent cache isn't loaded, scan the bmbt for + * mapping records. This avoids loading the incore extent + * tree, which will increase memory pressure at a time when + * we're trying to run as quickly as we possibly can. Ignore + * realtime extents. + */ + error = xfs_bmap_query_all(cur, xrep_rmap_visit_bmbt, rf); + if (error) + goto out_cur; + + *mappings_done = true; + } + + /* Scan for the bmbt blocks, which always live on the data device. */ + error = xrep_rmap_scan_iroot_btree(rf, cur); +out_cur: + xfs_btree_del_cursor(cur, error); + return error; +} + +/* + * Iterate the in-core extent cache to collect rmap records for anything in + * this fork that matches the AG. 
+ */ +STATIC int +xrep_rmap_scan_iext( + struct xrep_rmap_ifork *rf, + struct xfs_ifork *ifp) +{ + struct xfs_bmbt_irec rec; + struct xfs_iext_cursor icur; + int error; + + for_each_xfs_iext(ifp, &icur, &rec) { + if (isnullstartblock(rec.br_startblock)) + continue; + error = xrep_rmap_visit_bmbt(NULL, &rec, rf); + if (error) + return error; + } + + return xrep_rmap_stash_accumulated(rf); +} + +/* Find all the extents from a given AG in an inode fork. */ +STATIC int +xrep_rmap_scan_ifork( + struct xrep_rmap *rr, + struct xfs_inode *ip, + int whichfork) +{ + struct xrep_rmap_ifork rf = { + .accum = { .rm_owner = ip->i_ino, }, + .rr = rr, + .whichfork = whichfork, + }; + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); + int error = 0; + + if (!ifp) + return 0; + + if (ifp->if_format == XFS_DINODE_FMT_BTREE) { + bool mappings_done; + + /* + * Scan the bmap btree for data device mappings. This includes + * the btree blocks themselves, even if this is a realtime + * file. + */ + error = xrep_rmap_scan_bmbt(&rf, ip, &mappings_done); + if (error || mappings_done) + return error; + } else if (ifp->if_format != XFS_DINODE_FMT_EXTENTS) { + return 0; + } + + /* Scan incore extent cache if this isn't a realtime file. */ + if (xfs_ifork_is_realtime(ip, whichfork)) + return 0; + + return xrep_rmap_scan_iext(&rf, ifp); +} + +/* + * Take ILOCK on a file that we want to scan. + * + * Select ILOCK_EXCL if the file has an unloaded data bmbt or has an unloaded + * attr bmbt. Otherwise, take ILOCK_SHARED. + */ +static inline unsigned int +xrep_rmap_scan_ilock( + struct xfs_inode *ip) +{ + uint lock_mode = XFS_ILOCK_SHARED; + + if (xfs_need_iread_extents(&ip->i_df)) { + lock_mode = XFS_ILOCK_EXCL; + goto lock; + } + + if (xfs_inode_has_attr_fork(ip) && xfs_need_iread_extents(&ip->i_af)) + lock_mode = XFS_ILOCK_EXCL; + +lock: + xfs_ilock(ip, lock_mode); + return lock_mode; +} + +/* Record reverse mappings for a file. */ +STATIC int +xrep_rmap_scan_inode( + struct xrep_rmap *rr, + struct xfs_inode *ip) +{ + unsigned int lock_mode = 0; + int error; + + /* + * Directory updates (create/link/unlink/rename) drop the directory's + * ILOCK before finishing any rmapbt updates associated with directory + * shape changes. For this scan to coordinate correctly with the live + * update hook, we must take the only lock (i_rwsem) that is held all + * the way to dir op completion. This will get fixed by the parent + * pointer patchset. + */ + if (S_ISDIR(VFS_I(ip)->i_mode)) { + lock_mode = XFS_IOLOCK_SHARED; + xfs_ilock(ip, lock_mode); + } + lock_mode |= xrep_rmap_scan_ilock(ip); + + /* Check the data fork. */ + error = xrep_rmap_scan_ifork(rr, ip, XFS_DATA_FORK); + if (error) + goto out_unlock; + + /* Check the attr fork. */ + error = xrep_rmap_scan_ifork(rr, ip, XFS_ATTR_FORK); + if (error) + goto out_unlock; + + /* COW fork extents are "owned" by the refcount btree. */ + + xchk_iscan_mark_visited(&rr->iscan, ip); +out_unlock: + xfs_iunlock(ip, lock_mode); + return error; +} + +/* Section (I): Find all AG metadata extents except for free space metadata. */ + +struct xrep_rmap_inodes { + struct xrep_rmap *rr; + struct xagb_bitmap inobt_blocks; /* INOBIT */ + struct xagb_bitmap ichunk_blocks; /* ICHUNKBIT */ +}; + +/* Record inode btree rmaps. 
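For reference, the lock selection above reduces to a small decision table; only forks whose extent lists still need to be read in force the exclusive lock, because xfs_iread_extents() modifies the fork.

/*
 * Lock choice for scanning one inode:
 *
 *   data fork loaded, attr fork absent or loaded -> XFS_ILOCK_SHARED
 *   data fork not loaded                         -> XFS_ILOCK_EXCL
 *   attr fork present but not loaded             -> XFS_ILOCK_EXCL
 *
 * Directories additionally take XFS_IOLOCK_SHARED first, since dir
 * ops drop the ILOCK before their rmapbt updates finish; i_rwsem is
 * the only lock held all the way to dir op completion.
 */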
*/ +STATIC int +xrep_rmap_walk_inobt( + struct xfs_btree_cur *cur, + const union xfs_btree_rec *rec, + void *priv) +{ + struct xfs_inobt_rec_incore irec; + struct xrep_rmap_inodes *ri = priv; + struct xfs_mount *mp = cur->bc_mp; + xfs_agblock_t agbno; + xfs_extlen_t aglen; + xfs_agino_t agino; + xfs_agino_t iperhole; + unsigned int i; + int error; + + /* Record the inobt blocks. */ + error = xagb_bitmap_set_btcur_path(&ri->inobt_blocks, cur); + if (error) + return error; + + xfs_inobt_btrec_to_irec(mp, rec, &irec); + if (xfs_inobt_check_irec(cur->bc_ag.pag, &irec) != NULL) + return -EFSCORRUPTED; + + agino = irec.ir_startino; + + /* Record a non-sparse inode chunk. */ + if (!xfs_inobt_issparse(irec.ir_holemask)) { + agbno = XFS_AGINO_TO_AGBNO(mp, agino); + aglen = max_t(xfs_extlen_t, 1, + XFS_INODES_PER_CHUNK / mp->m_sb.sb_inopblock); + + return xagb_bitmap_set(&ri->ichunk_blocks, agbno, aglen); + } + + /* Iterate each chunk. */ + iperhole = max_t(xfs_agino_t, mp->m_sb.sb_inopblock, + XFS_INODES_PER_HOLEMASK_BIT); + aglen = iperhole / mp->m_sb.sb_inopblock; + for (i = 0, agino = irec.ir_startino; + i < XFS_INOBT_HOLEMASK_BITS; + i += iperhole / XFS_INODES_PER_HOLEMASK_BIT, agino += iperhole) { + /* Skip holes. */ + if (irec.ir_holemask & (1 << i)) + continue; + + /* Record the inode chunk otherwise. */ + agbno = XFS_AGINO_TO_AGBNO(mp, agino); + error = xagb_bitmap_set(&ri->ichunk_blocks, agbno, aglen); + if (error) + return error; + } + + return 0; +} + +/* Collect rmaps for the blocks containing inode btrees and the inode chunks. */ +STATIC int +xrep_rmap_find_inode_rmaps( + struct xrep_rmap *rr) +{ + struct xrep_rmap_inodes ri = { + .rr = rr, + }; + struct xfs_scrub *sc = rr->sc; + int error; + + xagb_bitmap_init(&ri.inobt_blocks); + xagb_bitmap_init(&ri.ichunk_blocks); + + /* + * Iterate every record in the inobt so we can capture all the inode + * chunks and the blocks in the inobt itself. + */ + error = xfs_btree_query_all(sc->sa.ino_cur, xrep_rmap_walk_inobt, &ri); + if (error) + goto out_bitmap; + + /* + * Note that if there are zero records in the inobt then query_all does + * nothing and we have to account the empty inobt root manually. + */ + if (xagb_bitmap_empty(&ri.ichunk_blocks)) { + struct xfs_agi *agi = sc->sa.agi_bp->b_addr; + + error = xagb_bitmap_set(&ri.inobt_blocks, + be32_to_cpu(agi->agi_root), 1); + if (error) + goto out_bitmap; + } + + /* Scan the finobt too. */ + if (xfs_has_finobt(sc->mp)) { + error = xagb_bitmap_set_btblocks(&ri.inobt_blocks, + sc->sa.fino_cur); + if (error) + goto out_bitmap; + } + + /* Generate rmaps for everything. */ + error = xrep_rmap_stash_bitmap(rr, &ri.inobt_blocks, + &XFS_RMAP_OINFO_INOBT); + if (error) + goto out_bitmap; + error = xrep_rmap_stash_bitmap(rr, &ri.ichunk_blocks, + &XFS_RMAP_OINFO_INODES); + +out_bitmap: + xagb_bitmap_destroy(&ri.inobt_blocks); + xagb_bitmap_destroy(&ri.ichunk_blocks); + return error; +} + +/* Record a CoW staging extent. */ +STATIC int +xrep_rmap_walk_cowblocks( + struct xfs_btree_cur *cur, + const struct xfs_refcount_irec *irec, + void *priv) +{ + struct xagb_bitmap *bitmap = priv; + + if (!xfs_refcount_check_domain(irec) || + irec->rc_domain != XFS_REFC_DOMAIN_COW) + return -EFSCORRUPTED; + + return xagb_bitmap_set(bitmap, irec->rc_startblock, irec->rc_blockcount); +} + +/* + * Collect rmaps for the blocks containing the refcount btree, and all CoW + * staging extents. 
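A worked example of the holemask walk may help; the numbers assume 4096-byte blocks and 512-byte inodes (sb_inopblock == 8), a common geometry but not the only one.

/*
 * With sb_inopblock == 8, a 64-inode chunk spans 8 fs blocks:
 *
 *   non-sparse chunk: aglen = max(1, 64 / 8) = 8, one 8-block rmap.
 *
 *   sparse chunk: iperhole = max(8, XFS_INODES_PER_HOLEMASK_BIT) = 8
 *   inodes, i.e. one block per step; aglen = 8 / 8 = 1; and the loop
 *   advances 2 holemask bits (8 inodes / 4 inodes per bit) at a time,
 *   emitting a 1-block rmap for each block-sized run of inodes that
 *   is not marked as a hole.
 */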
+ */ +STATIC int +xrep_rmap_find_refcount_rmaps( + struct xrep_rmap *rr) +{ + struct xagb_bitmap refcountbt_blocks; /* REFCBIT */ + struct xagb_bitmap cow_blocks; /* COWBIT */ + struct xfs_refcount_irec low = { + .rc_startblock = 0, + .rc_domain = XFS_REFC_DOMAIN_COW, + }; + struct xfs_refcount_irec high = { + .rc_startblock = -1U, + .rc_domain = XFS_REFC_DOMAIN_COW, + }; + struct xfs_scrub *sc = rr->sc; + int error; + + if (!xfs_has_reflink(sc->mp)) + return 0; + + xagb_bitmap_init(&refcountbt_blocks); + xagb_bitmap_init(&cow_blocks); + + /* refcountbt */ + error = xagb_bitmap_set_btblocks(&refcountbt_blocks, sc->sa.refc_cur); + if (error) + goto out_bitmap; + + /* Collect rmaps for CoW staging extents. */ + error = xfs_refcount_query_range(sc->sa.refc_cur, &low, &high, + xrep_rmap_walk_cowblocks, &cow_blocks); + if (error) + goto out_bitmap; + + /* Generate rmaps for everything. */ + error = xrep_rmap_stash_bitmap(rr, &cow_blocks, &XFS_RMAP_OINFO_COW); + if (error) + goto out_bitmap; + error = xrep_rmap_stash_bitmap(rr, &refcountbt_blocks, + &XFS_RMAP_OINFO_REFC); + +out_bitmap: + xagb_bitmap_destroy(&cow_blocks); + xagb_bitmap_destroy(&refcountbt_blocks); + return error; +} + +/* Generate rmaps for the AG headers (AGI/AGF/AGFL) */ +STATIC int +xrep_rmap_find_agheader_rmaps( + struct xrep_rmap *rr) +{ + struct xfs_scrub *sc = rr->sc; + + /* Create a record for the AG sb->agfl. */ + return xrep_rmap_stash(rr, XFS_SB_BLOCK(sc->mp), + XFS_AGFL_BLOCK(sc->mp) - XFS_SB_BLOCK(sc->mp) + 1, + XFS_RMAP_OWN_FS, 0, 0); +} + +/* Generate rmaps for the log, if it's in this AG. */ +STATIC int +xrep_rmap_find_log_rmaps( + struct xrep_rmap *rr) +{ + struct xfs_scrub *sc = rr->sc; + + if (!xfs_ag_contains_log(sc->mp, sc->sa.pag->pag_agno)) + return 0; + + return xrep_rmap_stash(rr, + XFS_FSB_TO_AGBNO(sc->mp, sc->mp->m_sb.sb_logstart), + sc->mp->m_sb.sb_logblocks, XFS_RMAP_OWN_LOG, 0, 0); +} + +/* + * Generate all the reverse-mappings for this AG, a list of the old rmapbt + * blocks, and the new btreeblks count. Figure out if we have enough free + * space to reconstruct the inode btrees. The caller must clean up the lists + * if anything goes wrong. This implements section (I) above. + */ +STATIC int +xrep_rmap_find_rmaps( + struct xrep_rmap *rr) +{ + struct xfs_scrub *sc = rr->sc; + struct xchk_ag *sa = &sc->sa; + struct xfs_inode *ip; + int error; + + /* Find all the per-AG metadata. */ + xrep_ag_btcur_init(sc, &sc->sa); + + error = xrep_rmap_find_inode_rmaps(rr); + if (error) + goto end_agscan; + + error = xrep_rmap_find_refcount_rmaps(rr); + if (error) + goto end_agscan; + + error = xrep_rmap_find_agheader_rmaps(rr); + if (error) + goto end_agscan; + + error = xrep_rmap_find_log_rmaps(rr); +end_agscan: + xchk_ag_btcur_free(&sc->sa); + if (error) + return error; + + /* + * Set up for a potentially lengthy filesystem scan by reducing our + * transaction resource usage for the duration. Specifically: + * + * Unlock the AG header buffers and cancel the transaction to release + * the log grant space while we scan the filesystem. + * + * Create a new empty transaction to eliminate the possibility of the + * inode scan deadlocking on cyclical metadata. + * + * We pass the empty transaction to the file scanning function to avoid + * repeatedly cycling empty transactions. This can be done even though + * we take the IOLOCK to quiesce the file because empty transactions + * do not take sb_internal. 
+ */ + sa->agf_bp = NULL; + sa->agi_bp = NULL; + xchk_trans_cancel(sc); + error = xchk_trans_alloc_empty(sc); + if (error) + return error; + + /* Iterate all AGs for inodes rmaps. */ + while ((error = xchk_iscan_iter(&rr->iscan, &ip)) == 1) { + error = xrep_rmap_scan_inode(rr, ip); + xchk_irele(sc, ip); + if (error) + break; + + if (xchk_should_terminate(sc, &error)) + break; + } + xchk_iscan_iter_finish(&rr->iscan); + if (error) + return error; + + /* + * Switch out for a real transaction and lock the AG headers in + * preparation for building a new tree. + */ + xchk_trans_cancel(sc); + error = xchk_setup_fs(sc); + if (error) + return error; + return xchk_perag_drain_and_lock(sc); +} + +/* Section (II): Reserving space for new rmapbt and setting free space bitmap */ + +struct xrep_rmap_agfl { + struct xagb_bitmap *bitmap; + xfs_agnumber_t agno; +}; + +/* Add an AGFL block to the rmap list. */ +STATIC int +xrep_rmap_walk_agfl( + struct xfs_mount *mp, + xfs_agblock_t agbno, + void *priv) +{ + struct xrep_rmap_agfl *ra = priv; + + return xagb_bitmap_set(ra->bitmap, agbno, 1); +} + +/* + * Run one round of reserving space for the new rmapbt and recomputing the + * number of blocks needed to store the previously observed rmapbt records and + * the ones we'll create for the free space metadata. When we don't need more + * blocks, return a bitmap of OWN_AG extents in @freesp_blocks and set @done to + * true. + */ +STATIC int +xrep_rmap_try_reserve( + struct xrep_rmap *rr, + struct xfs_btree_cur *rmap_cur, + uint64_t nr_records, + struct xagb_bitmap *freesp_blocks, + uint64_t *blocks_reserved, + bool *done) +{ + struct xrep_rmap_agfl ra = { + .bitmap = freesp_blocks, + .agno = rr->sc->sa.pag->pag_agno, + }; + struct xfs_scrub *sc = rr->sc; + struct xrep_newbt_resv *resv, *n; + struct xfs_agf *agf = sc->sa.agf_bp->b_addr; + struct xfs_buf *agfl_bp; + uint64_t nr_blocks; /* RMB */ + uint64_t freesp_records; + int error; + + /* + * We're going to recompute new_btree.bload.nr_blocks at the end of + * this function to reflect however many btree blocks we need to store + * all the rmap records (including the ones that reflect the changes we + * made to support the new rmapbt blocks), so we save the old value + * here so we can decide if we've reserved enough blocks. + */ + nr_blocks = rr->new_btree.bload.nr_blocks; + + /* + * Make sure we've reserved enough space for the new btree. This can + * change the shape of the free space btrees, which can cause secondary + * interactions with the rmap records because all three space btrees + * have the same rmap owner. We'll account for all that below. + */ + error = xrep_newbt_alloc_blocks(&rr->new_btree, + nr_blocks - *blocks_reserved); + if (error) + return error; + + *blocks_reserved = rr->new_btree.bload.nr_blocks; + + /* Clear everything in the bitmap. */ + xagb_bitmap_destroy(freesp_blocks); + + /* Set all the bnobt blocks in the bitmap. */ + sc->sa.bno_cur = xfs_bnobt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp, + sc->sa.pag); + error = xagb_bitmap_set_btblocks(freesp_blocks, sc->sa.bno_cur); + xfs_btree_del_cursor(sc->sa.bno_cur, error); + sc->sa.bno_cur = NULL; + if (error) + return error; + + /* Set all the cntbt blocks in the bitmap. */ + sc->sa.cnt_cur = xfs_cntbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp, + sc->sa.pag); + error = xagb_bitmap_set_btblocks(freesp_blocks, sc->sa.cnt_cur); + xfs_btree_del_cursor(sc->sa.cnt_cur, error); + sc->sa.cnt_cur = NULL; + if (error) + return error; + + /* Record our new btreeblks value. 
*/ + rr->freesp_btblocks = xagb_bitmap_hweight(freesp_blocks) - 2; + + /* Set all the new rmapbt blocks in the bitmap. */ + list_for_each_entry_safe(resv, n, &rr->new_btree.resv_list, list) { + error = xagb_bitmap_set(freesp_blocks, resv->agbno, resv->len); + if (error) + return error; + } + + /* Set all the AGFL blocks in the bitmap. */ + error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp); + if (error) + return error; + + error = xfs_agfl_walk(sc->mp, agf, agfl_bp, xrep_rmap_walk_agfl, &ra); + if (error) + return error; + + /* Count the extents in the bitmap. */ + freesp_records = xagb_bitmap_count_set_regions(freesp_blocks); + + /* Compute how many blocks we'll need for all the rmaps. */ + error = xfs_btree_bload_compute_geometry(rmap_cur, + &rr->new_btree.bload, nr_records + freesp_records); + if (error) + return error; + + /* We're done when we don't need more blocks. */ + *done = nr_blocks >= rr->new_btree.bload.nr_blocks; + return 0; +} + +/* + * Iteratively reserve space for rmap btree while recording OWN_AG rmaps for + * the free space metadata. This implements section (II) above. + */ +STATIC int +xrep_rmap_reserve_space( + struct xrep_rmap *rr, + struct xfs_btree_cur *rmap_cur) +{ + struct xagb_bitmap freesp_blocks; /* AGBIT */ + uint64_t nr_records; /* NR */ + uint64_t blocks_reserved = 0; + bool done = false; + int error; + + nr_records = xfarray_length(rr->rmap_records); + + /* Compute how many blocks we'll need for the rmaps collected so far. */ + error = xfs_btree_bload_compute_geometry(rmap_cur, + &rr->new_btree.bload, nr_records); + if (error) + return error; + + /* Last chance to abort before we start committing fixes. */ + if (xchk_should_terminate(rr->sc, &error)) + return error; + + xagb_bitmap_init(&freesp_blocks); + + /* + * Iteratively reserve space for the new rmapbt and recompute the + * number of blocks needed to store the previously observed rmapbt + * records and the ones we'll create for the free space metadata. + * Finish when we don't need more blocks. + */ + do { + error = xrep_rmap_try_reserve(rr, rmap_cur, nr_records, + &freesp_blocks, &blocks_reserved, &done); + if (error) + goto out_bitmap; + } while (!done); + + /* Emit rmaps for everything in the free space bitmap. */ + xrep_ag_btcur_init(rr->sc, &rr->sc->sa); + error = xrep_rmap_stash_bitmap(rr, &freesp_blocks, &XFS_RMAP_OINFO_AG); + xchk_ag_btcur_free(&rr->sc->sa); + +out_bitmap: + xagb_bitmap_destroy(&freesp_blocks); + return error; +} + +/* Section (III): Building the new rmap btree. */ + +/* Update the AGF counters. */ +STATIC int +xrep_rmap_reset_counters( + struct xrep_rmap *rr) +{ + struct xfs_scrub *sc = rr->sc; + struct xfs_perag *pag = sc->sa.pag; + struct xfs_agf *agf = sc->sa.agf_bp->b_addr; + xfs_agblock_t rmap_btblocks; + + /* + * The AGF header contains extra information related to the reverse + * mapping btree, so we must update those fields here. + */ + rmap_btblocks = rr->new_btree.afake.af_blocks - 1; + agf->agf_btreeblks = cpu_to_be32(rr->freesp_btblocks + rmap_btblocks); + xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, XFS_AGF_BTREEBLKS); + + /* + * After we commit the new btree to disk, it is possible that the + * process to reap the old btree blocks will race with the AIL trying + * to checkpoint the old btree blocks into the filesystem. If the new + * tree is shorter than the old one, the rmapbt write verifier will + * fail and the AIL will shut down the filesystem. 
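The reservation loop above converges because each round can only add OWN_AG rmaps for blocks that the previous round's allocations touched. A worked illustration with made-up numbers:

/*
 * Round 1: 100,000 file/metadata rmaps need RMB = 120 blocks.  The
 *          reservation reshapes the bnobt/cntbt and AGFL, so the
 *          bitmap now holds AGNR = 35 OWN_AG extents, and 100,035
 *          records need RMB' = 121 blocks.  121 > 120, so loop.
 * Round 2: reserve 1 more block; the free space metadata barely
 *          changes, RMB' stays at 121, and done becomes true.
 */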
+ * + * To avoid this, save the old incore btree height values as the alt + * height values before re-initializing the perag info from the updated + * AGF to capture all the new values. + */ + pag->pagf_repair_rmap_level = pag->pagf_rmap_level; + + /* Reinitialize with the values we just logged. */ + return xrep_reinit_pagf(sc); +} + +/* Retrieve rmapbt data for bulk load. */ +STATIC int +xrep_rmap_get_records( + struct xfs_btree_cur *cur, + unsigned int idx, + struct xfs_btree_block *block, + unsigned int nr_wanted, + void *priv) +{ + struct xrep_rmap_extent rec; + struct xfs_rmap_irec *irec = &cur->bc_rec.r; + struct xrep_rmap *rr = priv; + union xfs_btree_rec *block_rec; + unsigned int loaded; + int error; + + for (loaded = 0; loaded < nr_wanted; loaded++, idx++) { + error = xfarray_load_next(rr->rmap_records, &rr->array_cur, + &rec); + if (error) + return error; + + irec->rm_startblock = rec.startblock; + irec->rm_blockcount = rec.blockcount; + irec->rm_owner = rec.owner; + if (xfs_rmap_irec_offset_unpack(rec.offset, irec) != NULL) + return -EFSCORRUPTED; + + error = xrep_rmap_check_mapping(rr->sc, irec); + if (error) + return error; + + block_rec = xfs_btree_rec_addr(cur, idx, block); + cur->bc_ops->init_rec_from_cur(cur, block_rec); + } + + return loaded; +} + +/* Feed one of the new btree blocks to the bulk loader. */ +STATIC int +xrep_rmap_claim_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + void *priv) +{ + struct xrep_rmap *rr = priv; + + return xrep_newbt_claim_block(cur, &rr->new_btree, ptr); +} + +/* Custom allocation function for new rmap btrees. */ +STATIC int +xrep_rmap_alloc_vextent( + struct xfs_scrub *sc, + struct xfs_alloc_arg *args, + xfs_fsblock_t alloc_hint) +{ + int error; + + /* + * We don't want an rmap update on the allocation, since we iteratively + * compute the OWN_AG records /after/ allocating blocks for the records + * that we already know we need to store. Therefore, fix the freelist + * with the NORMAP flag set so that we don't also try to create an rmap + * for new AGFL blocks. + */ + error = xrep_fix_freelist(sc, XFS_ALLOC_FLAG_NORMAP); + if (error) + return error; + + /* + * If xrep_fix_freelist fixed the freelist by moving blocks from the + * free space btrees or by removing blocks from the AGFL and queueing + * an EFI to free the block, the transaction will be dirty. This + * second case is of interest to us. + * + * Later on, we will need to compare gaps in the new recordset against + * the block usage of all OWN_AG owners in order to free the old + * btree's blocks, which means that we can't have EFIs for former AGFL + * blocks attached to the repair transaction when we commit the new + * btree. + * + * xrep_newbt_alloc_blocks guarantees this for us by calling + * xrep_defer_finish to commit anything that fix_freelist may have + * added to the transaction. + */ + return xfs_alloc_vextent_near_bno(args, alloc_hint); +} + +/* + * Use the collected rmap information to stage a new rmap btree. If this is + * successful we'll return with the new btree root information logged to the + * repair transaction but not yet committed. This implements section (III) + * above. 
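Note the interplay with the xrep_fix_freelist() signature change earlier in the patch: converting the bool can_shrink argument into raw allocation flags is exactly what lets this function pass XFS_ALLOC_FLAG_NORMAP. The two call sites after the change:

/* scrub/reap.c: plain freelist fix-up, shrinking allowed */
error = xrep_fix_freelist(sc, 0);

/* rmap repair: fix the freelist without emitting OWN_AG rmaps */
error = xrep_fix_freelist(sc, XFS_ALLOC_FLAG_NORMAP);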
+ */ +STATIC int +xrep_rmap_build_new_tree( + struct xrep_rmap *rr) +{ + struct xfs_scrub *sc = rr->sc; + struct xfs_perag *pag = sc->sa.pag; + struct xfs_agf *agf = sc->sa.agf_bp->b_addr; + struct xfs_btree_cur *rmap_cur; + xfs_fsblock_t fsbno; + int error; + + /* + * Preserve the old rmapbt block count so that we can adjust the + * per-AG rmapbt reservation after we commit the new btree root and + * want to dispose of the old btree blocks. + */ + rr->old_rmapbt_fsbcount = be32_to_cpu(agf->agf_rmap_blocks); + + /* + * Prepare to construct the new btree by reserving disk space for the + * new btree and setting up all the accounting information we'll need + * to root the new btree while it's under construction and before we + * attach it to the AG header. The new blocks are accounted to the + * rmapbt per-AG reservation, which we will adjust further after + * committing the new btree. + */ + fsbno = XFS_AGB_TO_FSB(sc->mp, pag->pag_agno, XFS_RMAP_BLOCK(sc->mp)); + xrep_newbt_init_ag(&rr->new_btree, sc, &XFS_RMAP_OINFO_SKIP_UPDATE, + fsbno, XFS_AG_RESV_RMAPBT); + rr->new_btree.bload.get_records = xrep_rmap_get_records; + rr->new_btree.bload.claim_block = xrep_rmap_claim_block; + rr->new_btree.alloc_vextent = xrep_rmap_alloc_vextent; + rmap_cur = xfs_rmapbt_init_cursor(sc->mp, NULL, NULL, pag); + xfs_btree_stage_afakeroot(rmap_cur, &rr->new_btree.afake); + + /* + * Initialize @rr->new_btree, reserve space for the new rmapbt, + * and compute OWN_AG rmaps. + */ + error = xrep_rmap_reserve_space(rr, rmap_cur); + if (error) + goto err_cur; + + /* + * Due to btree slack factors, it's possible for a new btree to be one + * level taller than the old btree. Update the incore btree height so + * that we don't trip the verifiers when writing the new btree blocks + * to disk. + */ + pag->pagf_repair_rmap_level = rr->new_btree.bload.btree_height; + + /* Add all observed rmap records. */ + rr->array_cur = XFARRAY_CURSOR_INIT; + sc->sa.bno_cur = xfs_bnobt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp, + sc->sa.pag); + error = xfs_btree_bload(rmap_cur, &rr->new_btree.bload, rr); + xfs_btree_del_cursor(sc->sa.bno_cur, error); + sc->sa.bno_cur = NULL; + if (error) + goto err_level; + + /* + * Install the new btree in the AG header. After this point the old + * btree is no longer accessible and the new tree is live. + */ + xfs_rmapbt_commit_staged_btree(rmap_cur, sc->tp, sc->sa.agf_bp); + xfs_btree_del_cursor(rmap_cur, 0); + + /* + * The newly committed rmap recordset includes mappings for the blocks + * that we reserved to build the new btree. If there is excess space + * reservation to be freed, the corresponding rmap records must also be + * removed. + */ + rr->new_btree.oinfo = XFS_RMAP_OINFO_AG; + + /* Reset the AGF counters now that we've changed the btree shape. */ + error = xrep_rmap_reset_counters(rr); + if (error) + goto err_newbt; + + /* Dispose of any unused blocks and the accounting information. */ + error = xrep_newbt_commit(&rr->new_btree); + if (error) + return error; + + return xrep_roll_ag_trans(sc); + +err_level: + pag->pagf_repair_rmap_level = 0; +err_cur: + xfs_btree_del_cursor(rmap_cur, error); +err_newbt: + xrep_newbt_cancel(&rr->new_btree); + return error; +} + +/* Section (IV): Reaping the old btree. */ + +struct xrep_rmap_find_gaps { + struct xagb_bitmap rmap_gaps; + xfs_agblock_t next_agbno; +}; + +/* Subtract each free extent in the bnobt from the rmap gaps. 
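A worked example of the reap computation on a toy 100-block AG (values illustrative; pairs are (agbno, length)):

/*
 * New rmapbt records:        (0, 40), (50, 30)
 * Gaps up to agf_length:     (40, 10), (80, 20)
 * bnobt free extents:        (40, 10), (85, 15)
 * Gaps minus free space:     (80, 5)
 *
 * Blocks 80-84 are claimed by nothing in the new rmapbt and are not
 * free space either, so they can only be old rmapbt blocks; they are
 * handed to xrep_reap_agblocks() under XFS_RMAP_OINFO_ANY_OWNER with
 * the XFS_AG_RESV_RMAPBT reservation.
 */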
*/ +STATIC int +xrep_rmap_find_freesp( + struct xfs_btree_cur *cur, + const struct xfs_alloc_rec_incore *rec, + void *priv) +{ + struct xrep_rmap_find_gaps *rfg = priv; + + return xagb_bitmap_clear(&rfg->rmap_gaps, rec->ar_startblock, + rec->ar_blockcount); +} + +/* + * Reap the old rmapbt blocks. Now that the rmapbt is fully rebuilt, we make + * a list of gaps in the rmap records and a list of the extents mentioned in + * the bnobt. Any block that's in the new rmapbt gap list but not mentioned + * in the bnobt is a block from the old rmapbt and can be removed. + */ +STATIC int +xrep_rmap_remove_old_tree( + struct xrep_rmap *rr) +{ + struct xrep_rmap_find_gaps rfg = { + .next_agbno = 0, + }; + struct xfs_scrub *sc = rr->sc; + struct xfs_agf *agf = sc->sa.agf_bp->b_addr; + struct xfs_perag *pag = sc->sa.pag; + xfs_agblock_t agend; + xfarray_idx_t array_cur; + int error; + + xagb_bitmap_init(&rfg.rmap_gaps); + + /* Compute free space from the new rmapbt. */ + foreach_xfarray_idx(rr->rmap_records, array_cur) { + struct xrep_rmap_extent rec; + + error = xfarray_load(rr->rmap_records, array_cur, &rec); + if (error) + goto out_bitmap; + + /* Record the free space we find. */ + if (rec.startblock > rfg.next_agbno) { + error = xagb_bitmap_set(&rfg.rmap_gaps, rfg.next_agbno, + rec.startblock - rfg.next_agbno); + if (error) + goto out_bitmap; + } + rfg.next_agbno = max_t(xfs_agblock_t, rfg.next_agbno, + rec.startblock + rec.blockcount); + } + + /* Insert a record for space between the last rmap and EOAG. */ + agend = be32_to_cpu(agf->agf_length); + if (rfg.next_agbno < agend) { + error = xagb_bitmap_set(&rfg.rmap_gaps, rfg.next_agbno, + agend - rfg.next_agbno); + if (error) + goto out_bitmap; + } + + /* Compute free space from the existing bnobt. */ + sc->sa.bno_cur = xfs_bnobt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp, + sc->sa.pag); + error = xfs_alloc_query_all(sc->sa.bno_cur, xrep_rmap_find_freesp, + &rfg); + xfs_btree_del_cursor(sc->sa.bno_cur, error); + sc->sa.bno_cur = NULL; + if (error) + goto out_bitmap; + + /* + * Free the "free" blocks that the new rmapbt knows about but the bnobt + * doesn't--these are the old rmapbt blocks. Credit the old rmapbt + * block usage count back to the per-AG rmapbt reservation (and not + * fdblocks, since the rmap btree lives in free space) to keep the + * reservation and free space accounting correct. + */ + error = xrep_reap_agblocks(sc, &rfg.rmap_gaps, + &XFS_RMAP_OINFO_ANY_OWNER, XFS_AG_RESV_RMAPBT); + if (error) + goto out_bitmap; + + /* + * Now that we've zapped all the old rmapbt blocks we can turn off + * the alternate height mechanism and reset the per-AG space + * reservation. + */ + pag->pagf_repair_rmap_level = 0; + sc->flags |= XREP_RESET_PERAG_RESV; +out_bitmap: + xagb_bitmap_destroy(&rfg.rmap_gaps); + return error; +} + +/* Set up the filesystem scan components. */ +STATIC int +xrep_rmap_setup_scan( + struct xrep_rmap *rr) +{ + struct xfs_scrub *sc = rr->sc; + char *descr; + int error; + + /* Set up some storage */ + descr = xchk_xfile_ag_descr(sc, "reverse mapping records"); + error = xfarray_create(descr, 0, sizeof(struct xrep_rmap_extent), + &rr->rmap_records); + kfree(descr); + if (error) + return error; + + /* Retry iget every tenth of a second for up to 30 seconds. */ + xchk_iscan_start(sc, 30000, 100, &rr->iscan); + return 0; +} + +/* Tear down scan components. */ +STATIC void +xrep_rmap_teardown( + struct xrep_rmap *rr) +{ + xchk_iscan_teardown(&rr->iscan); + xfarray_destroy(rr->rmap_records); +} + +/* Repair the rmap btree for some AG. 
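For clarity, both magic numbers passed to xchk_iscan_start() in the setup function are in milliseconds, matching the comment next to the call: the total iget retry budget and the delay between retries.

/* 30 second iget timeout in total, retrying every 100ms */
xchk_iscan_start(sc, 30000, 100, &rr->iscan);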
*/ +int +xrep_rmapbt( + struct xfs_scrub *sc) +{ + struct xrep_rmap *rr = sc->buf; + int error; + + /* Functionality is not yet complete. */ + return xrep_notsupported(sc); + + error = xrep_rmap_setup_scan(rr); + if (error) + return error; + + /* + * Collect rmaps for everything in this AG that isn't space metadata. + * These rmaps won't change even as we try to allocate blocks. + */ + error = xrep_rmap_find_rmaps(rr); + if (error) + goto out_records; + + /* Rebuild the rmap information. */ + error = xrep_rmap_build_new_tree(rr); + if (error) + goto out_records; + + /* Kill the old tree. */ + error = xrep_rmap_remove_old_tree(rr); + if (error) + goto out_records; + +out_records: + xrep_rmap_teardown(rr); + return error; +} diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index 6828e72824fb..2a68b8f483bb 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -278,7 +278,7 @@ static const struct xchk_meta_ops meta_scrub_ops[] = { .setup = xchk_setup_ag_rmapbt, .scrub = xchk_rmapbt, .has = xfs_has_rmapbt, - .repair = xrep_notsupported, + .repair = xrep_rmapbt, }, [XFS_SCRUB_TYPE_REFCNTBT] = { /* refcountbt */ .type = ST_PERAG, diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index b840f25c03d6..1c97c2ff835e 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -1595,7 +1595,6 @@ DEFINE_EVENT(xrep_rmap_class, name, \ uint64_t owner, uint64_t offset, unsigned int flags), \ TP_ARGS(mp, agno, agbno, len, owner, offset, flags)) DEFINE_REPAIR_RMAP_EVENT(xrep_ibt_walk_rmap); -DEFINE_REPAIR_RMAP_EVENT(xrep_rmap_extent_fn); DEFINE_REPAIR_RMAP_EVENT(xrep_bmap_walk_rmap); TRACE_EVENT(xrep_abt_found, @@ -1713,6 +1712,38 @@ TRACE_EVENT(xrep_bmap_found, __entry->state) ); +TRACE_EVENT(xrep_rmap_found, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + const struct xfs_rmap_irec *rec), + TP_ARGS(mp, agno, rec), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, len) + __field(uint64_t, owner) + __field(uint64_t, offset) + __field(unsigned int, flags) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->agbno = rec->rm_startblock; + __entry->len = rec->rm_blockcount; + __entry->owner = rec->rm_owner; + __entry->offset = rec->rm_offset; + __entry->flags = rec->rm_flags; + ), + TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x owner 0x%llx fileoff 0x%llx flags 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agbno, + __entry->len, + __entry->owner, + __entry->offset, + __entry->flags) +); + TRACE_EVENT(xrep_findroot_block, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, uint32_t magic, uint16_t level), -- cgit v1.2.3 From 4787fc802752c9b73b28ff18860c0560bf4337f2 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 22 Feb 2024 12:43:39 -0800 Subject: xfs: create a shadow rmap btree during rmap repair Create an in-memory btree of rmap records instead of an array. This enables us to do live record collection instead of freezing the fs. Signed-off-by: Darrick J. 
Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_rmap.c | 37 ++++--- fs/xfs/libxfs/xfs_rmap_btree.c | 150 +++++++++++++++++++++++++- fs/xfs/libxfs/xfs_rmap_btree.h | 6 ++ fs/xfs/libxfs/xfs_shared.h | 10 ++ fs/xfs/scrub/repair.c | 18 ++++ fs/xfs/scrub/repair.h | 2 + fs/xfs/scrub/rmap_repair.c | 234 +++++++++++++++++++++++++++++------------ fs/xfs/xfs_stats.c | 3 +- fs/xfs/xfs_stats.h | 1 + 9 files changed, 377 insertions(+), 84 deletions(-) diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index 83bc84a87421..d9f1b336ec33 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -269,6 +269,16 @@ xfs_rmap_check_irec( return NULL; } +static inline xfs_failaddr_t +xfs_rmap_check_btrec( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *irec) +{ + if (xfs_btree_is_mem_rmap(cur->bc_ops)) + return xfs_rmap_check_irec(cur->bc_mem.pag, irec); + return xfs_rmap_check_irec(cur->bc_ag.pag, irec); +} + static inline int xfs_rmap_complain_bad_rec( struct xfs_btree_cur *cur, @@ -277,9 +287,13 @@ xfs_rmap_complain_bad_rec( { struct xfs_mount *mp = cur->bc_mp; - xfs_warn(mp, - "Reverse Mapping BTree record corruption in AG %d detected at %pS!", - cur->bc_ag.pag->pag_agno, fa); + if (xfs_btree_is_mem_rmap(cur->bc_ops)) + xfs_warn(mp, + "In-Memory Reverse Mapping BTree record corruption detected at %pS!", fa); + else + xfs_warn(mp, + "Reverse Mapping BTree record corruption in AG %d detected at %pS!", + cur->bc_ag.pag->pag_agno, fa); xfs_warn(mp, "Owner 0x%llx, flags 0x%x, start block 0x%x block count 0x%x", irec->rm_owner, irec->rm_flags, irec->rm_startblock, @@ -307,7 +321,7 @@ xfs_rmap_get_rec( fa = xfs_rmap_btrec_to_irec(rec, irec); if (!fa) - fa = xfs_rmap_check_irec(cur->bc_ag.pag, irec); + fa = xfs_rmap_check_btrec(cur, irec); if (fa) return xfs_rmap_complain_bad_rec(cur, fa, irec); @@ -2404,15 +2418,12 @@ xfs_rmap_map_raw( { struct xfs_owner_info oinfo; - oinfo.oi_owner = rmap->rm_owner; - oinfo.oi_offset = rmap->rm_offset; - oinfo.oi_flags = 0; - if (rmap->rm_flags & XFS_RMAP_ATTR_FORK) - oinfo.oi_flags |= XFS_OWNER_INFO_ATTR_FORK; - if (rmap->rm_flags & XFS_RMAP_BMBT_BLOCK) - oinfo.oi_flags |= XFS_OWNER_INFO_BMBT_BLOCK; + xfs_owner_info_pack(&oinfo, rmap->rm_owner, rmap->rm_offset, + rmap->rm_flags); - if (rmap->rm_flags || XFS_RMAP_NON_INODE_OWNER(rmap->rm_owner)) + if ((rmap->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK | + XFS_RMAP_UNWRITTEN)) || + XFS_RMAP_NON_INODE_OWNER(rmap->rm_owner)) return xfs_rmap_map(cur, rmap->rm_startblock, rmap->rm_blockcount, rmap->rm_flags & XFS_RMAP_UNWRITTEN, @@ -2442,7 +2453,7 @@ xfs_rmap_query_range_helper( fa = xfs_rmap_btrec_to_irec(rec, &irec); if (!fa) - fa = xfs_rmap_check_irec(cur->bc_ag.pag, &irec); + fa = xfs_rmap_check_btrec(cur, &irec); if (fa) return xfs_rmap_complain_bad_rec(cur, fa, &irec); diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index 815e34e295dd..9e759efa81cc 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -22,6 +22,8 @@ #include "xfs_extent_busy.h" #include "xfs_ag.h" #include "xfs_ag_resv.h" +#include "xfs_buf_mem.h" +#include "xfs_btree_mem.h" static struct kmem_cache *xfs_rmapbt_cur_cache; @@ -541,6 +543,151 @@ xfs_rmapbt_init_cursor( return cur; } +#ifdef CONFIG_XFS_BTREE_IN_MEM +static inline unsigned int +xfs_rmapbt_mem_block_maxrecs( + unsigned int blocklen, + bool leaf) +{ + if (leaf) + return blocklen / sizeof(struct xfs_rmap_rec); + return blocklen / + (2 * sizeof(struct xfs_rmap_key) + sizeof(__be64)); +} + 
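Plugging typical numbers into xfs_rmapbt_mem_block_maxrecs() shows the in-memory fanout. This assumes 4096-byte xfile blocks and the usual on-disk structure sizes (24-byte xfs_rmap_rec, 20-byte xfs_rmap_key, 72-byte long-format CRC block header), so treat the arithmetic as illustrative:

/*
 * blocklen = 4096 - 72 = 4024 usable bytes
 *   leaf: 4024 / 24         = 167 records per block
 *   node: 4024 / (2*20 + 8) =  83 key/pointer slots per block
 *
 * Two keys per pointer because the rmapbt is an overlapping btree:
 * every node entry carries both a low key and a high key.
 */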
+/* + * Validate an in-memory rmap btree block. Callers are allowed to generate an + * in-memory btree even if the ondisk feature is not enabled. + */ +static xfs_failaddr_t +xfs_rmapbt_mem_verify( + struct xfs_buf *bp) +{ + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + xfs_failaddr_t fa; + unsigned int level; + unsigned int maxrecs; + + if (!xfs_verify_magic(bp, block->bb_magic)) + return __this_address; + + fa = xfs_btree_fsblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN); + if (fa) + return fa; + + level = be16_to_cpu(block->bb_level); + if (level >= xfs_rmapbt_maxlevels_ondisk()) + return __this_address; + + maxrecs = xfs_rmapbt_mem_block_maxrecs( + XFBNO_BLOCKSIZE - XFS_BTREE_LBLOCK_CRC_LEN, level == 0); + return xfs_btree_memblock_verify(bp, maxrecs); +} + +static void +xfs_rmapbt_mem_rw_verify( + struct xfs_buf *bp) +{ + xfs_failaddr_t fa = xfs_rmapbt_mem_verify(bp); + + if (fa) + xfs_verifier_error(bp, -EFSCORRUPTED, fa); +} + +/* skip crc checks on in-memory btrees to save time */ +static const struct xfs_buf_ops xfs_rmapbt_mem_buf_ops = { + .name = "xfs_rmapbt_mem", + .magic = { 0, cpu_to_be32(XFS_RMAP_CRC_MAGIC) }, + .verify_read = xfs_rmapbt_mem_rw_verify, + .verify_write = xfs_rmapbt_mem_rw_verify, + .verify_struct = xfs_rmapbt_mem_verify, +}; + +const struct xfs_btree_ops xfs_rmapbt_mem_ops = { + .name = "mem_rmap", + .type = XFS_BTREE_TYPE_MEM, + .geom_flags = XFS_BTGEO_OVERLAPPING, + + .rec_len = sizeof(struct xfs_rmap_rec), + /* Overlapping btree; 2 keys per pointer. */ + .key_len = 2 * sizeof(struct xfs_rmap_key), + .ptr_len = XFS_BTREE_LONG_PTR_LEN, + + .lru_refs = XFS_RMAP_BTREE_REF, + .statoff = XFS_STATS_CALC_INDEX(xs_rmap_mem_2), + + .dup_cursor = xfbtree_dup_cursor, + .set_root = xfbtree_set_root, + .alloc_block = xfbtree_alloc_block, + .free_block = xfbtree_free_block, + .get_minrecs = xfbtree_get_minrecs, + .get_maxrecs = xfbtree_get_maxrecs, + .init_key_from_rec = xfs_rmapbt_init_key_from_rec, + .init_high_key_from_rec = xfs_rmapbt_init_high_key_from_rec, + .init_rec_from_cur = xfs_rmapbt_init_rec_from_cur, + .init_ptr_from_cur = xfbtree_init_ptr_from_cur, + .key_diff = xfs_rmapbt_key_diff, + .buf_ops = &xfs_rmapbt_mem_buf_ops, + .diff_two_keys = xfs_rmapbt_diff_two_keys, + .keys_inorder = xfs_rmapbt_keys_inorder, + .recs_inorder = xfs_rmapbt_recs_inorder, + .keys_contiguous = xfs_rmapbt_keys_contiguous, +}; + +/* Create a cursor for an in-memory btree. */ +struct xfs_btree_cur * +xfs_rmapbt_mem_cursor( + struct xfs_perag *pag, + struct xfs_trans *tp, + struct xfbtree *xfbt) +{ + struct xfs_btree_cur *cur; + struct xfs_mount *mp = pag->pag_mount; + + cur = xfs_btree_alloc_cursor(mp, tp, &xfs_rmapbt_mem_ops, + xfs_rmapbt_maxlevels_ondisk(), xfs_rmapbt_cur_cache); + cur->bc_mem.xfbtree = xfbt; + cur->bc_nlevels = xfbt->nlevels; + + cur->bc_mem.pag = xfs_perag_hold(pag); + return cur; +} + +/* Create an in-memory rmap btree. */ +int +xfs_rmapbt_mem_init( + struct xfs_mount *mp, + struct xfbtree *xfbt, + struct xfs_buftarg *btp, + xfs_agnumber_t agno) +{ + xfbt->owner = agno; + return xfbtree_init(mp, xfbt, btp, &xfs_rmapbt_mem_ops); +} + +/* Compute the max possible height for reverse mapping btrees in memory. 
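Pieced together from the hunks in this patch, the expected lifecycle of the shadow btree looks roughly like the sketch below. Error handling is trimmed and the call to xfs_rmapbt_mem_init() lands in rmap_repair.c hunks not quoted here, so this is an approximation rather than the patch's literal code:

/* Once per repair: in-memory buftarg, then the shadow btree itself. */
error = xrep_setup_xfbtree(sc, descr);
error = xfs_rmapbt_mem_init(sc->mp, &rr->rmap_btree, sc->xmbtp,
		sc->sa.pag->pag_agno);

/* Per update: a short-lived cursor bound to the current transaction. */
mcur = xfs_rmapbt_mem_cursor(sc->sa.pag, sc->tp, &rr->rmap_btree);
error = xfs_rmap_map_raw(mcur, &rmap);
xfs_btree_del_cursor(mcur, error);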
*/ +static unsigned int +xfs_rmapbt_mem_maxlevels(void) +{ + unsigned int minrecs[2]; + unsigned int blocklen; + + blocklen = XFBNO_BLOCKSIZE - XFS_BTREE_LBLOCK_CRC_LEN; + + minrecs[0] = xfs_rmapbt_mem_block_maxrecs(blocklen, true) / 2; + minrecs[1] = xfs_rmapbt_mem_block_maxrecs(blocklen, false) / 2; + + /* + * How tall can an in-memory rmap btree become if we filled the entire + * AG with rmap records? + */ + return xfs_btree_compute_maxlevels(minrecs, + XFS_MAX_AG_BYTES / sizeof(struct xfs_rmap_rec)); +} +#else +# define xfs_rmapbt_mem_maxlevels() (0) +#endif /* CONFIG_XFS_BTREE_IN_MEM */ + /* * Install a new reverse mapping btree root. Caller is responsible for * invalidating and freeing the old btree blocks. @@ -611,7 +758,8 @@ xfs_rmapbt_maxlevels_ondisk(void) * like if it consumes almost all the blocks in the AG due to maximal * sharing factor. */ - return xfs_btree_space_to_height(minrecs, XFS_MAX_CRC_AG_BLOCKS); + return max(xfs_btree_space_to_height(minrecs, XFS_MAX_CRC_AG_BLOCKS), + xfs_rmapbt_mem_maxlevels()); } /* Compute the maximum height of an rmap btree. */ diff --git a/fs/xfs/libxfs/xfs_rmap_btree.h b/fs/xfs/libxfs/xfs_rmap_btree.h index 27536d7e14aa..eb90d89e8086 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.h +++ b/fs/xfs/libxfs/xfs_rmap_btree.h @@ -10,6 +10,7 @@ struct xfs_buf; struct xfs_btree_cur; struct xfs_mount; struct xbtree_afakeroot; +struct xfbtree; /* rmaps only exist on crc enabled filesystems */ #define XFS_RMAP_BLOCK_LEN XFS_BTREE_SBLOCK_CRC_LEN @@ -62,4 +63,9 @@ unsigned int xfs_rmapbt_maxlevels_ondisk(void); int __init xfs_rmapbt_init_cur_cache(void); void xfs_rmapbt_destroy_cur_cache(void); +struct xfs_btree_cur *xfs_rmapbt_mem_cursor(struct xfs_perag *pag, + struct xfs_trans *tp, struct xfbtree *xfbtree); +int xfs_rmapbt_mem_init(struct xfs_mount *mp, struct xfbtree *xfbtree, + struct xfs_buftarg *btp, xfs_agnumber_t agno); + #endif /* __XFS_RMAP_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h index 6b8bc276d461..cab49e7116ec 100644 --- a/fs/xfs/libxfs/xfs_shared.h +++ b/fs/xfs/libxfs/xfs_shared.h @@ -51,6 +51,7 @@ extern const struct xfs_btree_ops xfs_finobt_ops; extern const struct xfs_btree_ops xfs_bmbt_ops; extern const struct xfs_btree_ops xfs_refcountbt_ops; extern const struct xfs_btree_ops xfs_rmapbt_ops; +extern const struct xfs_btree_ops xfs_rmapbt_mem_ops; static inline bool xfs_btree_is_bno(const struct xfs_btree_ops *ops) { @@ -87,6 +88,15 @@ static inline bool xfs_btree_is_rmap(const struct xfs_btree_ops *ops) return ops == &xfs_rmapbt_ops; } +#ifdef CONFIG_XFS_BTREE_IN_MEM +static inline bool xfs_btree_is_mem_rmap(const struct xfs_btree_ops *ops) +{ + return ops == &xfs_rmapbt_mem_ops; +} +#else +# define xfs_btree_is_mem_rmap(...) 
(false) +#endif + /* log size calculation functions */ int xfs_log_calc_unit_res(struct xfs_mount *mp, int unit_bytes); int xfs_log_calc_minimum_size(struct xfs_mount *); diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index 0c56dafd9ae4..2645abddad78 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -31,12 +31,14 @@ #include "xfs_error.h" #include "xfs_reflink.h" #include "xfs_health.h" +#include "xfs_buf_mem.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" #include "scrub/repair.h" #include "scrub/bitmap.h" #include "scrub/stats.h" +#include "scrub/xfile.h" /* * Attempt to repair some metadata, if the metadata is corrupt and userspace @@ -1147,3 +1149,19 @@ xrep_metadata_inode_forks( return 0; } + +/* + * Set up an in-memory buffer cache so that we can use the xfbtree. Allocating + * a shmem file might take locks, so we cannot be in transaction context. Park + * our resources in the scrub context and let the teardown function take care + * of them at the right time. + */ +int +xrep_setup_xfbtree( + struct xfs_scrub *sc, + const char *descr) +{ + ASSERT(sc->tp == NULL); + + return xmbuf_alloc(sc->mp, descr, &sc->xmbtp); +} diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h index c01e56799bd1..059b51027365 100644 --- a/fs/xfs/scrub/repair.h +++ b/fs/xfs/scrub/repair.h @@ -81,6 +81,8 @@ int xrep_ino_dqattach(struct xfs_scrub *sc); # define xrep_ino_dqattach(sc) (0) #endif /* CONFIG_XFS_QUOTA */ +int xrep_setup_xfbtree(struct xfs_scrub *sc, const char *descr); + int xrep_ino_ensure_extent_count(struct xfs_scrub *sc, int whichfork, xfs_extnum_t nextents); int xrep_reset_perag_resv(struct xfs_scrub *sc); diff --git a/fs/xfs/scrub/rmap_repair.c b/fs/xfs/scrub/rmap_repair.c index 120efb49bbfe..6efca28e461d 100644 --- a/fs/xfs/scrub/rmap_repair.c +++ b/fs/xfs/scrub/rmap_repair.c @@ -12,6 +12,8 @@ #include "xfs_defer.h" #include "xfs_btree.h" #include "xfs_btree_staging.h" +#include "xfs_buf_mem.h" +#include "xfs_btree_mem.h" #include "xfs_bit.h" #include "xfs_log_format.h" #include "xfs_trans.h" @@ -121,33 +123,25 @@ * We use the 'xrep_rmap' prefix for all the rmap functions. */ -/* - * Packed rmap record. The ATTR/BMBT/UNWRITTEN flags are hidden in the upper - * bits of offset, just like the on-disk record. - */ -struct xrep_rmap_extent { - xfs_agblock_t startblock; - xfs_extlen_t blockcount; - uint64_t owner; - uint64_t offset; -} __packed; - /* Context for collecting rmaps */ struct xrep_rmap { /* new rmapbt information */ struct xrep_newbt new_btree; /* rmap records generated from primary metadata */ - struct xfarray *rmap_records; + struct xfbtree rmap_btree; struct xfs_scrub *sc; - /* get_records()'s position in the rmap record array. */ - xfarray_idx_t array_cur; + /* in-memory btree cursor for the xfs_btree_bload iteration */ + struct xfs_btree_cur *mcur; /* inode scan cursor */ struct xchk_iscan iscan; + /* Number of non-freespace records found. 
*/ + unsigned long long nr_records; + /* bnobt/cntbt contribution to btreeblks */ xfs_agblock_t freesp_btblocks; @@ -161,6 +155,14 @@ xrep_setup_ag_rmapbt( struct xfs_scrub *sc) { struct xrep_rmap *rr; + char *descr; + int error; + + descr = xchk_xfile_ag_descr(sc, "reverse mapping records"); + error = xrep_setup_xfbtree(sc, descr); + kfree(descr); + if (error) + return error; rr = kzalloc(sizeof(struct xrep_rmap), XCHK_GFP_FLAGS); if (!rr) @@ -204,11 +206,6 @@ xrep_rmap_stash( uint64_t offset, unsigned int flags) { - struct xrep_rmap_extent rre = { - .startblock = startblock, - .blockcount = blockcount, - .owner = owner, - }; struct xfs_rmap_irec rmap = { .rm_startblock = startblock, .rm_blockcount = blockcount, @@ -217,6 +214,7 @@ xrep_rmap_stash( .rm_flags = flags, }; struct xfs_scrub *sc = rr->sc; + struct xfs_btree_cur *mcur; int error = 0; if (xchk_should_terminate(sc, &error)) @@ -224,8 +222,17 @@ xrep_rmap_stash( trace_xrep_rmap_found(sc->mp, sc->sa.pag->pag_agno, &rmap); - rre.offset = xfs_rmap_irec_offset_pack(&rmap); - return xfarray_append(rr->rmap_records, &rre); + mcur = xfs_rmapbt_mem_cursor(sc->sa.pag, sc->tp, &rr->rmap_btree); + error = xfs_rmap_map_raw(mcur, &rmap); + xfs_btree_del_cursor(mcur, error); + if (error) + goto out_cancel; + + return xfbtree_trans_commit(&rr->rmap_btree, sc->tp); + +out_cancel: + xfbtree_trans_cancel(&rr->rmap_btree, sc->tp); + return error; } struct xrep_rmap_stash_run { @@ -804,6 +811,24 @@ xrep_rmap_find_log_rmaps( sc->mp->m_sb.sb_logblocks, XFS_RMAP_OWN_LOG, 0, 0); } +/* Check and count all the records that we gathered. */ +STATIC int +xrep_rmap_check_record( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *priv) +{ + struct xrep_rmap *rr = priv; + int error; + + error = xrep_rmap_check_mapping(rr->sc, rec); + if (error) + return error; + + rr->nr_records++; + return 0; +} + /* * Generate all the reverse-mappings for this AG, a list of the old rmapbt * blocks, and the new btreeblks count. Figure out if we have enough free @@ -817,6 +842,7 @@ xrep_rmap_find_rmaps( struct xfs_scrub *sc = rr->sc; struct xchk_ag *sa = &sc->sa; struct xfs_inode *ip; + struct xfs_btree_cur *mcur; int error; /* Find all the per-AG metadata. */ @@ -884,7 +910,29 @@ end_agscan: error = xchk_setup_fs(sc); if (error) return error; - return xchk_perag_drain_and_lock(sc); + error = xchk_perag_drain_and_lock(sc); + if (error) + return error; + + /* + * Now that we have everything locked again, we need to count the + * number of rmap records stashed in the btree. This should reflect + * all actively-owned space in the filesystem. At the same time, check + * all our records before we start building a new btree, which requires + * a bnobt cursor. + */ + mcur = xfs_rmapbt_mem_cursor(rr->sc->sa.pag, NULL, &rr->rmap_btree); + sc->sa.bno_cur = xfs_bnobt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp, + sc->sa.pag); + + rr->nr_records = 0; + error = xfs_rmap_query_all(mcur, xrep_rmap_check_record, rr); + + xfs_btree_del_cursor(sc->sa.bno_cur, error); + sc->sa.bno_cur = NULL; + xfs_btree_del_cursor(mcur, error); + + return error; } /* Section (II): Reserving space for new rmapbt and setting free space bitmap */ @@ -917,7 +965,6 @@ STATIC int xrep_rmap_try_reserve( struct xrep_rmap *rr, struct xfs_btree_cur *rmap_cur, - uint64_t nr_records, struct xagb_bitmap *freesp_blocks, uint64_t *blocks_reserved, bool *done) @@ -1001,7 +1048,7 @@ xrep_rmap_try_reserve( /* Compute how many blocks we'll need for all the rmaps. 
*/ error = xfs_btree_bload_compute_geometry(rmap_cur, - &rr->new_btree.bload, nr_records + freesp_records); + &rr->new_btree.bload, rr->nr_records + freesp_records); if (error) return error; @@ -1020,16 +1067,13 @@ xrep_rmap_reserve_space( struct xfs_btree_cur *rmap_cur) { struct xagb_bitmap freesp_blocks; /* AGBIT */ - uint64_t nr_records; /* NR */ uint64_t blocks_reserved = 0; bool done = false; int error; - nr_records = xfarray_length(rr->rmap_records); - /* Compute how many blocks we'll need for the rmaps collected so far. */ error = xfs_btree_bload_compute_geometry(rmap_cur, - &rr->new_btree.bload, nr_records); + &rr->new_btree.bload, rr->nr_records); if (error) return error; @@ -1046,8 +1090,8 @@ xrep_rmap_reserve_space( * Finish when we don't need more blocks. */ do { - error = xrep_rmap_try_reserve(rr, rmap_cur, nr_records, - &freesp_blocks, &blocks_reserved, &done); + error = xrep_rmap_try_reserve(rr, rmap_cur, &freesp_blocks, + &blocks_reserved, &done); if (error) goto out_bitmap; } while (!done); @@ -1108,28 +1152,25 @@ xrep_rmap_get_records( unsigned int nr_wanted, void *priv) { - struct xrep_rmap_extent rec; - struct xfs_rmap_irec *irec = &cur->bc_rec.r; struct xrep_rmap *rr = priv; union xfs_btree_rec *block_rec; unsigned int loaded; int error; for (loaded = 0; loaded < nr_wanted; loaded++, idx++) { - error = xfarray_load_next(rr->rmap_records, &rr->array_cur, - &rec); + int stat = 0; + + error = xfs_btree_increment(rr->mcur, 0, &stat); if (error) return error; - - irec->rm_startblock = rec.startblock; - irec->rm_blockcount = rec.blockcount; - irec->rm_owner = rec.owner; - if (xfs_rmap_irec_offset_unpack(rec.offset, irec) != NULL) + if (!stat) return -EFSCORRUPTED; - error = xrep_rmap_check_mapping(rr->sc, irec); + error = xfs_rmap_get_rec(rr->mcur, &cur->bc_rec.r, &stat); if (error) return error; + if (!stat) + return -EFSCORRUPTED; block_rec = xfs_btree_rec_addr(cur, idx, block); cur->bc_ops->init_rec_from_cur(cur, block_rec); @@ -1189,6 +1230,29 @@ xrep_rmap_alloc_vextent( return xfs_alloc_vextent_near_bno(args, alloc_hint); } + +/* Count the records in this btree. */ +STATIC int +xrep_rmap_count_records( + struct xfs_btree_cur *cur, + unsigned long long *nr) +{ + int running = 1; + int error; + + *nr = 0; + + error = xfs_btree_goto_left_edge(cur); + if (error) + return error; + + while (running && !(error = xfs_btree_increment(cur, 0, &running))) { + if (running) + (*nr)++; + } + + return error; +} /* * Use the collected rmap information to stage a new rmap btree. If this is * successful we'll return with the new btree root information logged to the @@ -1238,6 +1302,17 @@ xrep_rmap_build_new_tree( if (error) goto err_cur; + /* + * Count the rmapbt records again, because the space reservation + * for the rmapbt itself probably added more records to the btree. + */ + rr->mcur = xfs_rmapbt_mem_cursor(rr->sc->sa.pag, NULL, + &rr->rmap_btree); + + error = xrep_rmap_count_records(rr->mcur, &rr->nr_records); + if (error) + goto err_mcur; + /* * Due to btree slack factors, it's possible for a new btree to be one * level taller than the old btree. Update the incore btree height so @@ -1246,13 +1321,16 @@ xrep_rmap_build_new_tree( */ pag->pagf_repair_rmap_level = rr->new_btree.bload.btree_height; + /* + * Move the cursor to the left edge of the tree so that the first + * increment in ->get_records positions us at the first record. + */ + error = xfs_btree_goto_left_edge(rr->mcur); + if (error) + goto err_level; + /* Add all observed rmap records. 
*/ - rr->array_cur = XFARRAY_CURSOR_INIT; - sc->sa.bno_cur = xfs_bnobt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp, - sc->sa.pag); error = xfs_btree_bload(rmap_cur, &rr->new_btree.bload, rr); - xfs_btree_del_cursor(sc->sa.bno_cur, error); - sc->sa.bno_cur = NULL; if (error) goto err_level; @@ -1262,6 +1340,14 @@ xrep_rmap_build_new_tree( */ xfs_rmapbt_commit_staged_btree(rmap_cur, sc->tp, sc->sa.agf_bp); xfs_btree_del_cursor(rmap_cur, 0); + xfs_btree_del_cursor(rr->mcur, 0); + rr->mcur = NULL; + + /* + * Now that we've written the new btree to disk, we don't need to keep + * updating the in-memory btree. Abort the scan to stop live updates. + */ + xchk_iscan_abort(&rr->iscan); /* * The newly committed rmap recordset includes mappings for the blocks @@ -1285,6 +1371,8 @@ xrep_rmap_build_new_tree( err_level: pag->pagf_repair_rmap_level = 0; +err_mcur: + xfs_btree_del_cursor(rr->mcur, error); err_cur: xfs_btree_del_cursor(rmap_cur, error); err_newbt: @@ -1312,6 +1400,28 @@ xrep_rmap_find_freesp( rec->ar_blockcount); } +/* Record the free space we find, as part of cleaning out the btree. */ +STATIC int +xrep_rmap_find_gaps( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *priv) +{ + struct xrep_rmap_find_gaps *rfg = priv; + int error; + + if (rec->rm_startblock > rfg->next_agbno) { + error = xagb_bitmap_set(&rfg->rmap_gaps, rfg->next_agbno, + rec->rm_startblock - rfg->next_agbno); + if (error) + return error; + } + + rfg->next_agbno = max_t(xfs_agblock_t, rfg->next_agbno, + rec->rm_startblock + rec->rm_blockcount); + return 0; +} + /* * Reap the old rmapbt blocks. Now that the rmapbt is fully rebuilt, we make * a list of gaps in the rmap records and a list of the extents mentioned in @@ -1328,30 +1438,19 @@ xrep_rmap_remove_old_tree( struct xfs_scrub *sc = rr->sc; struct xfs_agf *agf = sc->sa.agf_bp->b_addr; struct xfs_perag *pag = sc->sa.pag; + struct xfs_btree_cur *mcur; xfs_agblock_t agend; - xfarray_idx_t array_cur; int error; xagb_bitmap_init(&rfg.rmap_gaps); /* Compute free space from the new rmapbt. */ - foreach_xfarray_idx(rr->rmap_records, array_cur) { - struct xrep_rmap_extent rec; - - error = xfarray_load(rr->rmap_records, array_cur, &rec); - if (error) - goto out_bitmap; + mcur = xfs_rmapbt_mem_cursor(rr->sc->sa.pag, NULL, &rr->rmap_btree); - /* Record the free space we find. */ - if (rec.startblock > rfg.next_agbno) { - error = xagb_bitmap_set(&rfg.rmap_gaps, rfg.next_agbno, - rec.startblock - rfg.next_agbno); - if (error) - goto out_bitmap; - } - rfg.next_agbno = max_t(xfs_agblock_t, rfg.next_agbno, - rec.startblock + rec.blockcount); - } + error = xfs_rmap_query_all(mcur, xrep_rmap_find_gaps, &rfg); + xfs_btree_del_cursor(mcur, error); + if (error) + goto out_bitmap; /* Insert a record for space between the last rmap and EOAG. */ agend = be32_to_cpu(agf->agf_length); @@ -1402,14 +1501,11 @@ xrep_rmap_setup_scan( struct xrep_rmap *rr) { struct xfs_scrub *sc = rr->sc; - char *descr; int error; - /* Set up some storage */ - descr = xchk_xfile_ag_descr(sc, "reverse mapping records"); - error = xfarray_create(descr, 0, sizeof(struct xrep_rmap_extent), - &rr->rmap_records); - kfree(descr); + /* Set up in-memory rmap btree */ + error = xfs_rmapbt_mem_init(sc->mp, &rr->rmap_btree, sc->xmbtp, + sc->sa.pag->pag_agno); if (error) return error; @@ -1424,7 +1520,7 @@ xrep_rmap_teardown( struct xrep_rmap *rr) { xchk_iscan_teardown(&rr->iscan); - xfarray_destroy(rr->rmap_records); + xfbtree_destroy(&rr->rmap_btree); } /* Repair the rmap btree for some AG. 
*/ diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c index 90a77cd3ebad..5c6773628d69 100644 --- a/fs/xfs/xfs_stats.c +++ b/fs/xfs/xfs_stats.c @@ -50,7 +50,8 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf) { "ibt2", xfsstats_offset(xs_fibt_2) }, { "fibt2", xfsstats_offset(xs_rmap_2) }, { "rmapbt", xfsstats_offset(xs_refcbt_2) }, - { "refcntbt", xfsstats_offset(xs_qm_dqreclaims)}, + { "refcntbt", xfsstats_offset(xs_rmap_mem_2) }, + { "rmapbt_mem", xfsstats_offset(xs_qm_dqreclaims)}, /* we print both series of quota information together */ { "qm", xfsstats_offset(xs_xstrat_bytes)}, }; diff --git a/fs/xfs/xfs_stats.h b/fs/xfs/xfs_stats.h index 43ffba74f045..3b50419d8bb9 100644 --- a/fs/xfs/xfs_stats.h +++ b/fs/xfs/xfs_stats.h @@ -125,6 +125,7 @@ struct __xfsstats { uint32_t xs_fibt_2[__XBTS_MAX]; uint32_t xs_rmap_2[__XBTS_MAX]; uint32_t xs_refcbt_2[__XBTS_MAX]; + uint32_t xs_rmap_mem_2[__XBTS_MAX]; uint32_t xs_qm_dqreclaims; uint32_t xs_qm_dqreclaim_misses; uint32_t xs_qm_dquot_dups; -- cgit v1.2.3 From 18a1e644b094a12e5417c386b850aaa85bdca51f Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 22 Feb 2024 12:43:40 -0800 Subject: xfs: define an in-memory btree for storing refcount bag info during repairs Create a new in-memory btree type so that we can store refcount bag info in a much more memory-efficient and performant format. Recall that the refcount recordset regenerator computes the new recordset from browsing the rmap records. Let's say that the rmap records are: {agbno: 10, length: 40, ...} {agbno: 11, length: 3, ...} {agbno: 12, length: 20, ...} {agbno: 15, length: 1, ...} It is convenient to have a data structure that could quickly tell us the refcount for an arbitrary agbno without wasting memory. An array or a list could do that pretty easily. Lists suck because of the pointer overhead. xfarrays are a lot more compact, but we want to minimize sparse holes in the xfarray to constrain memory usage. Maintaining any kind of record order isn't needed for correctness, so I created the "rcbag", which is shorthand for an unordered list of (excerpted) reverse mappings. So we add the first rmap to the rcbag, and it looks like: 0: {agbno: 10, length: 40} The refcount for agbno 10 is 1. Then we move on to block 11, so we add the second rmap: 0: {agbno: 10, length: 40} 1: {agbno: 11, length: 3} The refcount for agbno 11 is 2. We move on to block 12, so we add the third: 0: {agbno: 10, length: 40} 1: {agbno: 11, length: 3} 2: {agbno: 12, length: 20} The refcount for agbno 12 and 13 is 3. We move on to block 14, and remove the second rmap: 0: {agbno: 10, length: 40} 1: NULL 2: {agbno: 12, length: 20} The refcount for agbno 14 is 2. We move on to block 15, and add the last rmap. But we don't care where it is and we don't want to expand the array, so we put it in slot 1: 0: {agbno: 10, length: 40} 1: {agbno: 15, length: 1} 2: {agbno: 12, length: 20} The refcount for agbno 15 is 3. Notice how order doesn't matter in this list? That's why repair uses an unordered list, or "bag". The data structure is not a set because it does not guarantee uniqueness. That said, adding and removing specific items is now an O(n) operation because we have no idea where that item might be in the list. Overall, the runtime is O(n^2), which is bad. I realized that I could easily refactor the btree code and reimplement the refcount bag with an xfbtree. Adding and removing is now O(log2 n), so the runtime is at least O(n log2 n), which is much faster. 
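To make the bag semantics and the complexity claims above concrete, here is a minimal userspace C sketch of the naive unordered-list variant. This is illustrative only; the struct, the fixed-size slot array, and the function names are invented for this example and are not the kernel implementation.

#include <stdint.h>
#include <stdio.h>

/* One slot in the naive, unordered refcount bag (hypothetical example type). */
struct bag_item {
	uint32_t	startblock;	/* agbno where the mapping starts */
	uint32_t	blockcount;	/* length of the mapping */
	int		used;		/* is this slot occupied? */
};

#define BAG_SLOTS	16		/* arbitrary size for the example */
static struct bag_item bag[BAG_SLOTS];

/* O(n): stash a mapping in any free slot; order does not matter. */
static int bag_add(uint32_t start, uint32_t len)
{
	for (int i = 0; i < BAG_SLOTS; i++) {
		if (!bag[i].used) {
			bag[i] = (struct bag_item){ start, len, 1 };
			return 0;
		}
	}
	return -1;			/* bag is full */
}

/* O(n): the refcount at agbno is the number of items covering it. */
static unsigned int bag_refcount(uint32_t agbno)
{
	unsigned int refcount = 0;

	for (int i = 0; i < BAG_SLOTS; i++)
		if (bag[i].used && agbno >= bag[i].startblock &&
		    agbno - bag[i].startblock < bag[i].blockcount)
			refcount++;
	return refcount;
}

/* O(n): drop every mapping that ends at agbno, as in the walkthrough. */
static void bag_remove_ending_at(uint32_t agbno)
{
	for (int i = 0; i < BAG_SLOTS; i++)
		if (bag[i].used &&
		    bag[i].startblock + bag[i].blockcount == agbno)
			bag[i].used = 0;
}

int main(void)
{
	/* Replay the rmap records from the walkthrough above. */
	bag_add(10, 40);
	bag_add(11, 3);
	bag_add(12, 20);
	printf("refcount at agbno 12: %u\n", bag_refcount(12));	/* prints 3 */

	bag_remove_ending_at(14);	/* drops {agbno: 11, length: 3} */
	bag_add(15, 1);			/* reuses the freed slot */
	printf("refcount at agbno 15: %u\n", bag_refcount(15));	/* prints 3 */
	return 0;
}

Every operation in the sketch is a linear scan over the slots; replacing those scans with btree lookups is what brings each operation down to O(log2 n).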
In the end, the rcbag becomes a sorted list, but that's merely a detail of the implementation. The repair code doesn't care. (Note: That horrible xfs_db bmap_inflate command can be used to exercise this sort of rcbag insanity by cranking up refcounts quickly.) Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/Makefile | 1 + fs/xfs/scrub/rcbag_btree.c | 312 +++++++++++++++++++++++++++++++++++++++++++++ fs/xfs/scrub/rcbag_btree.h | 74 +++++++++++ fs/xfs/xfs_stats.c | 3 +- fs/xfs/xfs_stats.h | 1 + 5 files changed, 390 insertions(+), 1 deletion(-) create mode 100644 fs/xfs/scrub/rcbag_btree.c create mode 100644 fs/xfs/scrub/rcbag_btree.h diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 6de02b2573c3..cca169bd9617 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -198,6 +198,7 @@ xfs-y += $(addprefix scrub/, \ inode_repair.o \ newbt.o \ nlinks_repair.o \ + rcbag_btree.o \ reap.o \ refcount_repair.o \ repair.o \ diff --git a/fs/xfs/scrub/rcbag_btree.c b/fs/xfs/scrub/rcbag_btree.c new file mode 100644 index 000000000000..762960e012a0 --- /dev/null +++ b/fs/xfs/scrub/rcbag_btree.c @@ -0,0 +1,312 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2022-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_buf_mem.h" +#include "xfs_btree_mem.h" +#include "xfs_error.h" +#include "scrub/rcbag_btree.h" +#include "scrub/trace.h" + +static struct kmem_cache *rcbagbt_cur_cache; + +STATIC void +rcbagbt_init_key_from_rec( + union xfs_btree_key *key, + const union xfs_btree_rec *rec) +{ + struct rcbag_key *bag_key = (struct rcbag_key *)key; + const struct rcbag_rec *bag_rec = (const struct rcbag_rec *)rec; + + BUILD_BUG_ON(sizeof(struct rcbag_key) > sizeof(union xfs_btree_key)); + BUILD_BUG_ON(sizeof(struct rcbag_rec) > sizeof(union xfs_btree_rec)); + + bag_key->rbg_startblock = bag_rec->rbg_startblock; + bag_key->rbg_blockcount = bag_rec->rbg_blockcount; +} + +STATIC void +rcbagbt_init_rec_from_cur( + struct xfs_btree_cur *cur, + union xfs_btree_rec *rec) +{ + struct rcbag_rec *bag_rec = (struct rcbag_rec *)rec; + struct rcbag_rec *bag_irec = (struct rcbag_rec *)&cur->bc_rec; + + bag_rec->rbg_startblock = bag_irec->rbg_startblock; + bag_rec->rbg_blockcount = bag_irec->rbg_blockcount; + bag_rec->rbg_refcount = bag_irec->rbg_refcount; +} + +STATIC int64_t +rcbagbt_key_diff( + struct xfs_btree_cur *cur, + const union xfs_btree_key *key) +{ + struct rcbag_rec *rec = (struct rcbag_rec *)&cur->bc_rec; + const struct rcbag_key *kp = (const struct rcbag_key *)key; + + if (kp->rbg_startblock > rec->rbg_startblock) + return 1; + if (kp->rbg_startblock < rec->rbg_startblock) + return -1; + + if (kp->rbg_blockcount > rec->rbg_blockcount) + return 1; + if (kp->rbg_blockcount < rec->rbg_blockcount) + return -1; + + return 0; +} + +STATIC int64_t +rcbagbt_diff_two_keys( + struct xfs_btree_cur *cur, + const union xfs_btree_key *k1, + const union xfs_btree_key *k2, + const union xfs_btree_key *mask) +{ + const struct rcbag_key *kp1 = (const struct rcbag_key *)k1; + const struct rcbag_key *kp2 = (const struct rcbag_key *)k2; + + ASSERT(mask == NULL); + + if (kp1->rbg_startblock > kp2->rbg_startblock) + return 1; + if (kp1->rbg_startblock < kp2->rbg_startblock) + return -1; + + if (kp1->rbg_blockcount > kp2->rbg_blockcount) + return 1; + if (kp1->rbg_blockcount < 
kp2->rbg_blockcount) + return -1; + + return 0; +} + +STATIC int +rcbagbt_keys_inorder( + struct xfs_btree_cur *cur, + const union xfs_btree_key *k1, + const union xfs_btree_key *k2) +{ + const struct rcbag_key *kp1 = (const struct rcbag_key *)k1; + const struct rcbag_key *kp2 = (const struct rcbag_key *)k2; + + if (kp1->rbg_startblock > kp2->rbg_startblock) + return 0; + if (kp1->rbg_startblock < kp2->rbg_startblock) + return 1; + + if (kp1->rbg_blockcount > kp2->rbg_blockcount) + return 0; + if (kp1->rbg_blockcount < kp2->rbg_blockcount) + return 1; + + return 0; +} + +STATIC int +rcbagbt_recs_inorder( + struct xfs_btree_cur *cur, + const union xfs_btree_rec *r1, + const union xfs_btree_rec *r2) +{ + const struct rcbag_rec *rp1 = (const struct rcbag_rec *)r1; + const struct rcbag_rec *rp2 = (const struct rcbag_rec *)r2; + + if (rp1->rbg_startblock > rp2->rbg_startblock) + return 0; + if (rp1->rbg_startblock < rp2->rbg_startblock) + return 1; + + if (rp1->rbg_blockcount > rp2->rbg_blockcount) + return 0; + if (rp1->rbg_blockcount < rp2->rbg_blockcount) + return 1; + + return 0; +} + +static xfs_failaddr_t +rcbagbt_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_mount; + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + xfs_failaddr_t fa; + unsigned int level; + unsigned int maxrecs; + + if (!xfs_verify_magic(bp, block->bb_magic)) + return __this_address; + + fa = xfs_btree_fsblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN); + if (fa) + return fa; + + level = be16_to_cpu(block->bb_level); + if (level >= rcbagbt_maxlevels_possible()) + return __this_address; + + maxrecs = rcbagbt_maxrecs(mp, XFBNO_BLOCKSIZE, level == 0); + return xfs_btree_memblock_verify(bp, maxrecs); +} + +static void +rcbagbt_rw_verify( + struct xfs_buf *bp) +{ + xfs_failaddr_t fa = rcbagbt_verify(bp); + + if (fa) + xfs_verifier_error(bp, -EFSCORRUPTED, fa); +} + +/* skip crc checks on in-memory btrees to save time */ +static const struct xfs_buf_ops rcbagbt_mem_buf_ops = { + .name = "rcbagbt_mem", + .magic = { 0, cpu_to_be32(RCBAG_MAGIC) }, + .verify_read = rcbagbt_rw_verify, + .verify_write = rcbagbt_rw_verify, + .verify_struct = rcbagbt_verify, +}; + +static const struct xfs_btree_ops rcbagbt_mem_ops = { + .name = "rcbag", + .type = XFS_BTREE_TYPE_MEM, + + .rec_len = sizeof(struct rcbag_rec), + .key_len = sizeof(struct rcbag_key), + .ptr_len = XFS_BTREE_LONG_PTR_LEN, + + .lru_refs = 1, + .statoff = XFS_STATS_CALC_INDEX(xs_rcbag_2), + + .dup_cursor = xfbtree_dup_cursor, + .set_root = xfbtree_set_root, + .alloc_block = xfbtree_alloc_block, + .free_block = xfbtree_free_block, + .get_minrecs = xfbtree_get_minrecs, + .get_maxrecs = xfbtree_get_maxrecs, + .init_key_from_rec = rcbagbt_init_key_from_rec, + .init_rec_from_cur = rcbagbt_init_rec_from_cur, + .init_ptr_from_cur = xfbtree_init_ptr_from_cur, + .key_diff = rcbagbt_key_diff, + .buf_ops = &rcbagbt_mem_buf_ops, + .diff_two_keys = rcbagbt_diff_two_keys, + .keys_inorder = rcbagbt_keys_inorder, + .recs_inorder = rcbagbt_recs_inorder, +}; + +/* Create a cursor for an in-memory btree. */ +struct xfs_btree_cur * +rcbagbt_mem_cursor( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfbtree *xfbtree) +{ + struct xfs_btree_cur *cur; + + cur = xfs_btree_alloc_cursor(mp, tp, &rcbagbt_mem_ops, + rcbagbt_maxlevels_possible(), rcbagbt_cur_cache); + + cur->bc_mem.xfbtree = xfbtree; + cur->bc_nlevels = xfbtree->nlevels; + return cur; +} + +/* Create an in-memory refcount bag btree. 
*/ +int +rcbagbt_mem_init( + struct xfs_mount *mp, + struct xfbtree *xfbt, + struct xfs_buftarg *btp) +{ + xfbt->owner = 0; + return xfbtree_init(mp, xfbt, btp, &rcbagbt_mem_ops); +} + +/* Calculate number of records in a refcount bag btree block. */ +static inline unsigned int +rcbagbt_block_maxrecs( + unsigned int blocklen, + bool leaf) +{ + if (leaf) + return blocklen / sizeof(struct rcbag_rec); + return blocklen / + (sizeof(struct rcbag_key) + sizeof(rcbag_ptr_t)); +} + +/* + * Calculate number of records in a refcount bag btree block. + */ +unsigned int +rcbagbt_maxrecs( + struct xfs_mount *mp, + unsigned int blocklen, + bool leaf) +{ + blocklen -= RCBAG_BLOCK_LEN; + return rcbagbt_block_maxrecs(blocklen, leaf); +} + +/* Compute the max possible height for refcount bag btrees. */ +unsigned int +rcbagbt_maxlevels_possible(void) +{ + unsigned int minrecs[2]; + unsigned int blocklen; + + blocklen = XFBNO_BLOCKSIZE - XFS_BTREE_LBLOCK_CRC_LEN; + + minrecs[0] = rcbagbt_block_maxrecs(blocklen, true) / 2; + minrecs[1] = rcbagbt_block_maxrecs(blocklen, false) / 2; + + return xfs_btree_space_to_height(minrecs, ULLONG_MAX); +} + +/* Calculate the refcount bag btree size for some records. */ +unsigned long long +rcbagbt_calc_size( + unsigned long long nr_records) +{ + unsigned int minrecs[2]; + unsigned int blocklen; + + blocklen = XFBNO_BLOCKSIZE - XFS_BTREE_LBLOCK_CRC_LEN; + + minrecs[0] = rcbagbt_block_maxrecs(blocklen, true) / 2; + minrecs[1] = rcbagbt_block_maxrecs(blocklen, false) / 2; + + return xfs_btree_calc_size(minrecs, nr_records); +} + +int __init +rcbagbt_init_cur_cache(void) +{ + rcbagbt_cur_cache = kmem_cache_create("xfs_rcbagbt_cur", + xfs_btree_cur_sizeof(rcbagbt_maxlevels_possible()), + 0, 0, NULL); + + if (!rcbagbt_cur_cache) + return -ENOMEM; + return 0; +} + +void +rcbagbt_destroy_cur_cache(void) +{ + kmem_cache_destroy(rcbagbt_cur_cache); + rcbagbt_cur_cache = NULL; +} diff --git a/fs/xfs/scrub/rcbag_btree.h b/fs/xfs/scrub/rcbag_btree.h new file mode 100644 index 000000000000..9202f7199b68 --- /dev/null +++ b/fs/xfs/scrub/rcbag_btree.h @@ -0,0 +1,74 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2022-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong + */ +#ifndef __XFS_SCRUB_RCBAG_BTREE_H__ +#define __XFS_SCRUB_RCBAG_BTREE_H__ + +#ifdef CONFIG_XFS_BTREE_IN_MEM + +struct xfs_buf; +struct xfs_btree_cur; +struct xfs_mount; + +#define RCBAG_MAGIC 0x74826671 /* 'JRBG' */ + +struct rcbag_key { + uint32_t rbg_startblock; + uint32_t rbg_blockcount; +}; + +struct rcbag_rec { + uint32_t rbg_startblock; + uint32_t rbg_blockcount; + uint64_t rbg_refcount; +}; + +typedef __be64 rcbag_ptr_t; + +/* reflinks only exist on crc enabled filesystems */ +#define RCBAG_BLOCK_LEN XFS_BTREE_LBLOCK_CRC_LEN + +/* + * Record, key, and pointer address macros for btree blocks. 
+ * + * (note that some of these may appear unused, but they are used in userspace) + */ +#define RCBAG_REC_ADDR(block, index) \ + ((struct rcbag_rec *) \ + ((char *)(block) + RCBAG_BLOCK_LEN + \ + (((index) - 1) * sizeof(struct rcbag_rec)))) + +#define RCBAG_KEY_ADDR(block, index) \ + ((struct rcbag_key *) \ + ((char *)(block) + RCBAG_BLOCK_LEN + \ + ((index) - 1) * sizeof(struct rcbag_key))) + +#define RCBAG_PTR_ADDR(block, index, maxrecs) \ + ((rcbag_ptr_t *) \ + ((char *)(block) + RCBAG_BLOCK_LEN + \ + (maxrecs) * sizeof(struct rcbag_key) + \ + ((index) - 1) * sizeof(rcbag_ptr_t))) + +unsigned int rcbagbt_maxrecs(struct xfs_mount *mp, unsigned int blocklen, + bool leaf); + +unsigned long long rcbagbt_calc_size(unsigned long long nr_records); + +unsigned int rcbagbt_maxlevels_possible(void); + +int __init rcbagbt_init_cur_cache(void); +void rcbagbt_destroy_cur_cache(void); + +struct xfs_btree_cur *rcbagbt_mem_cursor(struct xfs_mount *mp, + struct xfs_trans *tp, struct xfbtree *xfbtree); +int rcbagbt_mem_init(struct xfs_mount *mp, struct xfbtree *xfbtree, + struct xfs_buftarg *btp); + +#else +# define rcbagbt_init_cur_cache() 0 +# define rcbagbt_destroy_cur_cache() ((void)0) +#endif /* CONFIG_XFS_BTREE_IN_MEM */ + +#endif /* __XFS_SCRUB_RCBAG_BTREE_H__ */ diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c index 5c6773628d69..ed97d72caa66 100644 --- a/fs/xfs/xfs_stats.c +++ b/fs/xfs/xfs_stats.c @@ -51,7 +51,8 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf) { "fibt2", xfsstats_offset(xs_rmap_2) }, { "rmapbt", xfsstats_offset(xs_refcbt_2) }, { "refcntbt", xfsstats_offset(xs_rmap_mem_2) }, - { "rmapbt_mem", xfsstats_offset(xs_qm_dqreclaims)}, + { "rmapbt_mem", xfsstats_offset(xs_rcbag_2) }, + { "rcbagbt", xfsstats_offset(xs_qm_dqreclaims)}, /* we print both series of quota information together */ { "qm", xfsstats_offset(xs_xstrat_bytes)}, }; diff --git a/fs/xfs/xfs_stats.h b/fs/xfs/xfs_stats.h index 3b50419d8bb9..a61fb56ed2e6 100644 --- a/fs/xfs/xfs_stats.h +++ b/fs/xfs/xfs_stats.h @@ -126,6 +126,7 @@ struct __xfsstats { uint32_t xs_rmap_2[__XBTS_MAX]; uint32_t xs_refcbt_2[__XBTS_MAX]; uint32_t xs_rmap_mem_2[__XBTS_MAX]; + uint32_t xs_rcbag_2[__XBTS_MAX]; uint32_t xs_qm_dqreclaims; uint32_t xs_qm_dqreclaim_misses; uint32_t xs_qm_dquot_dups; -- cgit v1.2.3 From 7e1b84b24d257700e417bc9cd724c1efdff653d7 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 22 Feb 2024 12:43:40 -0800 Subject: xfs: hook live rmap operations during a repair operation Hook the regular rmap code when an rmapbt repair operation is running so that we can unlock the AGF buffer to scan the filesystem and keep the in-memory btree up to date during the scan. Signed-off-by: Darrick J. 
Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_ag.c | 1 + fs/xfs/libxfs/xfs_ag.h | 3 + fs/xfs/libxfs/xfs_rmap.c | 154 +++++++++++++++++++++++++++++++++++---------- fs/xfs/libxfs/xfs_rmap.h | 29 +++++++++ fs/xfs/scrub/common.c | 3 + fs/xfs/scrub/repair.c | 36 +++++++++++ fs/xfs/scrub/repair.h | 4 ++ fs/xfs/scrub/rmap_repair.c | 145 ++++++++++++++++++++++++++++++++++++++++-- fs/xfs/scrub/scrub.c | 4 ++ fs/xfs/scrub/scrub.h | 4 +- fs/xfs/scrub/trace.c | 1 + fs/xfs/scrub/trace.h | 47 ++++++++++++++ 12 files changed, 392 insertions(+), 39 deletions(-) diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index 32d80a76440c..d728709054b2 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -417,6 +417,7 @@ xfs_initialize_perag( init_waitqueue_head(&pag->pag_active_wq); pag->pagb_count = 0; pag->pagb_tree = RB_ROOT; + xfs_hooks_init(&pag->pag_rmap_update_hooks); #endif /* __KERNEL__ */ error = xfs_buf_cache_init(&pag->pag_bcache); diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h index e019b79dbbe3..35de09a2516c 100644 --- a/fs/xfs/libxfs/xfs_ag.h +++ b/fs/xfs/libxfs/xfs_ag.h @@ -120,6 +120,9 @@ struct xfs_perag { * inconsistencies. */ struct xfs_defer_drain pag_intents_drain; + + /* Hook to feed rmapbt updates to an active online repair. */ + struct xfs_hooks pag_rmap_update_hooks; #endif /* __KERNEL__ */ }; diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index d9f1b336ec33..ef16f6f9cef6 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -821,6 +821,86 @@ out_error: return error; } +#ifdef CONFIG_XFS_LIVE_HOOKS +/* + * Use a static key here to reduce the overhead of rmapbt live updates. If + * the compiler supports jump labels, the static branch will be replaced by a + * nop sled when there are no hook users. Online fsck is currently the only + * caller, so this is a reasonable tradeoff. + * + * Note: Patching the kernel code requires taking the cpu hotplug lock. Other + * parts of the kernel allocate memory with that lock held, which means that + * XFS callers cannot hold any locks that might be used by memory reclaim or + * writeback when calling the static_branch_{inc,dec} functions. + */ +DEFINE_STATIC_XFS_HOOK_SWITCH(xfs_rmap_hooks_switch); + +void +xfs_rmap_hook_disable(void) +{ + xfs_hooks_switch_off(&xfs_rmap_hooks_switch); +} + +void +xfs_rmap_hook_enable(void) +{ + xfs_hooks_switch_on(&xfs_rmap_hooks_switch); +} + +/* Call downstream hooks for a reverse mapping update. */ +static inline void +xfs_rmap_update_hook( + struct xfs_trans *tp, + struct xfs_perag *pag, + enum xfs_rmap_intent_type op, + xfs_agblock_t startblock, + xfs_extlen_t blockcount, + bool unwritten, + const struct xfs_owner_info *oinfo) +{ + if (xfs_hooks_switched_on(&xfs_rmap_hooks_switch)) { + struct xfs_rmap_update_params p = { + .startblock = startblock, + .blockcount = blockcount, + .unwritten = unwritten, + .oinfo = *oinfo, /* struct copy */ + }; + + if (pag) + xfs_hooks_call(&pag->pag_rmap_update_hooks, op, &p); + } +} + +/* Call the specified function during a reverse mapping update. */ +int +xfs_rmap_hook_add( + struct xfs_perag *pag, + struct xfs_rmap_hook *hook) +{ + return xfs_hooks_add(&pag->pag_rmap_update_hooks, &hook->rmap_hook); +} + +/* Stop calling the specified function during a reverse mapping update. */ +void +xfs_rmap_hook_del( + struct xfs_perag *pag, + struct xfs_rmap_hook *hook) +{ + xfs_hooks_del(&pag->pag_rmap_update_hooks, &hook->rmap_hook); +} + +/* Configure rmap update hook functions. 
*/ +void +xfs_rmap_hook_setup( + struct xfs_rmap_hook *hook, + notifier_fn_t mod_fn) +{ + xfs_hook_setup(&hook->rmap_hook, mod_fn); +} +#else +# define xfs_rmap_update_hook(t, p, o, s, b, u, oi) do { } while (0) +#endif /* CONFIG_XFS_LIVE_HOOKS */ + /* * Remove a reference to an extent in the rmap btree. */ @@ -841,7 +921,7 @@ xfs_rmap_free( return 0; cur = xfs_rmapbt_init_cursor(mp, tp, agbp, pag); - + xfs_rmap_update_hook(tp, pag, XFS_RMAP_UNMAP, bno, len, false, oinfo); error = xfs_rmap_unmap(cur, bno, len, false, oinfo); xfs_btree_del_cursor(cur, error); @@ -1093,6 +1173,7 @@ xfs_rmap_alloc( return 0; cur = xfs_rmapbt_init_cursor(mp, tp, agbp, pag); + xfs_rmap_update_hook(tp, pag, XFS_RMAP_MAP, bno, len, false, oinfo); error = xfs_rmap_map(cur, bno, len, false, oinfo); xfs_btree_del_cursor(cur, error); @@ -2508,6 +2589,38 @@ xfs_rmap_finish_one_cleanup( xfs_trans_brelse(tp, agbp); } +/* Commit an rmap operation into the ondisk tree. */ +int +__xfs_rmap_finish_intent( + struct xfs_btree_cur *rcur, + enum xfs_rmap_intent_type op, + xfs_agblock_t bno, + xfs_extlen_t len, + const struct xfs_owner_info *oinfo, + bool unwritten) +{ + switch (op) { + case XFS_RMAP_ALLOC: + case XFS_RMAP_MAP: + return xfs_rmap_map(rcur, bno, len, unwritten, oinfo); + case XFS_RMAP_MAP_SHARED: + return xfs_rmap_map_shared(rcur, bno, len, unwritten, oinfo); + case XFS_RMAP_FREE: + case XFS_RMAP_UNMAP: + return xfs_rmap_unmap(rcur, bno, len, unwritten, oinfo); + case XFS_RMAP_UNMAP_SHARED: + return xfs_rmap_unmap_shared(rcur, bno, len, unwritten, oinfo); + case XFS_RMAP_CONVERT: + return xfs_rmap_convert(rcur, bno, len, !unwritten, oinfo); + case XFS_RMAP_CONVERT_SHARED: + return xfs_rmap_convert_shared(rcur, bno, len, !unwritten, + oinfo); + default: + ASSERT(0); + return -EFSCORRUPTED; + } +} + /* * Process one of the deferred rmap operations. We pass back the * btree cursor to maintain our lock on the rmapbt between calls. 
@@ -2574,39 +2687,14 @@ xfs_rmap_finish_one( unwritten = ri->ri_bmap.br_state == XFS_EXT_UNWRITTEN; bno = XFS_FSB_TO_AGBNO(rcur->bc_mp, ri->ri_bmap.br_startblock); - switch (ri->ri_type) { - case XFS_RMAP_ALLOC: - case XFS_RMAP_MAP: - error = xfs_rmap_map(rcur, bno, ri->ri_bmap.br_blockcount, - unwritten, &oinfo); - break; - case XFS_RMAP_MAP_SHARED: - error = xfs_rmap_map_shared(rcur, bno, - ri->ri_bmap.br_blockcount, unwritten, &oinfo); - break; - case XFS_RMAP_FREE: - case XFS_RMAP_UNMAP: - error = xfs_rmap_unmap(rcur, bno, ri->ri_bmap.br_blockcount, - unwritten, &oinfo); - break; - case XFS_RMAP_UNMAP_SHARED: - error = xfs_rmap_unmap_shared(rcur, bno, - ri->ri_bmap.br_blockcount, unwritten, &oinfo); - break; - case XFS_RMAP_CONVERT: - error = xfs_rmap_convert(rcur, bno, ri->ri_bmap.br_blockcount, - !unwritten, &oinfo); - break; - case XFS_RMAP_CONVERT_SHARED: - error = xfs_rmap_convert_shared(rcur, bno, - ri->ri_bmap.br_blockcount, !unwritten, &oinfo); - break; - default: - ASSERT(0); - error = -EFSCORRUPTED; - } + error = __xfs_rmap_finish_intent(rcur, ri->ri_type, bno, + ri->ri_bmap.br_blockcount, &oinfo, unwritten); + if (error) + return error; - return error; + xfs_rmap_update_hook(tp, ri->ri_pag, ri->ri_type, bno, + ri->ri_bmap.br_blockcount, unwritten, &oinfo); + return 0; } /* diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h index 58c67896d12c..9d01fe689497 100644 --- a/fs/xfs/libxfs/xfs_rmap.h +++ b/fs/xfs/libxfs/xfs_rmap.h @@ -186,6 +186,10 @@ void xfs_rmap_finish_one_cleanup(struct xfs_trans *tp, struct xfs_btree_cur *rcur, int error); int xfs_rmap_finish_one(struct xfs_trans *tp, struct xfs_rmap_intent *ri, struct xfs_btree_cur **pcur); +int __xfs_rmap_finish_intent(struct xfs_btree_cur *rcur, + enum xfs_rmap_intent_type op, xfs_agblock_t bno, + xfs_extlen_t len, const struct xfs_owner_info *oinfo, + bool unwritten); int xfs_rmap_lookup_le_range(struct xfs_btree_cur *cur, xfs_agblock_t bno, uint64_t owner, uint64_t offset, unsigned int flags, @@ -235,4 +239,29 @@ extern struct kmem_cache *xfs_rmap_intent_cache; int __init xfs_rmap_intent_init_cache(void); void xfs_rmap_intent_destroy_cache(void); +/* + * Parameters for tracking reverse mapping changes. The hook function arg + * parameter is enum xfs_rmap_intent_type, and the rest is below. 
+ */ +struct xfs_rmap_update_params { + xfs_agblock_t startblock; + xfs_extlen_t blockcount; + struct xfs_owner_info oinfo; + bool unwritten; +}; + +#ifdef CONFIG_XFS_LIVE_HOOKS + +struct xfs_rmap_hook { + struct xfs_hook rmap_hook; +}; + +void xfs_rmap_hook_disable(void); +void xfs_rmap_hook_enable(void); + +int xfs_rmap_hook_add(struct xfs_perag *pag, struct xfs_rmap_hook *hook); +void xfs_rmap_hook_del(struct xfs_perag *pag, struct xfs_rmap_hook *hook); +void xfs_rmap_hook_setup(struct xfs_rmap_hook *hook, notifier_fn_t mod_fn); +#endif + #endif /* __XFS_RMAP_H__ */ diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index dea883632b2d..abff79a77c72 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -1309,6 +1309,9 @@ xchk_fsgates_enable( if (scrub_fsgates & XCHK_FSGATES_DIRENTS) xfs_dir_hook_enable(); + if (scrub_fsgates & XCHK_FSGATES_RMAP) + xfs_rmap_hook_enable(); + sc->flags |= scrub_fsgates; } diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index 2645abddad78..f43dce771cdd 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -1165,3 +1165,39 @@ xrep_setup_xfbtree( return xmbuf_alloc(sc->mp, descr, &sc->xmbtp); } + +/* + * Create a dummy transaction for use in a live update hook function. This + * function MUST NOT be called from regular repair code because the current + * process' transaction is saved via the cookie. + */ +int +xrep_trans_alloc_hook_dummy( + struct xfs_mount *mp, + void **cookiep, + struct xfs_trans **tpp) +{ + int error; + + *cookiep = current->journal_info; + current->journal_info = NULL; + + error = xfs_trans_alloc_empty(mp, tpp); + if (!error) + return 0; + + current->journal_info = *cookiep; + *cookiep = NULL; + return error; +} + +/* Cancel a dummy transaction used by a live update hook function. */ +void +xrep_trans_cancel_hook_dummy( + void **cookiep, + struct xfs_trans *tp) +{ + xfs_trans_cancel(tp); + current->journal_info = *cookiep; + *cookiep = NULL; +} diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h index 059b51027365..dd1c89e8714c 100644 --- a/fs/xfs/scrub/repair.h +++ b/fs/xfs/scrub/repair.h @@ -140,6 +140,10 @@ int xrep_quotacheck(struct xfs_scrub *sc); int xrep_reinit_pagf(struct xfs_scrub *sc); int xrep_reinit_pagi(struct xfs_scrub *sc); +int xrep_trans_alloc_hook_dummy(struct xfs_mount *mp, void **cookiep, + struct xfs_trans **tpp); +void xrep_trans_cancel_hook_dummy(void **cookiep, struct xfs_trans *tp); + #else #define xrep_ino_dqattach(sc) (0) diff --git a/fs/xfs/scrub/rmap_repair.c b/fs/xfs/scrub/rmap_repair.c index 6efca28e461d..e8e07b683eab 100644 --- a/fs/xfs/scrub/rmap_repair.c +++ b/fs/xfs/scrub/rmap_repair.c @@ -128,6 +128,9 @@ struct xrep_rmap { /* new rmapbt information */ struct xrep_newbt new_btree; + /* lock for the xfbtree and xfile */ + struct mutex lock; + /* rmap records generated from primary metadata */ struct xfbtree rmap_btree; @@ -136,6 +139,9 @@ struct xrep_rmap { /* in-memory btree cursor for the xfs_btree_bload iteration */ struct xfs_btree_cur *mcur; + /* Hooks into rmap update code. 
*/ + struct xfs_rmap_hook rhook; + /* inode scan cursor */ struct xchk_iscan iscan; @@ -158,6 +164,8 @@ xrep_setup_ag_rmapbt( char *descr; int error; + xchk_fsgates_enable(sc, XCHK_FSGATES_RMAP); + descr = xchk_xfile_ag_descr(sc, "reverse mapping records"); error = xrep_setup_xfbtree(sc, descr); kfree(descr); @@ -220,18 +228,30 @@ xrep_rmap_stash( if (xchk_should_terminate(sc, &error)) return error; + if (xchk_iscan_aborted(&rr->iscan)) + return -EFSCORRUPTED; + trace_xrep_rmap_found(sc->mp, sc->sa.pag->pag_agno, &rmap); + mutex_lock(&rr->lock); mcur = xfs_rmapbt_mem_cursor(sc->sa.pag, sc->tp, &rr->rmap_btree); error = xfs_rmap_map_raw(mcur, &rmap); xfs_btree_del_cursor(mcur, error); if (error) goto out_cancel; - return xfbtree_trans_commit(&rr->rmap_btree, sc->tp); + error = xfbtree_trans_commit(&rr->rmap_btree, sc->tp); + if (error) + goto out_abort; + + mutex_unlock(&rr->lock); + return 0; out_cancel: xfbtree_trans_cancel(&rr->rmap_btree, sc->tp); +out_abort: + xchk_iscan_abort(&rr->iscan); + mutex_unlock(&rr->lock); return error; } @@ -914,6 +934,13 @@ end_agscan: if (error) return error; + /* + * If a hook failed to update the in-memory btree, we lack the data to + * continue the repair. + */ + if (xchk_iscan_aborted(&rr->iscan)) + return -EFSCORRUPTED; + /* * Now that we have everything locked again, we need to count the * number of rmap records stashed in the btree. This should reflect @@ -1495,6 +1522,91 @@ out_bitmap: return error; } +static inline bool +xrep_rmapbt_want_live_update( + struct xchk_iscan *iscan, + const struct xfs_owner_info *oi) +{ + if (xchk_iscan_aborted(iscan)) + return false; + + /* + * Before unlocking the AG header to perform the inode scan, we + * recorded reverse mappings for all AG metadata except for the OWN_AG + * metadata. IOWs, the in-memory btree knows about the AG headers, the + * two inode btrees, the CoW staging extents, and the refcount btrees. + * For these types of metadata, we need to record the live updates in + * the in-memory rmap btree. + * + * However, we do not scan the free space btrees or the AGFL until we + * have re-locked the AGF and are ready to reserve space for the new + * rmap btree, so we do not want live updates for OWN_AG metadata. + */ + if (XFS_RMAP_NON_INODE_OWNER(oi->oi_owner)) + return oi->oi_owner != XFS_RMAP_OWN_AG; + + /* Ignore updates to files that the scanner hasn't visited yet. */ + return xchk_iscan_want_live_update(iscan, oi->oi_owner); +} + +/* + * Apply a rmapbt update from the regular filesystem into our shadow btree. + * We're running from the thread that owns the AGF buffer and is generating + * the update, so we must be careful about which parts of the struct xrep_rmap + * that we change. 
+ */ +static int +xrep_rmapbt_live_update( + struct notifier_block *nb, + unsigned long action, + void *data) +{ + struct xfs_rmap_update_params *p = data; + struct xrep_rmap *rr; + struct xfs_mount *mp; + struct xfs_btree_cur *mcur; + struct xfs_trans *tp; + void *txcookie; + int error; + + rr = container_of(nb, struct xrep_rmap, rhook.rmap_hook.nb); + mp = rr->sc->mp; + + if (!xrep_rmapbt_want_live_update(&rr->iscan, &p->oinfo)) + goto out_unlock; + + trace_xrep_rmap_live_update(mp, rr->sc->sa.pag->pag_agno, action, p); + + error = xrep_trans_alloc_hook_dummy(mp, &txcookie, &tp); + if (error) + goto out_abort; + + mutex_lock(&rr->lock); + mcur = xfs_rmapbt_mem_cursor(rr->sc->sa.pag, tp, &rr->rmap_btree); + error = __xfs_rmap_finish_intent(mcur, action, p->startblock, + p->blockcount, &p->oinfo, p->unwritten); + xfs_btree_del_cursor(mcur, error); + if (error) + goto out_cancel; + + error = xfbtree_trans_commit(&rr->rmap_btree, tp); + if (error) + goto out_cancel; + + xrep_trans_cancel_hook_dummy(&txcookie, tp); + mutex_unlock(&rr->lock); + return NOTIFY_DONE; + +out_cancel: + xfbtree_trans_cancel(&rr->rmap_btree, tp); + xrep_trans_cancel_hook_dummy(&txcookie, tp); +out_abort: + mutex_unlock(&rr->lock); + xchk_iscan_abort(&rr->iscan); +out_unlock: + return NOTIFY_DONE; +} + /* Set up the filesystem scan components. */ STATIC int xrep_rmap_setup_scan( @@ -1503,15 +1615,36 @@ xrep_rmap_setup_scan( struct xfs_scrub *sc = rr->sc; int error; + mutex_init(&rr->lock); + /* Set up in-memory rmap btree */ error = xfs_rmapbt_mem_init(sc->mp, &rr->rmap_btree, sc->xmbtp, sc->sa.pag->pag_agno); if (error) - return error; + goto out_mutex; /* Retry iget every tenth of a second for up to 30 seconds. */ xchk_iscan_start(sc, 30000, 100, &rr->iscan); + + /* + * Hook into live rmap operations so that we can update our in-memory + * btree to reflect live changes on the filesystem. Since we drop the + * AGF buffer to scan all the inodes, we need this piece to avoid + * installing a stale btree. + */ + ASSERT(sc->flags & XCHK_FSGATES_RMAP); + xfs_rmap_hook_setup(&rr->rhook, xrep_rmapbt_live_update); + error = xfs_rmap_hook_add(sc->sa.pag, &rr->rhook); + if (error) + goto out_iscan; return 0; + +out_iscan: + xchk_iscan_teardown(&rr->iscan); + xfbtree_destroy(&rr->rmap_btree); +out_mutex: + mutex_destroy(&rr->lock); + return error; } /* Tear down scan components. */ @@ -1519,8 +1652,13 @@ STATIC void xrep_rmap_teardown( struct xrep_rmap *rr) { + struct xfs_scrub *sc = rr->sc; + + xchk_iscan_abort(&rr->iscan); + xfs_rmap_hook_del(sc->sa.pag, &rr->rhook); xchk_iscan_teardown(&rr->iscan); xfbtree_destroy(&rr->rmap_btree); + mutex_destroy(&rr->lock); } /* Repair the rmap btree for some AG. */ @@ -1531,9 +1669,6 @@ xrep_rmapbt( struct xrep_rmap *rr = sc->buf; int error; - /* Functionality is not yet complete. 
*/ - return xrep_notsupported(sc); - error = xrep_rmap_setup_scan(rr); if (error) return error; diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index 2a68b8f483bb..20fac9723c08 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -16,6 +16,7 @@ #include "xfs_qm.h" #include "xfs_scrub.h" #include "xfs_buf_mem.h" +#include "xfs_rmap.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" @@ -164,6 +165,9 @@ xchk_fsgates_disable( if (sc->flags & XCHK_FSGATES_DIRENTS) xfs_dir_hook_disable(); + if (sc->flags & XCHK_FSGATES_RMAP) + xfs_rmap_hook_disable(); + sc->flags &= ~XCHK_FSGATES_ALL; } diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h index 1247284c17a0..9ad65b604fe1 100644 --- a/fs/xfs/scrub/scrub.h +++ b/fs/xfs/scrub/scrub.h @@ -126,6 +126,7 @@ struct xfs_scrub { #define XCHK_NEED_DRAIN (1U << 3) /* scrub needs to drain defer ops */ #define XCHK_FSGATES_QUOTA (1U << 4) /* quota live update enabled */ #define XCHK_FSGATES_DIRENTS (1U << 5) /* directory live update enabled */ +#define XCHK_FSGATES_RMAP (1U << 6) /* rmapbt live update enabled */ #define XREP_RESET_PERAG_RESV (1U << 30) /* must reset AG space reservation */ #define XREP_ALREADY_FIXED (1U << 31) /* checking our repair work */ @@ -137,7 +138,8 @@ struct xfs_scrub { */ #define XCHK_FSGATES_ALL (XCHK_FSGATES_DRAIN | \ XCHK_FSGATES_QUOTA | \ - XCHK_FSGATES_DIRENTS) + XCHK_FSGATES_DIRENTS | \ + XCHK_FSGATES_RMAP) /* Metadata scrubbers */ int xchk_tester(struct xfs_scrub *sc); diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c index dc8a331f4b02..3dd281d6d185 100644 --- a/fs/xfs/scrub/trace.c +++ b/fs/xfs/scrub/trace.c @@ -18,6 +18,7 @@ #include "xfs_quota_defs.h" #include "xfs_da_format.h" #include "xfs_dir2.h" +#include "xfs_rmap.h" #include "scrub/scrub.h" #include "scrub/xfile.h" #include "scrub/xfarray.h" diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index 1c97c2ff835e..5b294be52c55 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -25,6 +25,7 @@ struct xchk_dqiter; struct xchk_iscan; struct xchk_nlink; struct xchk_fscounters; +struct xfs_rmap_update_params; /* * ftrace's __print_symbolic requires that all enum values be wrapped in the @@ -112,9 +113,19 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_HEALTHY); { XCHK_NEED_DRAIN, "need_drain" }, \ { XCHK_FSGATES_QUOTA, "fsgates_quota" }, \ { XCHK_FSGATES_DIRENTS, "fsgates_dirents" }, \ + { XCHK_FSGATES_RMAP, "fsgates_rmap" }, \ { XREP_RESET_PERAG_RESV, "reset_perag_resv" }, \ { XREP_ALREADY_FIXED, "already_fixed" } +TRACE_DEFINE_ENUM(XFS_RMAP_MAP); +TRACE_DEFINE_ENUM(XFS_RMAP_MAP_SHARED); +TRACE_DEFINE_ENUM(XFS_RMAP_UNMAP); +TRACE_DEFINE_ENUM(XFS_RMAP_UNMAP_SHARED); +TRACE_DEFINE_ENUM(XFS_RMAP_CONVERT); +TRACE_DEFINE_ENUM(XFS_RMAP_CONVERT_SHARED); +TRACE_DEFINE_ENUM(XFS_RMAP_ALLOC); +TRACE_DEFINE_ENUM(XFS_RMAP_FREE); + DECLARE_EVENT_CLASS(xchk_class, TP_PROTO(struct xfs_inode *ip, struct xfs_scrub_metadata *sm, int error), @@ -2226,6 +2237,42 @@ DEFINE_XREP_DQUOT_EVENT(xrep_quotacheck_dquot); DEFINE_SCRUB_NLINKS_DIFF_EVENT(xrep_nlinks_update_inode); DEFINE_SCRUB_NLINKS_DIFF_EVENT(xrep_nlinks_unfixable_inode); +TRACE_EVENT(xrep_rmap_live_update, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, unsigned int op, + const struct xfs_rmap_update_params *p), + TP_ARGS(mp, agno, op, p), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(unsigned int, op) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, len) + __field(uint64_t, owner) + __field(uint64_t, offset) + 
__field(unsigned int, flags) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->op = op; + __entry->agbno = p->startblock; + __entry->len = p->blockcount; + xfs_owner_info_unpack(&p->oinfo, &__entry->owner, + &__entry->offset, &__entry->flags); + if (p->unwritten) + __entry->flags |= XFS_RMAP_UNWRITTEN; + ), + TP_printk("dev %d:%d agno 0x%x op %d agbno 0x%x fsbcount 0x%x owner 0x%llx fileoff 0x%llx flags 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->op, + __entry->agbno, + __entry->len, + __entry->owner, + __entry->offset, + __entry->flags) +); + #endif /* IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) */ #endif /* _TRACE_XFS_SCRUB_TRACE_H */ -- cgit v1.2.3 From 7a2192ac109989e5e077271ca1876104acce731e Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 22 Feb 2024 12:43:41 -0800 Subject: xfs: create refcount bag structure for btree repairs Create a bag structure for refcount information that uses the refcount bag btree defined in the previous patch. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/Makefile | 1 + fs/xfs/scrub/rcbag.c | 307 +++++++++++++++++++++++++++++++++++++++++++++ fs/xfs/scrub/rcbag.h | 28 +++++ fs/xfs/scrub/rcbag_btree.c | 58 +++++++++ fs/xfs/scrub/rcbag_btree.h | 7 ++ 5 files changed, 401 insertions(+) create mode 100644 fs/xfs/scrub/rcbag.c create mode 100644 fs/xfs/scrub/rcbag.h diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index cca169bd9617..76674ad5833e 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -199,6 +199,7 @@ xfs-y += $(addprefix scrub/, \ newbt.o \ nlinks_repair.o \ rcbag_btree.o \ + rcbag.o \ reap.o \ refcount_repair.o \ repair.o \ diff --git a/fs/xfs/scrub/rcbag.c b/fs/xfs/scrub/rcbag.c new file mode 100644 index 000000000000..e1e52bc20713 --- /dev/null +++ b/fs/xfs/scrub/rcbag.c @@ -0,0 +1,307 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2022-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_buf_mem.h" +#include "xfs_btree_mem.h" +#include "xfs_error.h" +#include "scrub/scrub.h" +#include "scrub/rcbag_btree.h" +#include "scrub/rcbag.h" +#include "scrub/trace.h" + +struct rcbag { + struct xfs_mount *mp; + struct xfbtree xfbtree; + uint64_t nr_items; +}; + +int +rcbag_init( + struct xfs_mount *mp, + struct xfs_buftarg *btp, + struct rcbag **bagp) +{ + struct rcbag *bag; + int error; + + bag = kzalloc(sizeof(struct rcbag), XCHK_GFP_FLAGS); + if (!bag) + return -ENOMEM; + + bag->nr_items = 0; + bag->mp = mp; + + error = rcbagbt_mem_init(mp, &bag->xfbtree, btp); + if (error) + goto out_bag; + + *bagp = bag; + return 0; + +out_bag: + kfree(bag); + return error; +} + +void +rcbag_free( + struct rcbag **bagp) +{ + struct rcbag *bag = *bagp; + + xfbtree_destroy(&bag->xfbtree); + kfree(bag); + *bagp = NULL; +} + +/* Track an rmap in the refcount bag. 
*/ +int +rcbag_add( + struct rcbag *bag, + struct xfs_trans *tp, + const struct xfs_rmap_irec *rmap) +{ + struct rcbag_rec bagrec; + struct xfs_mount *mp = bag->mp; + struct xfs_btree_cur *cur; + int has; + int error; + + cur = rcbagbt_mem_cursor(mp, tp, &bag->xfbtree); + error = rcbagbt_lookup_eq(cur, rmap, &has); + if (error) + goto out_cur; + + if (has) { + error = rcbagbt_get_rec(cur, &bagrec, &has); + if (error) + goto out_cur; + if (!has) { + error = -EFSCORRUPTED; + goto out_cur; + } + + bagrec.rbg_refcount++; + error = rcbagbt_update(cur, &bagrec); + if (error) + goto out_cur; + } else { + bagrec.rbg_startblock = rmap->rm_startblock; + bagrec.rbg_blockcount = rmap->rm_blockcount; + bagrec.rbg_refcount = 1; + + error = rcbagbt_insert(cur, &bagrec, &has); + if (error) + goto out_cur; + if (!has) { + error = -EFSCORRUPTED; + goto out_cur; + } + } + + xfs_btree_del_cursor(cur, 0); + + error = xfbtree_trans_commit(&bag->xfbtree, tp); + if (error) + return error; + + bag->nr_items++; + return 0; + +out_cur: + xfs_btree_del_cursor(cur, error); + xfbtree_trans_cancel(&bag->xfbtree, tp); + return error; +} + +/* Return the number of records in the bag. */ +uint64_t +rcbag_count( + const struct rcbag *rcbag) +{ + return rcbag->nr_items; +} + +static inline uint32_t rcbag_rec_next_bno(const struct rcbag_rec *r) +{ + return r->rbg_startblock + r->rbg_blockcount; +} + +/* + * Find the next block where the refcount changes, given the next rmap we + * looked at and the ones we're already tracking. + */ +int +rcbag_next_edge( + struct rcbag *bag, + struct xfs_trans *tp, + const struct xfs_rmap_irec *next_rmap, + bool next_valid, + uint32_t *next_bnop) +{ + struct rcbag_rec bagrec; + struct xfs_mount *mp = bag->mp; + struct xfs_btree_cur *cur; + uint32_t next_bno = NULLAGBLOCK; + int has; + int error; + + if (next_valid) + next_bno = next_rmap->rm_startblock; + + cur = rcbagbt_mem_cursor(mp, tp, &bag->xfbtree); + error = xfs_btree_goto_left_edge(cur); + if (error) + goto out_cur; + + while (true) { + error = xfs_btree_increment(cur, 0, &has); + if (error) + goto out_cur; + if (!has) + break; + + error = rcbagbt_get_rec(cur, &bagrec, &has); + if (error) + goto out_cur; + if (!has) { + error = -EFSCORRUPTED; + goto out_cur; + } + + next_bno = min(next_bno, rcbag_rec_next_bno(&bagrec)); + } + + /* + * We should have found /something/ because either next_rmap is the + * next interesting rmap to look at after emitting this refcount + * extent, or there are other rmaps in the bag contributing to the + * current sharing count. But if something is seriously wrong, bail + * out. 
+ */ + if (next_bno == NULLAGBLOCK) { + error = -EFSCORRUPTED; + goto out_cur; + } + + xfs_btree_del_cursor(cur, 0); + + *next_bnop = next_bno; + return 0; + +out_cur: + xfs_btree_del_cursor(cur, error); + return error; +} + +/* Pop all refcount bag records that end at next_bno */ +int +rcbag_remove_ending_at( + struct rcbag *bag, + struct xfs_trans *tp, + uint32_t next_bno) +{ + struct rcbag_rec bagrec; + struct xfs_mount *mp = bag->mp; + struct xfs_btree_cur *cur; + int has; + int error; + + /* go to the right edge of the tree */ + cur = rcbagbt_mem_cursor(mp, tp, &bag->xfbtree); + memset(&cur->bc_rec, 0xFF, sizeof(cur->bc_rec)); + error = xfs_btree_lookup(cur, XFS_LOOKUP_GE, &has); + if (error) + goto out_cur; + + while (true) { + error = xfs_btree_decrement(cur, 0, &has); + if (error) + goto out_cur; + if (!has) + break; + + error = rcbagbt_get_rec(cur, &bagrec, &has); + if (error) + goto out_cur; + if (!has) { + error = -EFSCORRUPTED; + goto out_cur; + } + + if (rcbag_rec_next_bno(&bagrec) != next_bno) + continue; + + error = xfs_btree_delete(cur, &has); + if (error) + goto out_cur; + if (!has) { + error = -EFSCORRUPTED; + goto out_cur; + } + + bag->nr_items -= bagrec.rbg_refcount; + } + + xfs_btree_del_cursor(cur, 0); + return xfbtree_trans_commit(&bag->xfbtree, tp); +out_cur: + xfs_btree_del_cursor(cur, error); + xfbtree_trans_cancel(&bag->xfbtree, tp); + return error; +} + +/* Dump the rcbag. */ +void +rcbag_dump( + struct rcbag *bag, + struct xfs_trans *tp) +{ + struct rcbag_rec bagrec; + struct xfs_mount *mp = bag->mp; + struct xfs_btree_cur *cur; + unsigned long long nr = 0; + int has; + int error; + + cur = rcbagbt_mem_cursor(mp, tp, &bag->xfbtree); + error = xfs_btree_goto_left_edge(cur); + if (error) + goto out_cur; + + while (true) { + error = xfs_btree_increment(cur, 0, &has); + if (error) + goto out_cur; + if (!has) + break; + + error = rcbagbt_get_rec(cur, &bagrec, &has); + if (error) + goto out_cur; + if (!has) { + error = -EFSCORRUPTED; + goto out_cur; + } + + xfs_err(bag->mp, "[%llu]: bno 0x%x fsbcount 0x%x refcount 0x%llx\n", + nr++, + (unsigned int)bagrec.rbg_startblock, + (unsigned int)bagrec.rbg_blockcount, + (unsigned long long)bagrec.rbg_refcount); + } + +out_cur: + xfs_btree_del_cursor(cur, error); +} diff --git a/fs/xfs/scrub/rcbag.h b/fs/xfs/scrub/rcbag.h new file mode 100644 index 000000000000..e29ef788ba72 --- /dev/null +++ b/fs/xfs/scrub/rcbag.h @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2022-2024 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong + */ +#ifndef __XFS_SCRUB_RCBAG_H__ +#define __XFS_SCRUB_RCBAG_H__ + +struct xfs_mount; +struct rcbag; +struct xfs_buftarg; + +int rcbag_init(struct xfs_mount *mp, struct xfs_buftarg *btp, + struct rcbag **bagp); +void rcbag_free(struct rcbag **bagp); +int rcbag_add(struct rcbag *bag, struct xfs_trans *tp, + const struct xfs_rmap_irec *rmap); +uint64_t rcbag_count(const struct rcbag *bag); + +int rcbag_next_edge(struct rcbag *bag, struct xfs_trans *tp, + const struct xfs_rmap_irec *next_rmap, bool next_valid, + uint32_t *next_bnop); +int rcbag_remove_ending_at(struct rcbag *bag, struct xfs_trans *tp, + uint32_t next_bno); + +void rcbag_dump(struct rcbag *bag, struct xfs_trans *tp); + +#endif /* __XFS_SCRUB_RCBAG_H__ */ diff --git a/fs/xfs/scrub/rcbag_btree.c b/fs/xfs/scrub/rcbag_btree.c index 762960e012a0..709356dc6256 100644 --- a/fs/xfs/scrub/rcbag_btree.c +++ b/fs/xfs/scrub/rcbag_btree.c @@ -310,3 +310,61 @@ rcbagbt_destroy_cur_cache(void) kmem_cache_destroy(rcbagbt_cur_cache); rcbagbt_cur_cache = NULL; } + +/* Look up the refcount bag record corresponding to this reverse mapping. */ +int +rcbagbt_lookup_eq( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rmap, + int *success) +{ + struct rcbag_rec *rec = (struct rcbag_rec *)&cur->bc_rec; + + rec->rbg_startblock = rmap->rm_startblock; + rec->rbg_blockcount = rmap->rm_blockcount; + + return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, success); +} + +/* Get the data from the pointed-to record. */ +int +rcbagbt_get_rec( + struct xfs_btree_cur *cur, + struct rcbag_rec *rec, + int *has) +{ + union xfs_btree_rec *btrec; + int error; + + error = xfs_btree_get_rec(cur, &btrec, has); + if (error || !(*has)) + return error; + + memcpy(rec, btrec, sizeof(struct rcbag_rec)); + return 0; +} + +/* Update the record referred to by cur to the value given. */ +int +rcbagbt_update( + struct xfs_btree_cur *cur, + const struct rcbag_rec *rec) +{ + union xfs_btree_rec btrec; + + memcpy(&btrec, rec, sizeof(struct rcbag_rec)); + return xfs_btree_update(cur, &btrec); +} + +/* Update the record referred to by cur to the value given. */ +int +rcbagbt_insert( + struct xfs_btree_cur *cur, + const struct rcbag_rec *rec, + int *success) +{ + struct rcbag_rec *btrec = (struct rcbag_rec *)&cur->bc_rec; + + memcpy(btrec, rec, sizeof(struct rcbag_rec)); + return xfs_btree_insert(cur, success); +} diff --git a/fs/xfs/scrub/rcbag_btree.h b/fs/xfs/scrub/rcbag_btree.h index 9202f7199b68..03cadb032552 100644 --- a/fs/xfs/scrub/rcbag_btree.h +++ b/fs/xfs/scrub/rcbag_btree.h @@ -66,6 +66,13 @@ struct xfs_btree_cur *rcbagbt_mem_cursor(struct xfs_mount *mp, int rcbagbt_mem_init(struct xfs_mount *mp, struct xfbtree *xfbtree, struct xfs_buftarg *btp); +int rcbagbt_lookup_eq(struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rmap, int *success); +int rcbagbt_get_rec(struct xfs_btree_cur *cur, struct rcbag_rec *rec, int *has); +int rcbagbt_update(struct xfs_btree_cur *cur, const struct rcbag_rec *rec); +int rcbagbt_insert(struct xfs_btree_cur *cur, const struct rcbag_rec *rec, + int *success); + #else # define rcbagbt_init_cur_cache() 0 # define rcbagbt_destroy_cur_cache() ((void)0) -- cgit v1.2.3 From 7fbaab57a80f1639add1c7d02adeb9d17bd50206 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 22 Feb 2024 12:43:42 -0800 Subject: xfs: port refcount repair to the new refcount bag structure Port the refcount record generating code to use the new refcount bag data structure. Signed-off-by: Darrick J. 
Wong Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/refcount.c | 12 +++ fs/xfs/scrub/refcount_repair.c | 164 +++++++++++++++-------------------------- fs/xfs/scrub/repair.h | 2 + fs/xfs/xfs_super.c | 10 ++- 4 files changed, 81 insertions(+), 107 deletions(-) diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c index bf22f245bbfa..d0c7d4a29c0f 100644 --- a/fs/xfs/scrub/refcount.c +++ b/fs/xfs/scrub/refcount.c @@ -7,8 +7,10 @@ #include "xfs_fs.h" #include "xfs_shared.h" #include "xfs_format.h" +#include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" +#include "xfs_trans.h" #include "xfs_ag.h" #include "xfs_btree.h" #include "xfs_rmap.h" @@ -17,6 +19,7 @@ #include "scrub/common.h" #include "scrub/btree.h" #include "scrub/trace.h" +#include "scrub/repair.h" /* * Set us up to scrub reference count btrees. @@ -27,6 +30,15 @@ xchk_setup_ag_refcountbt( { if (xchk_need_intent_drain(sc)) xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN); + + if (xchk_could_repair(sc)) { + int error; + + error = xrep_setup_ag_refcountbt(sc); + if (error) + return error; + } + return xchk_setup_ag_btree(sc, false); } diff --git a/fs/xfs/scrub/refcount_repair.c b/fs/xfs/scrub/refcount_repair.c index 8240c993061b..a00d7ce7ae5b 100644 --- a/fs/xfs/scrub/refcount_repair.c +++ b/fs/xfs/scrub/refcount_repair.c @@ -38,6 +38,7 @@ #include "scrub/xfarray.h" #include "scrub/newbt.h" #include "scrub/reap.h" +#include "scrub/rcbag.h" /* * Rebuilding the Reference Count Btree @@ -98,12 +99,6 @@ * insert all the records. */ -/* The only parts of the rmap that we care about for computing refcounts. */ -struct xrep_refc_rmap { - xfs_agblock_t startblock; - xfs_extlen_t blockcount; -} __packed; - struct xrep_refc { /* refcount extents */ struct xfarray *refcount_records; @@ -123,6 +118,20 @@ struct xrep_refc { xfs_extlen_t btblocks; }; +/* Set us up to repair refcount btrees. */ +int +xrep_setup_ag_refcountbt( + struct xfs_scrub *sc) +{ + char *descr; + int error; + + descr = xchk_xfile_ag_descr(sc, "rmap record bag"); + error = xrep_setup_xfbtree(sc, descr); + kfree(descr); + return error; +} + /* Check for any obvious conflicts with this shared/CoW staging extent. */ STATIC int xrep_refc_check_ext( @@ -224,10 +233,9 @@ xrep_refc_rmap_shareable( STATIC int xrep_refc_walk_rmaps( struct xrep_refc *rr, - struct xrep_refc_rmap *rrm, + struct xfs_rmap_irec *rmap, bool *have_rec) { - struct xfs_rmap_irec rmap; struct xfs_btree_cur *cur = rr->sc->sa.rmap_cur; struct xfs_mount *mp = cur->bc_mp; int have_gt; @@ -251,7 +259,7 @@ xrep_refc_walk_rmaps( if (!have_gt) return 0; - error = xfs_rmap_get_rec(cur, &rmap, &have_gt); + error = xfs_rmap_get_rec(cur, rmap, &have_gt); if (error) return error; if (XFS_IS_CORRUPT(mp, !have_gt)) { @@ -259,23 +267,22 @@ xrep_refc_walk_rmaps( return -EFSCORRUPTED; } - if (rmap.rm_owner == XFS_RMAP_OWN_COW) { - error = xrep_refc_stash_cow(rr, rmap.rm_startblock, - rmap.rm_blockcount); + if (rmap->rm_owner == XFS_RMAP_OWN_COW) { + error = xrep_refc_stash_cow(rr, rmap->rm_startblock, + rmap->rm_blockcount); if (error) return error; - } else if (rmap.rm_owner == XFS_RMAP_OWN_REFC) { + } else if (rmap->rm_owner == XFS_RMAP_OWN_REFC) { /* refcountbt block, dump it when we're done. 
*/ - rr->btblocks += rmap.rm_blockcount; + rr->btblocks += rmap->rm_blockcount; error = xagb_bitmap_set(&rr->old_refcountbt_blocks, - rmap.rm_startblock, rmap.rm_blockcount); + rmap->rm_startblock, + rmap->rm_blockcount); if (error) return error; } - } while (!xrep_refc_rmap_shareable(mp, &rmap)); + } while (!xrep_refc_rmap_shareable(mp, rmap)); - rrm->startblock = rmap.rm_startblock; - rrm->blockcount = rmap.rm_blockcount; *have_rec = true; return 0; } @@ -357,45 +364,6 @@ xrep_refc_sort_records( return error; } -#define RRM_NEXT(r) ((r).startblock + (r).blockcount) -/* - * Find the next block where the refcount changes, given the next rmap we - * looked at and the ones we're already tracking. - */ -static inline int -xrep_refc_next_edge( - struct xfarray *rmap_bag, - struct xrep_refc_rmap *next_rrm, - bool next_valid, - xfs_agblock_t *nbnop) -{ - struct xrep_refc_rmap rrm; - xfarray_idx_t array_cur = XFARRAY_CURSOR_INIT; - xfs_agblock_t nbno = NULLAGBLOCK; - int error; - - if (next_valid) - nbno = next_rrm->startblock; - - while ((error = xfarray_iter(rmap_bag, &array_cur, &rrm)) == 1) - nbno = min_t(xfs_agblock_t, nbno, RRM_NEXT(rrm)); - - if (error) - return error; - - /* - * We should have found /something/ because either next_rrm is the next - * interesting rmap to look at after emitting this refcount extent, or - * there are other rmaps in rmap_bag contributing to the current - * sharing count. But if something is seriously wrong, bail out. - */ - if (nbno == NULLAGBLOCK) - return -EFSCORRUPTED; - - *nbnop = nbno; - return 0; -} - /* * Walk forward through the rmap btree to collect all rmaps starting at * @bno in @rmap_bag. These represent the file(s) that share ownership of @@ -405,22 +373,21 @@ xrep_refc_next_edge( static int xrep_refc_push_rmaps_at( struct xrep_refc *rr, - struct xfarray *rmap_bag, + struct rcbag *rcstack, xfs_agblock_t bno, - struct xrep_refc_rmap *rrm, - bool *have, - uint64_t *stack_sz) + struct xfs_rmap_irec *rmap, + bool *have) { struct xfs_scrub *sc = rr->sc; int have_gt; int error; - while (*have && rrm->startblock == bno) { - error = xfarray_store_anywhere(rmap_bag, rrm); + while (*have && rmap->rm_startblock == bno) { + error = rcbag_add(rcstack, rr->sc->tp, rmap); if (error) return error; - (*stack_sz)++; - error = xrep_refc_walk_rmaps(rr, rrm, have); + + error = xrep_refc_walk_rmaps(rr, rmap, have); if (error) return error; } @@ -441,12 +408,9 @@ STATIC int xrep_refc_find_refcounts( struct xrep_refc *rr) { - struct xrep_refc_rmap rrm; struct xfs_scrub *sc = rr->sc; - struct xfarray *rmap_bag; - char *descr; - uint64_t old_stack_sz; - uint64_t stack_sz = 0; + struct rcbag *rcstack; + uint64_t old_stack_height; xfs_agblock_t sbno; xfs_agblock_t cbno; xfs_agblock_t nbno; @@ -456,14 +420,11 @@ xrep_refc_find_refcounts( xrep_ag_btcur_init(sc, &sc->sa); /* - * Set up a sparse array to store all the rmap records that we're - * tracking to generate a reference count record. If this exceeds + * Set up a bag to store all the rmap records that we're tracking to + * generate a reference count record. If the size of the bag exceeds * MAXREFCOUNT, we clamp rc_refcount. */ - descr = xchk_xfile_ag_descr(sc, "rmap record bag"); - error = xfarray_create(descr, 0, sizeof(struct xrep_refc_rmap), - &rmap_bag); - kfree(descr); + error = rcbag_init(sc->mp, sc->xmbtp, &rcstack); if (error) goto out_cur; @@ -474,62 +435,54 @@ xrep_refc_find_refcounts( /* Process reverse mappings into refcount data. 
*/ while (xfs_btree_has_more_records(sc->sa.rmap_cur)) { + struct xfs_rmap_irec rmap; + /* Push all rmaps with pblk == sbno onto the stack */ - error = xrep_refc_walk_rmaps(rr, &rrm, &have); + error = xrep_refc_walk_rmaps(rr, &rmap, &have); if (error) goto out_bag; if (!have) break; - sbno = cbno = rrm.startblock; - error = xrep_refc_push_rmaps_at(rr, rmap_bag, sbno, - &rrm, &have, &stack_sz); + sbno = cbno = rmap.rm_startblock; + error = xrep_refc_push_rmaps_at(rr, rcstack, sbno, &rmap, + &have); if (error) goto out_bag; /* Set nbno to the bno of the next refcount change */ - error = xrep_refc_next_edge(rmap_bag, &rrm, have, &nbno); + error = rcbag_next_edge(rcstack, sc->tp, &rmap, have, &nbno); if (error) goto out_bag; ASSERT(nbno > sbno); - old_stack_sz = stack_sz; + old_stack_height = rcbag_count(rcstack); /* While stack isn't empty... */ - while (stack_sz) { - xfarray_idx_t array_cur = XFARRAY_CURSOR_INIT; - + while (rcbag_count(rcstack) > 0) { /* Pop all rmaps that end at nbno */ - while ((error = xfarray_iter(rmap_bag, &array_cur, - &rrm)) == 1) { - if (RRM_NEXT(rrm) != nbno) - continue; - error = xfarray_unset(rmap_bag, array_cur - 1); - if (error) - goto out_bag; - stack_sz--; - } + error = rcbag_remove_ending_at(rcstack, sc->tp, nbno); if (error) goto out_bag; /* Push array items that start at nbno */ - error = xrep_refc_walk_rmaps(rr, &rrm, &have); + error = xrep_refc_walk_rmaps(rr, &rmap, &have); if (error) goto out_bag; if (have) { - error = xrep_refc_push_rmaps_at(rr, rmap_bag, - nbno, &rrm, &have, &stack_sz); + error = xrep_refc_push_rmaps_at(rr, rcstack, + nbno, &rmap, &have); if (error) goto out_bag; } /* Emit refcount if necessary */ ASSERT(nbno > cbno); - if (stack_sz != old_stack_sz) { - if (old_stack_sz > 1) { + if (rcbag_count(rcstack) != old_stack_height) { + if (old_stack_height > 1) { error = xrep_refc_stash(rr, XFS_REFC_DOMAIN_SHARED, cbno, nbno - cbno, - old_stack_sz); + old_stack_height); if (error) goto out_bag; } @@ -537,13 +490,13 @@ xrep_refc_find_refcounts( } /* Stack empty, go find the next rmap */ - if (stack_sz == 0) + if (rcbag_count(rcstack) == 0) break; - old_stack_sz = stack_sz; + old_stack_height = rcbag_count(rcstack); sbno = nbno; /* Set nbno to the bno of the next refcount change */ - error = xrep_refc_next_edge(rmap_bag, &rrm, have, + error = rcbag_next_edge(rcstack, sc->tp, &rmap, have, &nbno); if (error) goto out_bag; @@ -552,14 +505,13 @@ xrep_refc_find_refcounts( } } - ASSERT(stack_sz == 0); + ASSERT(rcbag_count(rcstack) == 0); out_bag: - xfarray_destroy(rmap_bag); + rcbag_free(&rcstack); out_cur: xchk_ag_btcur_free(&sc->sa); return error; } -#undef RRM_NEXT /* Retrieve refcountbt data for bulk load. 
*/ STATIC int diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h index dd1c89e8714c..ce082d941459 100644 --- a/fs/xfs/scrub/repair.h +++ b/fs/xfs/scrub/repair.h @@ -89,6 +89,7 @@ int xrep_reset_perag_resv(struct xfs_scrub *sc); int xrep_bmap(struct xfs_scrub *sc, int whichfork, bool allow_unwritten); int xrep_metadata_inode_forks(struct xfs_scrub *sc); int xrep_setup_ag_rmapbt(struct xfs_scrub *sc); +int xrep_setup_ag_refcountbt(struct xfs_scrub *sc); /* Repair setup functions */ int xrep_setup_ag_allocbt(struct xfs_scrub *sc); @@ -186,6 +187,7 @@ xrep_setup_nothing( } #define xrep_setup_ag_allocbt xrep_setup_nothing #define xrep_setup_ag_rmapbt xrep_setup_nothing +#define xrep_setup_ag_refcountbt xrep_setup_nothing #define xrep_setup_inode(sc, imap) ((void)0) diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 74e87ed5eee1..6828c48b15e9 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -44,6 +44,7 @@ #include "xfs_dahash_test.h" #include "xfs_rtbitmap.h" #include "scrub/stats.h" +#include "scrub/rcbag_btree.h" #include #include @@ -2060,10 +2061,14 @@ xfs_init_caches(void) if (error) goto out_destroy_log_ticket_cache; - error = xfs_defer_init_item_caches(); + error = rcbagbt_init_cur_cache(); if (error) goto out_destroy_btree_cur_cache; + error = xfs_defer_init_item_caches(); + if (error) + goto out_destroy_rcbagbt_cur_cache; + xfs_da_state_cache = kmem_cache_create("xfs_da_state", sizeof(struct xfs_da_state), 0, 0, NULL); @@ -2220,6 +2225,8 @@ xfs_init_caches(void) kmem_cache_destroy(xfs_da_state_cache); out_destroy_defer_item_cache: xfs_defer_destroy_item_caches(); + out_destroy_rcbagbt_cur_cache: + rcbagbt_destroy_cur_cache(); out_destroy_btree_cur_cache: xfs_btree_destroy_cur_caches(); out_destroy_log_ticket_cache: @@ -2257,6 +2264,7 @@ xfs_destroy_caches(void) kmem_cache_destroy(xfs_ifork_cache); kmem_cache_destroy(xfs_da_state_cache); xfs_defer_destroy_item_caches(); + rcbagbt_destroy_cur_cache(); xfs_btree_destroy_cur_caches(); kmem_cache_destroy(xfs_log_ticket_cache); kmem_cache_destroy(xfs_buf_cache); -- cgit v1.2.3 From ef2d4a00df38dfa79ce08fbd8c03278e2d87126a Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 22 Feb 2024 12:43:43 -0800 Subject: xfs: split tracepoint classes for deferred items We're about to start adding support for deferred log intent items for realtime extents, so split these four types into separate classes so that we can customize them as the transition happens. Signed-off-by: Darrick J. 
Wong Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_trace.h | 251 +++++++++++++++++++++++++++++++++++------------------ 1 file changed, 166 insertions(+), 85 deletions(-) diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index e85ea1fdb9c1..5c16a88852e3 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -2655,7 +2655,25 @@ DEFINE_EVENT(xfs_defer_pending_class, name, \ TP_PROTO(struct xfs_mount *mp, struct xfs_defer_pending *dfp), \ TP_ARGS(mp, dfp)) -DECLARE_EVENT_CLASS(xfs_phys_extent_deferred_class, +DEFINE_DEFER_EVENT(xfs_defer_cancel); +DEFINE_DEFER_EVENT(xfs_defer_trans_roll); +DEFINE_DEFER_EVENT(xfs_defer_trans_abort); +DEFINE_DEFER_EVENT(xfs_defer_finish); +DEFINE_DEFER_EVENT(xfs_defer_finish_done); + +DEFINE_DEFER_ERROR_EVENT(xfs_defer_trans_roll_error); +DEFINE_DEFER_ERROR_EVENT(xfs_defer_finish_error); + +DEFINE_DEFER_PENDING_EVENT(xfs_defer_create_intent); +DEFINE_DEFER_PENDING_EVENT(xfs_defer_cancel_list); +DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_finish); +DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_abort); +DEFINE_DEFER_PENDING_EVENT(xfs_defer_relog_intent); +DEFINE_DEFER_PENDING_EVENT(xfs_defer_isolate_paused); +DEFINE_DEFER_PENDING_EVENT(xfs_defer_item_pause); +DEFINE_DEFER_PENDING_EVENT(xfs_defer_item_unpause); + +DECLARE_EVENT_CLASS(xfs_free_extent_deferred_class, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int type, xfs_agblock_t agbno, xfs_extlen_t len), TP_ARGS(mp, agno, type, agbno, len), @@ -2680,92 +2698,17 @@ DECLARE_EVENT_CLASS(xfs_phys_extent_deferred_class, __entry->agbno, __entry->len) ); -#define DEFINE_PHYS_EXTENT_DEFERRED_EVENT(name) \ -DEFINE_EVENT(xfs_phys_extent_deferred_class, name, \ +#define DEFINE_FREE_EXTENT_DEFERRED_EVENT(name) \ +DEFINE_EVENT(xfs_free_extent_deferred_class, name, \ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ int type, \ xfs_agblock_t bno, \ xfs_extlen_t len), \ TP_ARGS(mp, agno, type, bno, len)) - -DECLARE_EVENT_CLASS(xfs_map_extent_deferred_class, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, - int op, - xfs_agblock_t agbno, - xfs_ino_t ino, - int whichfork, - xfs_fileoff_t offset, - xfs_filblks_t len, - xfs_exntst_t state), - TP_ARGS(mp, agno, op, agbno, ino, whichfork, offset, len, state), - TP_STRUCT__entry( - __field(dev_t, dev) - __field(xfs_agnumber_t, agno) - __field(xfs_ino_t, ino) - __field(xfs_agblock_t, agbno) - __field(int, whichfork) - __field(xfs_fileoff_t, l_loff) - __field(xfs_filblks_t, l_len) - __field(xfs_exntst_t, l_state) - __field(int, op) - ), - TP_fast_assign( - __entry->dev = mp->m_super->s_dev; - __entry->agno = agno; - __entry->ino = ino; - __entry->agbno = agbno; - __entry->whichfork = whichfork; - __entry->l_loff = offset; - __entry->l_len = len; - __entry->l_state = state; - __entry->op = op; - ), - TP_printk("dev %d:%d op %d agno 0x%x agbno 0x%x owner 0x%llx %s fileoff 0x%llx fsbcount 0x%llx state %d", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->op, - __entry->agno, - __entry->agbno, - __entry->ino, - __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS), - __entry->l_loff, - __entry->l_len, - __entry->l_state) -); -#define DEFINE_MAP_EXTENT_DEFERRED_EVENT(name) \ -DEFINE_EVENT(xfs_map_extent_deferred_class, name, \ - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ - int op, \ - xfs_agblock_t agbno, \ - xfs_ino_t ino, \ - int whichfork, \ - xfs_fileoff_t offset, \ - xfs_filblks_t len, \ - xfs_exntst_t state), \ - TP_ARGS(mp, agno, op, agbno, ino, whichfork, offset, len, state)) - -DEFINE_DEFER_EVENT(xfs_defer_cancel); 
-DEFINE_DEFER_EVENT(xfs_defer_trans_roll); -DEFINE_DEFER_EVENT(xfs_defer_trans_abort); -DEFINE_DEFER_EVENT(xfs_defer_finish); -DEFINE_DEFER_EVENT(xfs_defer_finish_done); - -DEFINE_DEFER_ERROR_EVENT(xfs_defer_trans_roll_error); -DEFINE_DEFER_ERROR_EVENT(xfs_defer_finish_error); - -DEFINE_DEFER_PENDING_EVENT(xfs_defer_create_intent); -DEFINE_DEFER_PENDING_EVENT(xfs_defer_cancel_list); -DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_finish); -DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_abort); -DEFINE_DEFER_PENDING_EVENT(xfs_defer_relog_intent); -DEFINE_DEFER_PENDING_EVENT(xfs_defer_isolate_paused); -DEFINE_DEFER_PENDING_EVENT(xfs_defer_item_pause); -DEFINE_DEFER_PENDING_EVENT(xfs_defer_item_unpause); - -#define DEFINE_BMAP_FREE_DEFERRED_EVENT DEFINE_PHYS_EXTENT_DEFERRED_EVENT -DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_bmap_free_defer); -DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_bmap_free_deferred); -DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_agfl_free_defer); -DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_agfl_free_deferred); +DEFINE_FREE_EXTENT_DEFERRED_EVENT(xfs_bmap_free_defer); +DEFINE_FREE_EXTENT_DEFERRED_EVENT(xfs_bmap_free_deferred); +DEFINE_FREE_EXTENT_DEFERRED_EVENT(xfs_agfl_free_defer); +DEFINE_FREE_EXTENT_DEFERRED_EVENT(xfs_agfl_free_deferred); DECLARE_EVENT_CLASS(xfs_defer_pending_item_class, TP_PROTO(struct xfs_mount *mp, struct xfs_defer_pending *dfp, @@ -2930,7 +2873,60 @@ DEFINE_EVENT(xfs_rmapbt_class, name, \ uint64_t owner, uint64_t offset, unsigned int flags), \ TP_ARGS(mp, agno, agbno, len, owner, offset, flags)) -#define DEFINE_RMAP_DEFERRED_EVENT DEFINE_MAP_EXTENT_DEFERRED_EVENT +DECLARE_EVENT_CLASS(xfs_rmap_deferred_class, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + int op, + xfs_agblock_t agbno, + xfs_ino_t ino, + int whichfork, + xfs_fileoff_t offset, + xfs_filblks_t len, + xfs_exntst_t state), + TP_ARGS(mp, agno, op, agbno, ino, whichfork, offset, len, state), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_ino_t, ino) + __field(xfs_agblock_t, agbno) + __field(int, whichfork) + __field(xfs_fileoff_t, l_loff) + __field(xfs_filblks_t, l_len) + __field(xfs_exntst_t, l_state) + __field(int, op) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->ino = ino; + __entry->agbno = agbno; + __entry->whichfork = whichfork; + __entry->l_loff = offset; + __entry->l_len = len; + __entry->l_state = state; + __entry->op = op; + ), + TP_printk("dev %d:%d op %d agno 0x%x agbno 0x%x owner 0x%llx %s fileoff 0x%llx fsbcount 0x%llx state %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->op, + __entry->agno, + __entry->agbno, + __entry->ino, + __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS), + __entry->l_loff, + __entry->l_len, + __entry->l_state) +); +#define DEFINE_RMAP_DEFERRED_EVENT(name) \ +DEFINE_EVENT(xfs_rmap_deferred_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ + int op, \ + xfs_agblock_t agbno, \ + xfs_ino_t ino, \ + int whichfork, \ + xfs_fileoff_t offset, \ + xfs_filblks_t len, \ + xfs_exntst_t state), \ + TP_ARGS(mp, agno, op, agbno, ino, whichfork, offset, len, state)) DEFINE_RMAP_DEFERRED_EVENT(xfs_rmap_defer); DEFINE_RMAP_DEFERRED_EVENT(xfs_rmap_deferred); @@ -2950,7 +2946,60 @@ DEFINE_RMAPBT_EVENT(xfs_rmap_find_right_neighbor_result); DEFINE_RMAPBT_EVENT(xfs_rmap_find_left_neighbor_result); /* deferred bmbt updates */ -#define DEFINE_BMAP_DEFERRED_EVENT DEFINE_RMAP_DEFERRED_EVENT +DECLARE_EVENT_CLASS(xfs_bmap_deferred_class, + TP_PROTO(struct xfs_mount *mp, 
xfs_agnumber_t agno, + int op, + xfs_agblock_t agbno, + xfs_ino_t ino, + int whichfork, + xfs_fileoff_t offset, + xfs_filblks_t len, + xfs_exntst_t state), + TP_ARGS(mp, agno, op, agbno, ino, whichfork, offset, len, state), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_ino_t, ino) + __field(xfs_agblock_t, agbno) + __field(int, whichfork) + __field(xfs_fileoff_t, l_loff) + __field(xfs_filblks_t, l_len) + __field(xfs_exntst_t, l_state) + __field(int, op) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->ino = ino; + __entry->agbno = agbno; + __entry->whichfork = whichfork; + __entry->l_loff = offset; + __entry->l_len = len; + __entry->l_state = state; + __entry->op = op; + ), + TP_printk("dev %d:%d op %d agno 0x%x agbno 0x%x owner 0x%llx %s fileoff 0x%llx fsbcount 0x%llx state %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->op, + __entry->agno, + __entry->agbno, + __entry->ino, + __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS), + __entry->l_loff, + __entry->l_len, + __entry->l_state) +); +#define DEFINE_BMAP_DEFERRED_EVENT(name) \ +DEFINE_EVENT(xfs_bmap_deferred_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ + int op, \ + xfs_agblock_t agbno, \ + xfs_ino_t ino, \ + int whichfork, \ + xfs_fileoff_t offset, \ + xfs_filblks_t len, \ + xfs_exntst_t state), \ + TP_ARGS(mp, agno, op, agbno, ino, whichfork, offset, len, state)) DEFINE_BMAP_DEFERRED_EVENT(xfs_bmap_defer); DEFINE_BMAP_DEFERRED_EVENT(xfs_bmap_deferred); @@ -3327,7 +3376,39 @@ DEFINE_AG_ERROR_EVENT(xfs_refcount_find_right_extent_error); DEFINE_AG_EXTENT_EVENT(xfs_refcount_find_shared); DEFINE_AG_EXTENT_EVENT(xfs_refcount_find_shared_result); DEFINE_AG_ERROR_EVENT(xfs_refcount_find_shared_error); -#define DEFINE_REFCOUNT_DEFERRED_EVENT DEFINE_PHYS_EXTENT_DEFERRED_EVENT + +DECLARE_EVENT_CLASS(xfs_refcount_deferred_class, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + int type, xfs_agblock_t agbno, xfs_extlen_t len), + TP_ARGS(mp, agno, type, agbno, len), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(int, type) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, len) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->type = type; + __entry->agbno = agbno; + __entry->len = len; + ), + TP_printk("dev %d:%d op %d agno 0x%x agbno 0x%x fsbcount 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->type, + __entry->agno, + __entry->agbno, + __entry->len) +); +#define DEFINE_REFCOUNT_DEFERRED_EVENT(name) \ +DEFINE_EVENT(xfs_refcount_deferred_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ + int type, \ + xfs_agblock_t bno, \ + xfs_extlen_t len), \ + TP_ARGS(mp, agno, type, bno, len)) DEFINE_REFCOUNT_DEFERRED_EVENT(xfs_refcount_defer); DEFINE_REFCOUNT_DEFERRED_EVENT(xfs_refcount_deferred); -- cgit v1.2.3 From 2a15e7686094d1362b5026533b96f57ec989a245 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 22 Feb 2024 12:43:53 -0800 Subject: xfs: clean up bmap log intent item tracepoint callsites Pass the incore bmap structure to the tracepoints instead of open-coding the argument passing. Signed-off-by: Darrick J. 
Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_bmap.c | 19 +++------------- fs/xfs/libxfs/xfs_bmap.h | 4 ++++ fs/xfs/xfs_trace.c | 1 + fs/xfs/xfs_trace.h | 56 +++++++++++++++++++++--------------------------- 4 files changed, 33 insertions(+), 47 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 06e329221cd5..b90a90305216 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -6191,15 +6191,6 @@ __xfs_bmap_add( { struct xfs_bmap_intent *bi; - trace_xfs_bmap_defer(tp->t_mountp, - XFS_FSB_TO_AGNO(tp->t_mountp, bmap->br_startblock), - type, - XFS_FSB_TO_AGBNO(tp->t_mountp, bmap->br_startblock), - ip->i_ino, whichfork, - bmap->br_startoff, - bmap->br_blockcount, - bmap->br_state); - bi = kmem_cache_alloc(xfs_bmap_intent_cache, GFP_KERNEL | __GFP_NOFAIL); INIT_LIST_HEAD(&bi->bi_list); bi->bi_type = type; @@ -6207,6 +6198,8 @@ __xfs_bmap_add( bi->bi_whichfork = whichfork; bi->bi_bmap = *bmap; + trace_xfs_bmap_defer(bi); + xfs_bmap_update_get_group(tp->t_mountp, bi); xfs_defer_add(tp, &bi->bi_list, &xfs_bmap_update_defer_type); return 0; @@ -6252,13 +6245,7 @@ xfs_bmap_finish_one( ASSERT(tp->t_highest_agno == NULLAGNUMBER); - trace_xfs_bmap_deferred(tp->t_mountp, - XFS_FSB_TO_AGNO(tp->t_mountp, bmap->br_startblock), - bi->bi_type, - XFS_FSB_TO_AGBNO(tp->t_mountp, bmap->br_startblock), - bi->bi_owner->i_ino, bi->bi_whichfork, - bmap->br_startoff, bmap->br_blockcount, - bmap->br_state); + trace_xfs_bmap_deferred(bi); if (WARN_ON_ONCE(bi->bi_whichfork != XFS_DATA_FORK)) { xfs_bmap_mark_sick(bi->bi_owner, bi->bi_whichfork); diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index 10b85865204d..0a2fd9304d1c 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -232,6 +232,10 @@ enum xfs_bmap_intent_type { XFS_BMAP_UNMAP, }; +#define XFS_BMAP_INTENT_STRINGS \ + { XFS_BMAP_MAP, "map" }, \ + { XFS_BMAP_UNMAP, "unmap" } + struct xfs_bmap_intent { struct list_head bi_list; enum xfs_bmap_intent_type bi_type; diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c index 3f253884fe5b..1a963382e5e9 100644 --- a/fs/xfs/xfs_trace.c +++ b/fs/xfs/xfs_trace.c @@ -38,6 +38,7 @@ #include "xfs_iomap.h" #include "xfs_buf_mem.h" #include "xfs_btree_mem.h" +#include "xfs_bmap.h" /* * We include this last to have the helpers above available for the trace diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 5c16a88852e3..2027ab56a1d8 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -81,6 +81,7 @@ struct xfs_icwalk; struct xfs_perag; struct xfbtree; struct xfs_btree_ops; +struct xfs_bmap_intent; #define XFS_ATTR_FILTER_FLAGS \ { XFS_ATTR_ROOT, "ROOT" }, \ @@ -2946,16 +2947,12 @@ DEFINE_RMAPBT_EVENT(xfs_rmap_find_right_neighbor_result); DEFINE_RMAPBT_EVENT(xfs_rmap_find_left_neighbor_result); /* deferred bmbt updates */ +TRACE_DEFINE_ENUM(XFS_BMAP_MAP); +TRACE_DEFINE_ENUM(XFS_BMAP_UNMAP); + DECLARE_EVENT_CLASS(xfs_bmap_deferred_class, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, - int op, - xfs_agblock_t agbno, - xfs_ino_t ino, - int whichfork, - xfs_fileoff_t offset, - xfs_filblks_t len, - xfs_exntst_t state), - TP_ARGS(mp, agno, op, agbno, ino, whichfork, offset, len, state), + TP_PROTO(struct xfs_bmap_intent *bi), + TP_ARGS(bi), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) @@ -2968,22 +2965,26 @@ DECLARE_EVENT_CLASS(xfs_bmap_deferred_class, __field(int, op) ), TP_fast_assign( - __entry->dev = mp->m_super->s_dev; - __entry->agno = agno; - __entry->ino = ino; - __entry->agbno = agbno; - 
__entry->whichfork = whichfork; - __entry->l_loff = offset; - __entry->l_len = len; - __entry->l_state = state; - __entry->op = op; - ), - TP_printk("dev %d:%d op %d agno 0x%x agbno 0x%x owner 0x%llx %s fileoff 0x%llx fsbcount 0x%llx state %d", + struct xfs_inode *ip = bi->bi_owner; + + __entry->dev = ip->i_mount->m_super->s_dev; + __entry->agno = XFS_FSB_TO_AGNO(ip->i_mount, + bi->bi_bmap.br_startblock); + __entry->ino = ip->i_ino; + __entry->agbno = XFS_FSB_TO_AGBNO(ip->i_mount, + bi->bi_bmap.br_startblock); + __entry->whichfork = bi->bi_whichfork; + __entry->l_loff = bi->bi_bmap.br_startoff; + __entry->l_len = bi->bi_bmap.br_blockcount; + __entry->l_state = bi->bi_bmap.br_state; + __entry->op = bi->bi_type; + ), + TP_printk("dev %d:%d op %s ino 0x%llx agno 0x%x agbno 0x%x %s fileoff 0x%llx fsbcount 0x%llx state %d", MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->op, + __print_symbolic(__entry->op, XFS_BMAP_INTENT_STRINGS), + __entry->ino, __entry->agno, __entry->agbno, - __entry->ino, __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS), __entry->l_loff, __entry->l_len, @@ -2991,15 +2992,8 @@ DECLARE_EVENT_CLASS(xfs_bmap_deferred_class, ); #define DEFINE_BMAP_DEFERRED_EVENT(name) \ DEFINE_EVENT(xfs_bmap_deferred_class, name, \ - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ - int op, \ - xfs_agblock_t agbno, \ - xfs_ino_t ino, \ - int whichfork, \ - xfs_fileoff_t offset, \ - xfs_filblks_t len, \ - xfs_exntst_t state), \ - TP_ARGS(mp, agno, op, agbno, ino, whichfork, offset, len, state)) + TP_PROTO(struct xfs_bmap_intent *bi), \ + TP_ARGS(bi)) DEFINE_BMAP_DEFERRED_EVENT(xfs_bmap_defer); DEFINE_BMAP_DEFERRED_EVENT(xfs_bmap_deferred); -- cgit v1.2.3 From 372fe0b8ce4f1c22ec721e28a564449343b12a7e Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 22 Feb 2024 12:44:19 -0800 Subject: xfs: remove xfs_trans_set_bmap_flags Remove this single-use helper. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_bmap_item.c | 38 +++++++++++++------------------------- 1 file changed, 13 insertions(+), 25 deletions(-) diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index e3c58090e976..d8601247ec8c 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -236,29 +236,6 @@ xfs_bmap_update_diff_items( return ba->bi_owner->i_ino - bb->bi_owner->i_ino; } -/* Set the map extent flags for this mapping. */ -static void -xfs_trans_set_bmap_flags( - struct xfs_map_extent *map, - enum xfs_bmap_intent_type type, - int whichfork, - xfs_exntst_t state) -{ - map->me_flags = 0; - switch (type) { - case XFS_BMAP_MAP: - case XFS_BMAP_UNMAP: - map->me_flags = type; - break; - default: - ASSERT(0); - } - if (state == XFS_EXT_UNWRITTEN) - map->me_flags |= XFS_BMAP_EXTENT_UNWRITTEN; - if (whichfork == XFS_ATTR_FORK) - map->me_flags |= XFS_BMAP_EXTENT_ATTR_FORK; -} - /* Log bmap updates in the intent item. 
*/ STATIC void xfs_bmap_update_log_item( @@ -281,8 +258,19 @@ xfs_bmap_update_log_item( map->me_startblock = bi->bi_bmap.br_startblock; map->me_startoff = bi->bi_bmap.br_startoff; map->me_len = bi->bi_bmap.br_blockcount; - xfs_trans_set_bmap_flags(map, bi->bi_type, bi->bi_whichfork, - bi->bi_bmap.br_state); + + switch (bi->bi_type) { + case XFS_BMAP_MAP: + case XFS_BMAP_UNMAP: + map->me_flags = bi->bi_type; + break; + default: + ASSERT(0); + } + if (bi->bi_bmap.br_state == XFS_EXT_UNWRITTEN) + map->me_flags |= XFS_BMAP_EXTENT_UNWRITTEN; + if (bi->bi_whichfork == XFS_ATTR_FORK) + map->me_flags |= XFS_BMAP_EXTENT_ATTR_FORK; } static struct xfs_log_item * -- cgit v1.2.3 From de47e4c9ad2de1a9e3ce9a460b49a77bee1c3c9b Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 22 Feb 2024 12:44:19 -0800 Subject: xfs: add a bi_entry helper Add a helper to translate from the item list head to the bmap_intent structure and use it so shorten assignments and avoid the need for extra local variables. Inspired-by: Christoph Hellwig Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_bmap_item.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index d8601247ec8c..eb56dc5aae74 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -221,6 +221,11 @@ static const struct xfs_item_ops xfs_bud_item_ops = { .iop_intent = xfs_bud_item_intent, }; +static inline struct xfs_bmap_intent *bi_entry(const struct list_head *e) +{ + return list_entry(e, struct xfs_bmap_intent, bi_list); +} + /* Sort bmap intents by inode. */ static int xfs_bmap_update_diff_items( @@ -228,11 +233,9 @@ xfs_bmap_update_diff_items( const struct list_head *a, const struct list_head *b) { - struct xfs_bmap_intent *ba; - struct xfs_bmap_intent *bb; + struct xfs_bmap_intent *ba = bi_entry(a); + struct xfs_bmap_intent *bb = bi_entry(b); - ba = container_of(a, struct xfs_bmap_intent, bi_list); - bb = container_of(b, struct xfs_bmap_intent, bi_list); return ba->bi_owner->i_ino - bb->bi_owner->i_ino; } @@ -348,11 +351,9 @@ xfs_bmap_update_finish_item( struct list_head *item, struct xfs_btree_cur **state) { - struct xfs_bmap_intent *bi; + struct xfs_bmap_intent *bi = bi_entry(item); int error; - bi = container_of(item, struct xfs_bmap_intent, bi_list); - error = xfs_bmap_finish_one(tp, bi); if (!error && bi->bi_bmap.br_blockcount > 0) { ASSERT(bi->bi_type == XFS_BMAP_UNMAP); @@ -377,9 +378,7 @@ STATIC void xfs_bmap_update_cancel_item( struct list_head *item) { - struct xfs_bmap_intent *bi; - - bi = container_of(item, struct xfs_bmap_intent, bi_list); + struct xfs_bmap_intent *bi = bi_entry(item); xfs_bmap_update_put_group(bi); kmem_cache_free(xfs_bmap_intent_cache, bi); -- cgit v1.2.3 From 5d3d0a6ad287746017b198a153d2de81ecc7491e Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 22 Feb 2024 12:44:20 -0800 Subject: xfs: reuse xfs_bmap_update_cancel_item Reuse xfs_bmap_update_cancel_item to put the AG/RTG and free the item in a few places that currently open code the logic. Inspired-by: Christoph Hellwig Signed-off-by: Darrick J. 
Wong Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_bmap_item.c | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index eb56dc5aae74..8df5e370c0bb 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -343,6 +343,17 @@ xfs_bmap_update_put_group( xfs_perag_intent_put(bi->bi_pag); } +/* Cancel a deferred bmap update. */ +STATIC void +xfs_bmap_update_cancel_item( + struct list_head *item) +{ + struct xfs_bmap_intent *bi = bi_entry(item); + + xfs_bmap_update_put_group(bi); + kmem_cache_free(xfs_bmap_intent_cache, bi); +} + /* Process a deferred bmap update. */ STATIC int xfs_bmap_update_finish_item( @@ -360,8 +371,7 @@ xfs_bmap_update_finish_item( return -EAGAIN; } - xfs_bmap_update_put_group(bi); - kmem_cache_free(xfs_bmap_intent_cache, bi); + xfs_bmap_update_cancel_item(item); return error; } @@ -373,17 +383,6 @@ xfs_bmap_update_abort_intent( xfs_bui_release(BUI_ITEM(intent)); } -/* Cancel a deferred bmap update. */ -STATIC void -xfs_bmap_update_cancel_item( - struct list_head *item) -{ - struct xfs_bmap_intent *bi = bi_entry(item); - - xfs_bmap_update_put_group(bi); - kmem_cache_free(xfs_bmap_intent_cache, bi); -} - /* Is this recovered BUI ok? */ static inline bool xfs_bui_validate( -- cgit v1.2.3 From 80284115854e60686b2e0183b31bb303ae69aa8c Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 22 Feb 2024 12:44:21 -0800 Subject: xfs: move xfs_bmap_defer_add to xfs_bmap_item.c Move the code that adds the incore xfs_bmap_item deferred work data to a transaction live with the BUI log item code. This means that the file mapping code no longer has to know about the inner workings of the BUI log items. As a consequence, we can hide the _get_group helper. Signed-off-by: Darrick J. 
Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_bmap.c | 6 ++---- fs/xfs/libxfs/xfs_bmap.h | 3 --- fs/xfs/xfs_bmap_item.c | 15 ++++++++++++++- fs/xfs/xfs_bmap_item.h | 4 ++++ 4 files changed, 20 insertions(+), 8 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index b90a90305216..fb99b75f02e2 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -37,6 +37,7 @@ #include "xfs_icache.h" #include "xfs_iomap.h" #include "xfs_health.h" +#include "xfs_bmap_item.h" struct kmem_cache *xfs_bmap_intent_cache; @@ -6198,10 +6199,7 @@ __xfs_bmap_add( bi->bi_whichfork = whichfork; bi->bi_bmap = *bmap; - trace_xfs_bmap_defer(bi); - - xfs_bmap_update_get_group(tp->t_mountp, bi); - xfs_defer_add(tp, &bi->bi_list, &xfs_bmap_update_defer_type); + xfs_bmap_defer_add(tp, bi); return 0; } diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index 0a2fd9304d1c..325cc232a415 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -245,9 +245,6 @@ struct xfs_bmap_intent { struct xfs_bmbt_irec bi_bmap; }; -void xfs_bmap_update_get_group(struct xfs_mount *mp, - struct xfs_bmap_intent *bi); - int xfs_bmap_finish_one(struct xfs_trans *tp, struct xfs_bmap_intent *bi); void xfs_bmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip, struct xfs_bmbt_irec *imap); diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index 8df5e370c0bb..a47cbce36482 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -25,6 +25,7 @@ #include "xfs_log_priv.h" #include "xfs_log_recover.h" #include "xfs_ag.h" +#include "xfs_trace.h" struct kmem_cache *xfs_bui_cache; struct kmem_cache *xfs_bud_cache; @@ -316,7 +317,7 @@ xfs_bmap_update_create_done( } /* Take a passive ref to the AG containing the space we're mapping. */ -void +static inline void xfs_bmap_update_get_group( struct xfs_mount *mp, struct xfs_bmap_intent *bi) @@ -335,6 +336,18 @@ xfs_bmap_update_get_group( bi->bi_pag = xfs_perag_intent_get(mp, agno); } +/* Add this deferred BUI to the transaction. */ +void +xfs_bmap_defer_add( + struct xfs_trans *tp, + struct xfs_bmap_intent *bi) +{ + trace_xfs_bmap_defer(bi); + + xfs_bmap_update_get_group(tp->t_mountp, bi); + xfs_defer_add(tp, &bi->bi_list, &xfs_bmap_update_defer_type); +} + /* Release a passive AG ref after finishing mapping work. */ static inline void xfs_bmap_update_put_group( diff --git a/fs/xfs/xfs_bmap_item.h b/fs/xfs/xfs_bmap_item.h index 3fafd3881a0b..6fee6a508343 100644 --- a/fs/xfs/xfs_bmap_item.h +++ b/fs/xfs/xfs_bmap_item.h @@ -68,4 +68,8 @@ struct xfs_bud_log_item { extern struct kmem_cache *xfs_bui_cache; extern struct kmem_cache *xfs_bud_cache; +struct xfs_bmap_intent; + +void xfs_bmap_defer_add(struct xfs_trans *tp, struct xfs_bmap_intent *bi); + #endif /* __XFS_BMAP_ITEM_H__ */ -- cgit v1.2.3 From 2b6a5ec26887cba195022286b039f2cc0ec683b1 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 22 Feb 2024 12:44:22 -0800 Subject: xfs: fix xfs_bunmapi to allow unmapping of partial rt extents When XFS_BMAPI_REMAP is passed to bunmapi, that means that we want to remove part of a block mapping without touching the allocator. For realtime files with rtextsize > 1, that also means that we should skip all the code that changes a partial remove request into an unwritten extent conversion. IOWs, bunmapi in this mode should handle removing the mapping from the rt file and nothing else. Note that XFS_BMAPI_REMAP callers are required to decrement the reference count and/or free the space manually. 
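
For illustration only — a sketch of the caller-side contract this
describes, modeled on the existing xfs_reflink_end_cow_extent() code
that appears later in this series (not part of this patch): a bunmapi
call in REMAP mode removes only the mapping, so the caller pairs it
with an explicit refcount and quota update:

	/*
	 * Sketch: unmap the old data fork extent without touching the
	 * allocator, then drop its reference count and quota usage by
	 * hand, because REMAP-mode bunmapi will not do either for us.
	 */
	xfs_bmap_unmap_extent(tp, ip, &data);
	xfs_refcount_decrease_extent(tp, &data);
	xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT,
			-data.br_blockcount);
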
Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_bmap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index fb99b75f02e2..939947d00bc9 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -5459,7 +5459,7 @@ __xfs_bunmapi( if (del.br_startoff + del.br_blockcount > end + 1) del.br_blockcount = end + 1 - del.br_startoff; - if (!isrt) + if (!isrt || (flags & XFS_BMAPI_REMAP)) goto delete; mod = xfs_rtb_to_rtxoff(mp, @@ -5477,7 +5477,7 @@ __xfs_bunmapi( * This piece is unwritten, or we're not * using unwritten extents. Skip over it. */ - ASSERT(end >= mod); + ASSERT((flags & XFS_BMAPI_REMAP) || end >= mod); end -= mod > del.br_blockcount ? del.br_blockcount : mod; if (end < got.br_startoff && -- cgit v1.2.3 From c75f1a2c154979287ee12c336e2b8c3122832bf7 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 22 Feb 2024 12:44:22 -0800 Subject: xfs: add a xattr_entry helper Add a helper to translate from the item list head to the attr_intent item structure and use it so shorten assignments and avoid the need for extra local variables. Inspired-by: Christoph Hellwig Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_attr_item.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index e14e229fc712..9b4c61e1c22e 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -391,6 +391,11 @@ xfs_attr_free_item( kmem_cache_free(xfs_attr_intent_cache, attr); } +static inline struct xfs_attr_intent *attri_entry(const struct list_head *e) +{ + return list_entry(e, struct xfs_attr_intent, xattri_list); +} + /* Process an attr. */ STATIC int xfs_attr_finish_item( @@ -399,11 +404,10 @@ xfs_attr_finish_item( struct list_head *item, struct xfs_btree_cur **state) { - struct xfs_attr_intent *attr; + struct xfs_attr_intent *attr = attri_entry(item); struct xfs_da_args *args; int error; - attr = container_of(item, struct xfs_attr_intent, xattri_list); args = attr->xattri_da_args; /* Reset trans after EAGAIN cycle since the transaction is new */ @@ -443,9 +447,8 @@ STATIC void xfs_attr_cancel_item( struct list_head *item) { - struct xfs_attr_intent *attr; + struct xfs_attr_intent *attr = attri_entry(item); - attr = container_of(item, struct xfs_attr_intent, xattri_list); xfs_attr_free_item(attr); } -- cgit v1.2.3 From 7302cda7f8b08062b11d2ba9ae0b4f3871fe3d46 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 22 Feb 2024 12:44:23 -0800 Subject: xfs: add a realtime flag to the bmap update log redo items Extend the bmap update (BUI) log items with a new realtime flag that indicates that the updates apply against a realtime file's data fork. We'll wire up the actual code later. Signed-off-by: Darrick J. 
Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_log_format.h | 4 +++- fs/xfs/xfs_bmap_item.c | 8 ++++++++ fs/xfs/xfs_trace.h | 23 ++++++++++++++++++----- 3 files changed, 29 insertions(+), 6 deletions(-) diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index 269573c82808..16872972e1e9 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -838,10 +838,12 @@ struct xfs_cud_log_format { #define XFS_BMAP_EXTENT_ATTR_FORK (1U << 31) #define XFS_BMAP_EXTENT_UNWRITTEN (1U << 30) +#define XFS_BMAP_EXTENT_REALTIME (1U << 29) #define XFS_BMAP_EXTENT_FLAGS (XFS_BMAP_EXTENT_TYPE_MASK | \ XFS_BMAP_EXTENT_ATTR_FORK | \ - XFS_BMAP_EXTENT_UNWRITTEN) + XFS_BMAP_EXTENT_UNWRITTEN | \ + XFS_BMAP_EXTENT_REALTIME) /* * This is the structure used to lay out an bui log item in the diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index a47cbce36482..701166da7f8a 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -275,6 +275,8 @@ xfs_bmap_update_log_item( map->me_flags |= XFS_BMAP_EXTENT_UNWRITTEN; if (bi->bi_whichfork == XFS_ATTR_FORK) map->me_flags |= XFS_BMAP_EXTENT_ATTR_FORK; + if (xfs_ifork_is_realtime(bi->bi_owner, bi->bi_whichfork)) + map->me_flags |= XFS_BMAP_EXTENT_REALTIME; } static struct xfs_log_item * @@ -324,6 +326,9 @@ xfs_bmap_update_get_group( { xfs_agnumber_t agno; + if (xfs_ifork_is_realtime(bi->bi_owner, bi->bi_whichfork)) + return; + agno = XFS_FSB_TO_AGNO(mp, bi->bi_bmap.br_startblock); /* @@ -353,6 +358,9 @@ static inline void xfs_bmap_update_put_group( struct xfs_bmap_intent *bi) { + if (xfs_ifork_is_realtime(bi->bi_owner, bi->bi_whichfork)) + return; + xfs_perag_intent_put(bi->bi_pag); } diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 2027ab56a1d8..56b07d8ed431 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -2955,9 +2955,11 @@ DECLARE_EVENT_CLASS(xfs_bmap_deferred_class, TP_ARGS(bi), TP_STRUCT__entry( __field(dev_t, dev) + __field(dev_t, opdev) __field(xfs_agnumber_t, agno) __field(xfs_ino_t, ino) __field(xfs_agblock_t, agbno) + __field(xfs_fsblock_t, rtbno) __field(int, whichfork) __field(xfs_fileoff_t, l_loff) __field(xfs_filblks_t, l_len) @@ -2968,23 +2970,34 @@ DECLARE_EVENT_CLASS(xfs_bmap_deferred_class, struct xfs_inode *ip = bi->bi_owner; __entry->dev = ip->i_mount->m_super->s_dev; - __entry->agno = XFS_FSB_TO_AGNO(ip->i_mount, - bi->bi_bmap.br_startblock); + if (xfs_ifork_is_realtime(ip, bi->bi_whichfork)) { + __entry->agno = 0; + __entry->agbno = 0; + __entry->rtbno = bi->bi_bmap.br_startblock; + __entry->opdev = ip->i_mount->m_rtdev_targp->bt_dev; + } else { + __entry->agno = XFS_FSB_TO_AGNO(ip->i_mount, + bi->bi_bmap.br_startblock); + __entry->agbno = XFS_FSB_TO_AGBNO(ip->i_mount, + bi->bi_bmap.br_startblock); + __entry->rtbno = 0; + __entry->opdev = __entry->dev; + } __entry->ino = ip->i_ino; - __entry->agbno = XFS_FSB_TO_AGBNO(ip->i_mount, - bi->bi_bmap.br_startblock); __entry->whichfork = bi->bi_whichfork; __entry->l_loff = bi->bi_bmap.br_startoff; __entry->l_len = bi->bi_bmap.br_blockcount; __entry->l_state = bi->bi_bmap.br_state; __entry->op = bi->bi_type; ), - TP_printk("dev %d:%d op %s ino 0x%llx agno 0x%x agbno 0x%x %s fileoff 0x%llx fsbcount 0x%llx state %d", + TP_printk("dev %d:%d op %s opdev %d:%d ino 0x%llx agno 0x%x agbno 0x%x rtbno 0x%llx %s fileoff 0x%llx fsbcount 0x%llx state %d", MAJOR(__entry->dev), MINOR(__entry->dev), __print_symbolic(__entry->op, XFS_BMAP_INTENT_STRINGS), + MAJOR(__entry->opdev), MINOR(__entry->opdev), __entry->ino, 
__entry->agno, __entry->agbno, + __entry->rtbno, __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS), __entry->l_loff, __entry->l_len, -- cgit v1.2.3 From 1b5453baed3a43dd4726eda0e8a5618c56a4f3f7 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 22 Feb 2024 12:44:24 -0800 Subject: xfs: support recovering bmap intent items targetting realtime extents Now that we have reflink on the realtime device, bmap intent items have to support remapping extents on the realtime volume. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_bmap_item.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index 701166da7f8a..d27859a684aa 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -435,6 +435,9 @@ xfs_bui_validate( if (!xfs_verify_fileext(mp, map->me_startoff, map->me_len)) return false; + if (map->me_flags & XFS_BMAP_EXTENT_REALTIME) + return xfs_verify_rtbext(mp, map->me_startblock, map->me_len); + return xfs_verify_fsbext(mp, map->me_startblock, map->me_len); } @@ -510,6 +513,12 @@ xfs_bmap_recover_work( xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); + if (!!(map->me_flags & XFS_BMAP_EXTENT_REALTIME) != + xfs_ifork_is_realtime(ip, work->bi_whichfork)) { + error = -EFSCORRUPTED; + goto err_cancel; + } + if (work->bi_type == XFS_BMAP_MAP) iext_delta = XFS_IEXT_ADD_NOSPLIT_CNT; else -- cgit v1.2.3 From 52f807067ba4a122e75bf1e0e0595c78e6a3d8b6 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 22 Feb 2024 12:44:32 -0800 Subject: xfs: support deferred bmap updates on the attr fork The deferred bmap update log item has always supported the attr fork, so plumb this in so that higher layers can access this. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_bmap.c | 47 +++++++++++++++++++---------------------------- fs/xfs/libxfs/xfs_bmap.h | 4 ++-- fs/xfs/xfs_bmap_util.c | 8 ++++---- fs/xfs/xfs_reflink.c | 8 ++++---- 4 files changed, 29 insertions(+), 38 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 939947d00bc9..8d62fd626be5 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -6172,17 +6172,8 @@ del_cursor: return error; } -/* Deferred mapping is only for real extents in the data fork. */ -static bool -xfs_bmap_is_update_needed( - struct xfs_bmbt_irec *bmap) -{ - return bmap->br_startblock != HOLESTARTBLOCK && - bmap->br_startblock != DELAYSTARTBLOCK; -} - /* Record a bmap intent. */ -static int +static inline void __xfs_bmap_add( struct xfs_trans *tp, enum xfs_bmap_intent_type type, @@ -6192,6 +6183,11 @@ __xfs_bmap_add( { struct xfs_bmap_intent *bi; + if ((whichfork != XFS_DATA_FORK && whichfork != XFS_ATTR_FORK) || + bmap->br_startblock == HOLESTARTBLOCK || + bmap->br_startblock == DELAYSTARTBLOCK) + return; + bi = kmem_cache_alloc(xfs_bmap_intent_cache, GFP_KERNEL | __GFP_NOFAIL); INIT_LIST_HEAD(&bi->bi_list); bi->bi_type = type; @@ -6200,7 +6196,6 @@ __xfs_bmap_add( bi->bi_bmap = *bmap; xfs_bmap_defer_add(tp, bi); - return 0; } /* Map an extent into a file. */ @@ -6208,12 +6203,10 @@ void xfs_bmap_map_extent( struct xfs_trans *tp, struct xfs_inode *ip, + int whichfork, struct xfs_bmbt_irec *PREV) { - if (!xfs_bmap_is_update_needed(PREV)) - return; - - __xfs_bmap_add(tp, XFS_BMAP_MAP, ip, XFS_DATA_FORK, PREV); + __xfs_bmap_add(tp, XFS_BMAP_MAP, ip, whichfork, PREV); } /* Unmap an extent out of a file. 
*/ @@ -6221,12 +6214,10 @@ void xfs_bmap_unmap_extent( struct xfs_trans *tp, struct xfs_inode *ip, + int whichfork, struct xfs_bmbt_irec *PREV) { - if (!xfs_bmap_is_update_needed(PREV)) - return; - - __xfs_bmap_add(tp, XFS_BMAP_UNMAP, ip, XFS_DATA_FORK, PREV); + __xfs_bmap_add(tp, XFS_BMAP_UNMAP, ip, whichfork, PREV); } /* @@ -6240,29 +6231,29 @@ xfs_bmap_finish_one( { struct xfs_bmbt_irec *bmap = &bi->bi_bmap; int error = 0; + int flags = 0; + + if (bi->bi_whichfork == XFS_ATTR_FORK) + flags |= XFS_BMAPI_ATTRFORK; ASSERT(tp->t_highest_agno == NULLAGNUMBER); trace_xfs_bmap_deferred(bi); - if (WARN_ON_ONCE(bi->bi_whichfork != XFS_DATA_FORK)) { - xfs_bmap_mark_sick(bi->bi_owner, bi->bi_whichfork); - return -EFSCORRUPTED; - } - - if (XFS_TEST_ERROR(false, tp->t_mountp, - XFS_ERRTAG_BMAP_FINISH_ONE)) + if (XFS_TEST_ERROR(false, tp->t_mountp, XFS_ERRTAG_BMAP_FINISH_ONE)) return -EIO; switch (bi->bi_type) { case XFS_BMAP_MAP: error = xfs_bmapi_remap(tp, bi->bi_owner, bmap->br_startoff, - bmap->br_blockcount, bmap->br_startblock, 0); + bmap->br_blockcount, bmap->br_startblock, + flags); bmap->br_blockcount = 0; break; case XFS_BMAP_UNMAP: error = __xfs_bunmapi(tp, bi->bi_owner, bmap->br_startoff, - &bmap->br_blockcount, XFS_BMAPI_REMAP, 1); + &bmap->br_blockcount, flags | XFS_BMAPI_REMAP, + 1); break; default: ASSERT(0); diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index 325cc232a415..f7662595309d 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -247,9 +247,9 @@ struct xfs_bmap_intent { int xfs_bmap_finish_one(struct xfs_trans *tp, struct xfs_bmap_intent *bi); void xfs_bmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip, - struct xfs_bmbt_irec *imap); + int whichfork, struct xfs_bmbt_irec *imap); void xfs_bmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip, - struct xfs_bmbt_irec *imap); + int whichfork, struct xfs_bmbt_irec *imap); static inline uint32_t xfs_bmap_fork_to_state(int whichfork) { diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 2fe306fd71fc..19e11d1da660 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1305,16 +1305,16 @@ xfs_swap_extent_rmap( } /* Remove the mapping from the donor file. */ - xfs_bmap_unmap_extent(tp, tip, &uirec); + xfs_bmap_unmap_extent(tp, tip, XFS_DATA_FORK, &uirec); /* Remove the mapping from the source file. */ - xfs_bmap_unmap_extent(tp, ip, &irec); + xfs_bmap_unmap_extent(tp, ip, XFS_DATA_FORK, &irec); /* Map the donor file's blocks into the source file. */ - xfs_bmap_map_extent(tp, ip, &uirec); + xfs_bmap_map_extent(tp, ip, XFS_DATA_FORK, &uirec); /* Map the source file's blocks into the donor file. */ - xfs_bmap_map_extent(tp, tip, &irec); + xfs_bmap_map_extent(tp, tip, XFS_DATA_FORK, &irec); error = xfs_defer_finish(tpp); tp = *tpp; diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index cd9a00fd16e7..7da0e8f961d3 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -806,7 +806,7 @@ xfs_reflink_end_cow_extent( * If the extent we're remapping is backed by storage (written * or not), unmap the extent and drop its refcount. */ - xfs_bmap_unmap_extent(tp, ip, &data); + xfs_bmap_unmap_extent(tp, ip, XFS_DATA_FORK, &data); xfs_refcount_decrease_extent(tp, &data); xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -data.br_blockcount); @@ -830,7 +830,7 @@ xfs_reflink_end_cow_extent( xfs_refcount_free_cow_extent(tp, del.br_startblock, del.br_blockcount); /* Map the new blocks into the data fork. 
*/ - xfs_bmap_map_extent(tp, ip, &del); + xfs_bmap_map_extent(tp, ip, XFS_DATA_FORK, &del); /* Charge this new data fork mapping to the on-disk quota. */ xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_DELBCOUNT, @@ -1294,7 +1294,7 @@ xfs_reflink_remap_extent( * If the extent we're unmapping is backed by storage (written * or not), unmap the extent and drop its refcount. */ - xfs_bmap_unmap_extent(tp, ip, &smap); + xfs_bmap_unmap_extent(tp, ip, XFS_DATA_FORK, &smap); xfs_refcount_decrease_extent(tp, &smap); qdelta -= smap.br_blockcount; } else if (smap.br_startblock == DELAYSTARTBLOCK) { @@ -1319,7 +1319,7 @@ xfs_reflink_remap_extent( */ if (dmap_written) { xfs_refcount_increase_extent(tp, dmap); - xfs_bmap_map_extent(tp, ip, dmap); + xfs_bmap_map_extent(tp, ip, XFS_DATA_FORK, dmap); qdelta += dmap->br_blockcount; } -- cgit v1.2.3 From 6c8127e93e3ac9c2cf6a13b885dd2d057b7e7d50 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 22 Feb 2024 12:45:00 -0800 Subject: xfs: xfs_bmap_finish_one should map unwritten extents properly The deferred bmap work state and the log item can transmit unwritten state, so the XFS_BMAP_MAP handler must map in extents with that unwritten state. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_bmap.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 8d62fd626be5..d78e02a3b4d6 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -6245,6 +6245,8 @@ xfs_bmap_finish_one( switch (bi->bi_type) { case XFS_BMAP_MAP: + if (bi->bi_bmap.br_state == XFS_EXT_UNWRITTEN) + flags |= XFS_BMAPI_PREALLOC; error = xfs_bmapi_remap(tp, bi->bi_owner, bmap->br_startoff, bmap->br_blockcount, bmap->br_startblock, flags); -- cgit v1.2.3 From 622d88e2ad7960b83af38dabf6b848a22a5a1c1f Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 22 Feb 2024 12:45:01 -0800 Subject: xfs: move xfs_symlink_remote.c declarations to xfs_symlink_remote.h Move declarations for libxfs symlink functions into a separate header file like we do for most everything else. Signed-off-by: Darrick J. 
Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_bmap.c | 1 + fs/xfs/libxfs/xfs_inode_fork.c | 1 + fs/xfs/libxfs/xfs_shared.h | 13 ------------- fs/xfs/libxfs/xfs_symlink_remote.c | 2 +- fs/xfs/libxfs/xfs_symlink_remote.h | 22 ++++++++++++++++++++++ fs/xfs/scrub/inode_repair.c | 1 + fs/xfs/scrub/symlink.c | 1 + fs/xfs/xfs_symlink.c | 1 + 8 files changed, 28 insertions(+), 14 deletions(-) create mode 100644 fs/xfs/libxfs/xfs_symlink_remote.h diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index d78e02a3b4d6..656c95a22f2e 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -38,6 +38,7 @@ #include "xfs_iomap.h" #include "xfs_health.h" #include "xfs_bmap_item.h" +#include "xfs_symlink_remote.h" struct kmem_cache *xfs_bmap_intent_cache; diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 3ab0ea133557..7d660a973909 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -26,6 +26,7 @@ #include "xfs_types.h" #include "xfs_errortag.h" #include "xfs_health.h" +#include "xfs_symlink_remote.h" struct kmem_cache *xfs_ifork_cache; diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h index cab49e7116ec..dfd61fa8332e 100644 --- a/fs/xfs/libxfs/xfs_shared.h +++ b/fs/xfs/libxfs/xfs_shared.h @@ -182,19 +182,6 @@ void xfs_log_get_max_trans_res(struct xfs_mount *mp, #define XFS_ICHGTIME_CHG 0x2 /* inode field change timestamp */ #define XFS_ICHGTIME_CREATE 0x4 /* inode create timestamp */ - -/* - * Symlink decoding/encoding functions - */ -int xfs_symlink_blocks(struct xfs_mount *mp, int pathlen); -int xfs_symlink_hdr_set(struct xfs_mount *mp, xfs_ino_t ino, uint32_t offset, - uint32_t size, struct xfs_buf *bp); -bool xfs_symlink_hdr_ok(xfs_ino_t ino, uint32_t offset, - uint32_t size, struct xfs_buf *bp); -void xfs_symlink_local_to_remote(struct xfs_trans *tp, struct xfs_buf *bp, - struct xfs_inode *ip, struct xfs_ifork *ifp); -xfs_failaddr_t xfs_symlink_shortform_verify(void *sfp, int64_t size); - /* Computed inode geometry for the filesystem. */ struct xfs_ino_geometry { /* Maximum inode count in this filesystem. */ diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c index 160aa20aa441..7c39cb0307d9 100644 --- a/fs/xfs/libxfs/xfs_symlink_remote.c +++ b/fs/xfs/libxfs/xfs_symlink_remote.c @@ -16,7 +16,7 @@ #include "xfs_trans.h" #include "xfs_buf_item.h" #include "xfs_log.h" - +#include "xfs_symlink_remote.h" /* * Each contiguous block has a header, so it is not just a simple pathlen diff --git a/fs/xfs/libxfs/xfs_symlink_remote.h b/fs/xfs/libxfs/xfs_symlink_remote.h new file mode 100644 index 000000000000..c6f621a0ec05 --- /dev/null +++ b/fs/xfs/libxfs/xfs_symlink_remote.h @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. + * All Rights Reserved. 
+ */ +#ifndef __XFS_SYMLINK_REMOTE_H +#define __XFS_SYMLINK_REMOTE_H + +/* + * Symlink decoding/encoding functions + */ +int xfs_symlink_blocks(struct xfs_mount *mp, int pathlen); +int xfs_symlink_hdr_set(struct xfs_mount *mp, xfs_ino_t ino, uint32_t offset, + uint32_t size, struct xfs_buf *bp); +bool xfs_symlink_hdr_ok(xfs_ino_t ino, uint32_t offset, + uint32_t size, struct xfs_buf *bp); +void xfs_symlink_local_to_remote(struct xfs_trans *tp, struct xfs_buf *bp, + struct xfs_inode *ip, struct xfs_ifork *ifp); +xfs_failaddr_t xfs_symlink_shortform_verify(void *sfp, int64_t size); + +#endif /* __XFS_SYMLINK_REMOTE_H */ diff --git a/fs/xfs/scrub/inode_repair.c b/fs/xfs/scrub/inode_repair.c index 7e859c412a5b..eab380e95ef4 100644 --- a/fs/xfs/scrub/inode_repair.c +++ b/fs/xfs/scrub/inode_repair.c @@ -37,6 +37,7 @@ #include "xfs_attr_leaf.h" #include "xfs_log_priv.h" #include "xfs_health.h" +#include "xfs_symlink_remote.h" #include "scrub/xfs_scrub.h" #include "scrub/scrub.h" #include "scrub/common.h" diff --git a/fs/xfs/scrub/symlink.c b/fs/xfs/scrub/symlink.c index ddff86713df3..31df0866f2ee 100644 --- a/fs/xfs/scrub/symlink.c +++ b/fs/xfs/scrub/symlink.c @@ -13,6 +13,7 @@ #include "xfs_inode.h" #include "xfs_symlink.h" #include "xfs_health.h" +#include "xfs_symlink_remote.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/health.h" diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index fb8c57b9d13d..38f569d3f47a 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -24,6 +24,7 @@ #include "xfs_ialloc.h" #include "xfs_error.h" #include "xfs_health.h" +#include "xfs_symlink_remote.h" /* ----- Kernel only functions below ----- */ int -- cgit v1.2.3 From 376b4f0522484f43660dab8e4e92b471863b49f9 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 22 Feb 2024 12:45:17 -0800 Subject: xfs: move remote symlink target read function to libxfs Move xfs_readlink_bmap_ilocked to xfs_symlink_remote.c so that the swapext code can use it to convert a remote format symlink back to shortform format after a metadata repair. While we're at it, fix a broken printf prefix. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_symlink_remote.c | 77 ++++++++++++++++++++++++++++++++++++++ fs/xfs/libxfs/xfs_symlink_remote.h | 1 + fs/xfs/scrub/symlink.c | 2 +- fs/xfs/xfs_symlink.c | 76 +------------------------------------ fs/xfs/xfs_symlink.h | 1 - 5 files changed, 80 insertions(+), 77 deletions(-) diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c index 7c39cb0307d9..d88af5c3c79f 100644 --- a/fs/xfs/libxfs/xfs_symlink_remote.c +++ b/fs/xfs/libxfs/xfs_symlink_remote.c @@ -17,6 +17,9 @@ #include "xfs_buf_item.h" #include "xfs_log.h" #include "xfs_symlink_remote.h" +#include "xfs_bit.h" +#include "xfs_bmap.h" +#include "xfs_health.h" /* * Each contiguous block has a header, so it is not just a simple pathlen @@ -227,3 +230,77 @@ xfs_symlink_shortform_verify( return __this_address; return NULL; } + +/* Read a remote symlink target into the buffer. 
*/ +int +xfs_symlink_remote_read( + struct xfs_inode *ip, + char *link) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_bmbt_irec mval[XFS_SYMLINK_MAPS]; + struct xfs_buf *bp; + xfs_daddr_t d; + char *cur_chunk; + int pathlen = ip->i_disk_size; + int nmaps = XFS_SYMLINK_MAPS; + int byte_cnt; + int n; + int error = 0; + int fsblocks = 0; + int offset; + + xfs_assert_ilocked(ip, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL); + + fsblocks = xfs_symlink_blocks(mp, pathlen); + error = xfs_bmapi_read(ip, 0, fsblocks, mval, &nmaps, 0); + if (error) + goto out; + + offset = 0; + for (n = 0; n < nmaps; n++) { + d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock); + byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); + + error = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0, + &bp, &xfs_symlink_buf_ops); + if (xfs_metadata_is_sick(error)) + xfs_inode_mark_sick(ip, XFS_SICK_INO_SYMLINK); + if (error) + return error; + byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt); + if (pathlen < byte_cnt) + byte_cnt = pathlen; + + cur_chunk = bp->b_addr; + if (xfs_has_crc(mp)) { + if (!xfs_symlink_hdr_ok(ip->i_ino, offset, + byte_cnt, bp)) { + xfs_inode_mark_sick(ip, XFS_SICK_INO_SYMLINK); + error = -EFSCORRUPTED; + xfs_alert(mp, +"symlink header does not match required off/len/owner (0x%x/0x%x,0x%llx)", + offset, byte_cnt, ip->i_ino); + xfs_buf_relse(bp); + goto out; + + } + + cur_chunk += sizeof(struct xfs_dsymlink_hdr); + } + + memcpy(link + offset, cur_chunk, byte_cnt); + + pathlen -= byte_cnt; + offset += byte_cnt; + + xfs_buf_relse(bp); + } + ASSERT(pathlen == 0); + + link[ip->i_disk_size] = '\0'; + error = 0; + + out: + return error; +} diff --git a/fs/xfs/libxfs/xfs_symlink_remote.h b/fs/xfs/libxfs/xfs_symlink_remote.h index c6f621a0ec05..bb83a8b8dfa6 100644 --- a/fs/xfs/libxfs/xfs_symlink_remote.h +++ b/fs/xfs/libxfs/xfs_symlink_remote.h @@ -18,5 +18,6 @@ bool xfs_symlink_hdr_ok(xfs_ino_t ino, uint32_t offset, void xfs_symlink_local_to_remote(struct xfs_trans *tp, struct xfs_buf *bp, struct xfs_inode *ip, struct xfs_ifork *ifp); xfs_failaddr_t xfs_symlink_shortform_verify(void *sfp, int64_t size); +int xfs_symlink_remote_read(struct xfs_inode *ip, char *link); #endif /* __XFS_SYMLINK_REMOTE_H */ diff --git a/fs/xfs/scrub/symlink.c b/fs/xfs/scrub/symlink.c index 31df0866f2ee..d77d8a9598f6 100644 --- a/fs/xfs/scrub/symlink.c +++ b/fs/xfs/scrub/symlink.c @@ -68,7 +68,7 @@ xchk_symlink( } /* Remote symlink; must read the contents. 
*/ - error = xfs_readlink_bmap_ilocked(sc->ip, sc->buf); + error = xfs_symlink_remote_read(sc->ip, sc->buf); if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error)) return error; if (strnlen(sc->buf, XFS_SYMLINK_MAXLEN) < len) diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 38f569d3f47a..3b40211f3123 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -26,80 +26,6 @@ #include "xfs_health.h" #include "xfs_symlink_remote.h" -/* ----- Kernel only functions below ----- */ -int -xfs_readlink_bmap_ilocked( - struct xfs_inode *ip, - char *link) -{ - struct xfs_mount *mp = ip->i_mount; - struct xfs_bmbt_irec mval[XFS_SYMLINK_MAPS]; - struct xfs_buf *bp; - xfs_daddr_t d; - char *cur_chunk; - int pathlen = ip->i_disk_size; - int nmaps = XFS_SYMLINK_MAPS; - int byte_cnt; - int n; - int error = 0; - int fsblocks = 0; - int offset; - - xfs_assert_ilocked(ip, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL); - - fsblocks = xfs_symlink_blocks(mp, pathlen); - error = xfs_bmapi_read(ip, 0, fsblocks, mval, &nmaps, 0); - if (error) - goto out; - - offset = 0; - for (n = 0; n < nmaps; n++) { - d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock); - byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); - - error = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0, - &bp, &xfs_symlink_buf_ops); - if (xfs_metadata_is_sick(error)) - xfs_inode_mark_sick(ip, XFS_SICK_INO_SYMLINK); - if (error) - return error; - byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt); - if (pathlen < byte_cnt) - byte_cnt = pathlen; - - cur_chunk = bp->b_addr; - if (xfs_has_crc(mp)) { - if (!xfs_symlink_hdr_ok(ip->i_ino, offset, - byte_cnt, bp)) { - xfs_inode_mark_sick(ip, XFS_SICK_INO_SYMLINK); - error = -EFSCORRUPTED; - xfs_alert(mp, -"symlink header does not match required off/len/owner (0x%x/Ox%x,0x%llx)", - offset, byte_cnt, ip->i_ino); - xfs_buf_relse(bp); - goto out; - - } - - cur_chunk += sizeof(struct xfs_dsymlink_hdr); - } - - memcpy(link + offset, cur_chunk, byte_cnt); - - pathlen -= byte_cnt; - offset += byte_cnt; - - xfs_buf_relse(bp); - } - ASSERT(pathlen == 0); - - link[ip->i_disk_size] = '\0'; - error = 0; - - out: - return error; -} - int xfs_readlink( struct xfs_inode *ip, @@ -141,7 +67,7 @@ xfs_readlink( memcpy(link, ip->i_df.if_data, pathlen + 1); error = 0; } else { - error = xfs_readlink_bmap_ilocked(ip, link); + error = xfs_symlink_remote_read(ip, link); } xfs_iunlock(ip, XFS_ILOCK_SHARED); diff --git a/fs/xfs/xfs_symlink.h b/fs/xfs/xfs_symlink.h index d1ca1ce62a93..0d29a50e66fd 100644 --- a/fs/xfs/xfs_symlink.h +++ b/fs/xfs/xfs_symlink.h @@ -10,7 +10,6 @@ int xfs_symlink(struct mnt_idmap *idmap, struct xfs_inode *dp, struct xfs_name *link_name, const char *target_path, umode_t mode, struct xfs_inode **ipp); -int xfs_readlink_bmap_ilocked(struct xfs_inode *ip, char *link); int xfs_readlink(struct xfs_inode *ip, char *link); int xfs_inactive_symlink(struct xfs_inode *ip); -- cgit v1.2.3 From b8102b61f7b8929ad8043e4574a1e26276398041 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 22 Feb 2024 12:48:20 -0800 Subject: xfs: move symlink target write function to libxfs Move xfs_symlink_write_target to xfs_symlink_remote.c so that kernel and mkfs can share the same function. Signed-off-by: Darrick J. 
Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_symlink_remote.c | 76 ++++++++++++++++++++++++++++++++++++++ fs/xfs/libxfs/xfs_symlink_remote.h | 3 ++ fs/xfs/xfs_symlink.c | 69 +++------------------------------- 3 files changed, 84 insertions(+), 64 deletions(-) diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c index d88af5c3c79f..ffb1317a9212 100644 --- a/fs/xfs/libxfs/xfs_symlink_remote.c +++ b/fs/xfs/libxfs/xfs_symlink_remote.c @@ -304,3 +304,79 @@ xfs_symlink_remote_read( out: return error; } + +/* Write the symlink target into the inode. */ +int +xfs_symlink_write_target( + struct xfs_trans *tp, + struct xfs_inode *ip, + const char *target_path, + int pathlen, + xfs_fsblock_t fs_blocks, + uint resblks) +{ + struct xfs_bmbt_irec mval[XFS_SYMLINK_MAPS]; + struct xfs_mount *mp = tp->t_mountp; + const char *cur_chunk; + struct xfs_buf *bp; + xfs_daddr_t d; + int byte_cnt; + int nmaps; + int offset = 0; + int n; + int error; + + /* + * If the symlink will fit into the inode, write it inline. + */ + if (pathlen <= xfs_inode_data_fork_size(ip)) { + xfs_init_local_fork(ip, XFS_DATA_FORK, target_path, pathlen); + + ip->i_disk_size = pathlen; + ip->i_df.if_format = XFS_DINODE_FMT_LOCAL; + xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE); + return 0; + } + + nmaps = XFS_SYMLINK_MAPS; + error = xfs_bmapi_write(tp, ip, 0, fs_blocks, XFS_BMAPI_METADATA, + resblks, mval, &nmaps); + if (error) + return error; + + ip->i_disk_size = pathlen; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + cur_chunk = target_path; + offset = 0; + for (n = 0; n < nmaps; n++) { + char *buf; + + d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock); + byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); + error = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, + BTOBB(byte_cnt), 0, &bp); + if (error) + return error; + bp->b_ops = &xfs_symlink_buf_ops; + + byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt); + byte_cnt = min(byte_cnt, pathlen); + + buf = bp->b_addr; + buf += xfs_symlink_hdr_set(mp, ip->i_ino, offset, byte_cnt, + bp); + + memcpy(buf, cur_chunk, byte_cnt); + + cur_chunk += byte_cnt; + pathlen -= byte_cnt; + offset += byte_cnt; + + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SYMLINK_BUF); + xfs_trans_log_buf(tp, bp, 0, (buf + byte_cnt - 1) - + (char *)bp->b_addr); + } + ASSERT(pathlen == 0); + return 0; +} diff --git a/fs/xfs/libxfs/xfs_symlink_remote.h b/fs/xfs/libxfs/xfs_symlink_remote.h index bb83a8b8dfa6..a63bd38ae4fa 100644 --- a/fs/xfs/libxfs/xfs_symlink_remote.h +++ b/fs/xfs/libxfs/xfs_symlink_remote.h @@ -19,5 +19,8 @@ void xfs_symlink_local_to_remote(struct xfs_trans *tp, struct xfs_buf *bp, struct xfs_inode *ip, struct xfs_ifork *ifp); xfs_failaddr_t xfs_symlink_shortform_verify(void *sfp, int64_t size); int xfs_symlink_remote_read(struct xfs_inode *ip, char *link); +int xfs_symlink_write_target(struct xfs_trans *tp, struct xfs_inode *ip, + const char *target_path, int pathlen, xfs_fsblock_t fs_blocks, + uint resblks); #endif /* __XFS_SYMLINK_REMOTE_H */ diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 3b40211f3123..3e376d24c7c1 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -93,15 +93,7 @@ xfs_symlink( int error = 0; int pathlen; bool unlock_dp_on_error = false; - xfs_fileoff_t first_fsb; xfs_filblks_t fs_blocks; - int nmaps; - struct xfs_bmbt_irec mval[XFS_SYMLINK_MAPS]; - xfs_daddr_t d; - const char *cur_chunk; - int byte_cnt; - int n; - struct xfs_buf *bp; prid_t prid; struct xfs_dquot *udqp = NULL; struct xfs_dquot *gdqp = NULL; 
@@ -189,62 +181,11 @@ xfs_symlink( xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp); resblks -= XFS_IALLOC_SPACE_RES(mp); - /* - * If the symlink will fit into the inode, write it inline. - */ - if (pathlen <= xfs_inode_data_fork_size(ip)) { - xfs_init_local_fork(ip, XFS_DATA_FORK, target_path, pathlen); - - ip->i_disk_size = pathlen; - ip->i_df.if_format = XFS_DINODE_FMT_LOCAL; - xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE); - } else { - int offset; - - first_fsb = 0; - nmaps = XFS_SYMLINK_MAPS; - - error = xfs_bmapi_write(tp, ip, first_fsb, fs_blocks, - XFS_BMAPI_METADATA, resblks, mval, &nmaps); - if (error) - goto out_trans_cancel; - - resblks -= fs_blocks; - ip->i_disk_size = pathlen; - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - - cur_chunk = target_path; - offset = 0; - for (n = 0; n < nmaps; n++) { - char *buf; - - d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock); - byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); - error = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, - BTOBB(byte_cnt), 0, &bp); - if (error) - goto out_trans_cancel; - bp->b_ops = &xfs_symlink_buf_ops; - - byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt); - byte_cnt = min(byte_cnt, pathlen); - - buf = bp->b_addr; - buf += xfs_symlink_hdr_set(mp, ip->i_ino, offset, - byte_cnt, bp); - - memcpy(buf, cur_chunk, byte_cnt); - - cur_chunk += byte_cnt; - pathlen -= byte_cnt; - offset += byte_cnt; - - xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SYMLINK_BUF); - xfs_trans_log_buf(tp, bp, 0, (buf + byte_cnt - 1) - - (char *)bp->b_addr); - } - ASSERT(pathlen == 0); - } + error = xfs_symlink_write_target(tp, ip, target_path, pathlen, + fs_blocks, resblks); + if (error) + goto out_trans_cancel; + resblks -= fs_blocks; i_size_write(VFS_I(ip), ip->i_disk_size); /* -- cgit v1.2.3 From 1e5efd72a29e6d2aa70b0219f1786834ad14d005 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 22 Feb 2024 21:48:17 -0800 Subject: xfs: fix log recovery erroring out on refcount recovery failure Per the comment in the error case of xfs_reflink_recover_cow, zero out any error (after shutting down the log) so that we actually kill any new intent items that might have gotten logged by later recovery steps. Discovered by xfs/434, which few people actually seem to run. Fixes: 2c1e31ed5c88 ("xfs: place intent recovery under NOFS allocation context") Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig Signed-off-by: Chandan Babu R --- fs/xfs/xfs_log_recover.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index cd134830a695..13f1d2e91540 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -3532,6 +3532,7 @@ xlog_recover_finish( * and AIL. */ xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); + error = 0; goto out_error; } -- cgit v1.2.3 From e610e856b938a1fc86e7ee83ad2f39716082bca7 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 23 Feb 2024 22:01:40 -0800 Subject: xfs: fix scrub stats file permissions When the kernel is in lockdown mode, debugfs will only show files that are world-readable and cannot be written, mmapped, or used with ioctl. That more or less describes the scrub stats file, except that the permissions are wrong -- they should be 0444, not 0644. You can't write the stats file, so the 0200 makes no sense. Meanwhile, the clear_stats file is only writable, but it got mode 0400 instead of 0200, which would make more sense. Fix both files so that they make sense.
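The lockdown rule described above suggests a simple pattern for registering debugfs files: read-only files get mode 0444 and a fops with only a read path, while write-only control files get mode 0200. A minimal illustrative sketch follows; the demo_* names are hypothetical and not part of this patch:

	#include <linux/debugfs.h>
	#include <linux/fs.h>

	/*
	 * Assumed to be defined elsewhere: a read-only fops for the stats
	 * dump and a write-only fops for the clear knob.
	 */
	extern const struct file_operations demo_stats_fops;
	extern const struct file_operations demo_clear_fops;

	static void demo_register(struct dentry *dir, void *priv)
	{
		/* 0444: world-readable, never writable, so visible under lockdown */
		debugfs_create_file("stats", 0444, dir, priv, &demo_stats_fops);

		/* 0200: owner-writable control file with nothing to read */
		debugfs_create_file("clear_stats", 0200, dir, priv, &demo_clear_fops);
	}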
Fixes: d7a74cad8f451 ("xfs: track usage statistics of online fsck") Signed-off-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig Signed-off-by: Chandan Babu R --- fs/xfs/scrub/stats.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/xfs/scrub/stats.c b/fs/xfs/scrub/stats.c index b4ef1ebe28ab..42cafbed94ac 100644 --- a/fs/xfs/scrub/stats.c +++ b/fs/xfs/scrub/stats.c @@ -331,9 +331,9 @@ xchk_stats_register( if (!cs->cs_debugfs) return; - debugfs_create_file("stats", 0644, cs->cs_debugfs, cs, + debugfs_create_file("stats", 0444, cs->cs_debugfs, cs, &scrub_stats_fops); - debugfs_create_file("clear_stats", 0400, cs->cs_debugfs, cs, + debugfs_create_file("clear_stats", 0200, cs->cs_debugfs, cs, &clear_scrub_stats_fops); } -- cgit v1.2.3 From 3aca0676a1141c4d198f8b3c934435941ba84244 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 27 Feb 2024 11:05:31 +1100 Subject: xfs: xfs_btree_bload_prep_block() should use __GFP_NOFAIL This was missed in the conversion from KM* flags. Reported-by: Dan Carpenter Fixes: 10634530f7ba ("xfs: convert kmem_zalloc() to kzalloc()") Signed-off-by: Dave Chinner Reviewed-by: "Darrick J. Wong" Reviewed-by: Christoph Hellwig Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_btree_staging.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/libxfs/xfs_btree_staging.c b/fs/xfs/libxfs/xfs_btree_staging.c index d721719f759a..694929703152 100644 --- a/fs/xfs/libxfs/xfs_btree_staging.c +++ b/fs/xfs/libxfs/xfs_btree_staging.c @@ -303,7 +303,7 @@ xfs_btree_bload_prep_block( /* Allocate a new incore btree root block. */ new_size = bbl->iroot_size(cur, level, nr_this_block, priv); - ifp->if_broot = kzalloc(new_size, GFP_KERNEL); + ifp->if_broot = kzalloc(new_size, GFP_KERNEL | __GFP_NOFAIL); ifp->if_broot_bytes = (int)new_size; /* Initialize it and send it out. */ -- cgit v1.2.3 From b8c0d6fa4165fc5b98c191c6643cda40e7a1d420 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 27 Feb 2024 14:01:26 +1100 Subject: xfs: use kvfree() in xlog_cil_free_logvec() The xfs_log_vec items are allocated by xlog_kvmalloc(), and so need to be freed with kvfree(). This was missed when converting from the kmem_free() API. Fixes: 49292576136f ("xfs: convert kmem_free() for kvmalloc users to kvfree()") Reported-by: Darrick J. Wong Reviewed-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Tested-by: Darrick J. Wong Signed-off-by: Dave Chinner Signed-off-by: Chandan Babu R --- fs/xfs/xfs_log_cil.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 8c3b09777006..73f5b7f628f4 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -703,7 +703,7 @@ xlog_cil_free_logvec( while (!list_empty(lv_chain)) { lv = list_first_entry(lv_chain, struct xfs_log_vec, lv_list); list_del_init(&lv->lv_list); - kfree(lv); + kvfree(lv); } } @@ -1544,7 +1544,7 @@ xlog_cil_process_intents( set_bit(XFS_LI_WHITEOUT, &ilip->li_flags); trace_xfs_cil_whiteout_mark(ilip); len += ilip->li_lv->lv_bytes; - kfree(ilip->li_lv); + kvfree(ilip->li_lv); ilip->li_lv = NULL; xfs_trans_del_item(lip); -- cgit v1.2.3 From 69fc23efc7e5030194ecaf4c108d4c23cfcd1a21 Mon Sep 17 00:00:00 2001 From: Akira Yokosawa Date: Tue, 27 Feb 2024 14:03:30 +0900 Subject: kernel-doc: Add unary operator * to $type_param_ref In kernel-doc comments, unary operator * collides with Sphinx/docutils' markup for emphasis.
This resulted in additional warnings from "make htmldocs" ("WARNING: Inline emphasis start-string without end-string."), as reported recently [1]. Those have been worked around either by escaping * (like \*param) or by using the inline-literal form of ``*param``, both of which are specific to Sphinx/docutils. Such workarounds are against kernel-doc's ideal and are better avoided. Instead, add "*" to the list of unary operators kernel-doc recognizes and make the form of *@param available in kernel-doc comments. Reported-by: Stephen Rothwell Link: [1] https://lore.kernel.org/r/20240223153636.41358be5@canb.auug.org.au/ Acked-by: Christoph Hellwig Signed-off-by: Akira Yokosawa Cc: Jonathan Corbet Cc: Mauro Carvalho Chehab Acked-by: Jonathan Corbet Signed-off-by: Chandan Babu R --- scripts/kernel-doc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/kernel-doc b/scripts/kernel-doc index e8aefd258a29..d2f3fa3505c6 100755 --- a/scripts/kernel-doc +++ b/scripts/kernel-doc @@ -64,7 +64,7 @@ my $type_constant = '\b``([^\`]+)``\b'; my $type_constant2 = '\%([-_\w]+)'; my $type_func = '(\w+)\(\)'; my $type_param = '\@(\w*((\.\w+)|(->\w+))*(\.\.\.)?)'; -my $type_param_ref = '([\!~]?)\@(\w*((\.\w+)|(->\w+))*(\.\.\.)?)'; +my $type_param_ref = '([\!~\*]?)\@(\w*((\.\w+)|(->\w+))*(\.\.\.)?)'; my $type_fp_param = '\@(\w+)\(\)'; # Special RST handling for func ptr params my $type_fp_param2 = '\@(\w+->\S+)\(\)'; # Special RST handling for structs with func ptr params my $type_env = '(\$\w+)'; -- cgit v1.2.3 From 8d4dd9d741c330119ae14f688bfc4fb602b17e19 Mon Sep 17 00:00:00 2001 From: Akira Yokosawa Date: Tue, 27 Feb 2024 14:06:48 +0900 Subject: mm/shmem.c: Use new form of *@param in kernel-doc Use the form of *@param which kernel-doc recognizes now. This resolves the warnings from "make htmldocs" as reported in [1]. Reported-by: Stephen Rothwell Link: [1] https://lore.kernel.org/r/20240223153636.41358be5@canb.auug.org.au/ Acked-by: Christoph Hellwig Signed-off-by: Akira Yokosawa Signed-off-by: Chandan Babu R --- mm/shmem.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index fb76da93d369..cf3689926418 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2152,8 +2152,8 @@ unlock: * There is no need to reserve space before calling folio_mark_dirty(). * * When no folio is found, the behavior depends on @sgp: - * - for SGP_READ, *foliop is %NULL and 0 is returned - * - for SGP_NOALLOC, *foliop is %NULL and -ENOENT is returned + * - for SGP_READ, *@foliop is %NULL and 0 is returned + * - for SGP_NOALLOC, *@foliop is %NULL and -ENOENT is returned * - for all other flags a new folio is allocated, inserted into the * page cache and returned locked in @foliop. * -- cgit v1.2.3 From 75bcffbb9e7563259b7aed0fa77459d6a3a35627 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 6 Mar 2024 12:12:46 +1100 Subject: xfs: shrink failure needs to hold AGI buffer Chandan reported an AGI/AGF lock order hang on xfs/168 during recent testing. The cause of the problem was the task running xfs_growfs to shrink the filesystem. A failure occurred trying to remove the free space from the btrees that the shrink would make disappear, and that meant it ran the error handling for a partial failure. This error path involves restoring the per-ag block reservations, and that requires calculating the amount of space needed to be reserved for the free inode btree. The growfs operation hung here:
The growfs operation hung here: [18679.536829] down+0x71/0xa0 [18679.537657] xfs_buf_lock+0xa4/0x290 [xfs] [18679.538731] xfs_buf_find_lock+0xf7/0x4d0 [xfs] [18679.539920] xfs_buf_lookup.constprop.0+0x289/0x500 [xfs] [18679.542628] xfs_buf_get_map+0x2b3/0xe40 [xfs] [18679.547076] xfs_buf_read_map+0xbb/0x900 [xfs] [18679.562616] xfs_trans_read_buf_map+0x449/0xb10 [xfs] [18679.569778] xfs_read_agi+0x1cd/0x500 [xfs] [18679.573126] xfs_ialloc_read_agi+0xc2/0x5b0 [xfs] [18679.578708] xfs_finobt_calc_reserves+0xe7/0x4d0 [xfs] [18679.582480] xfs_ag_resv_init+0x2c5/0x490 [xfs] [18679.586023] xfs_ag_shrink_space+0x736/0xd30 [xfs] [18679.590730] xfs_growfs_data_private.isra.0+0x55e/0x990 [xfs] [18679.599764] xfs_growfs_data+0x2f1/0x410 [xfs] [18679.602212] xfs_file_ioctl+0xd1e/0x1370 [xfs] trying to get the AGI lock. The AGI lock was held by a fstress task trying to do an inode allocation, and it was waiting on the AGF lock to allocate a new inode chunk on disk. Hence deadlock. The fix for this is for the growfs code to hold the AGI over the transaction roll it does in the error path. It already holds the AGF locked across this, and that is what causes the lock order inversion in the xfs_ag_resv_init() call. Reported-by: Chandan Babu R Fixes: 46141dc891f7 ("xfs: introduce xfs_ag_shrink_space()") Signed-off-by: Dave Chinner Reviewed-by: Gao Xiang Reviewed-by: Christoph Hellwig Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_ag.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index d728709054b2..dc1873f76bff 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -975,14 +975,23 @@ xfs_ag_shrink_space( if (error) { /* - * if extent allocation fails, need to roll the transaction to + * If extent allocation fails, need to roll the transaction to * ensure that the AGFL fixup has been committed anyway. + * + * We need to hold the AGF across the roll to ensure nothing can + * access the AG for allocation until the shrink is fully + * cleaned up. And due to the resetting of the AG block + * reservation space needing to lock the AGI, we also have to + * hold that so we don't get AGI/AGF lock order inversions in + * the error handling path. */ xfs_trans_bhold(*tpp, agfbp); + xfs_trans_bhold(*tpp, agibp); err2 = xfs_trans_roll(tpp); if (err2) return err2; xfs_trans_bjoin(*tpp, agfbp); + xfs_trans_bjoin(*tpp, agibp); goto resv_init_out; } -- cgit v1.2.3
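The idiom this last fix relies on is general: xfs_trans_bhold() keeps a buffer locked (but detached) when the current transaction commits during a roll, and xfs_trans_bjoin() attaches it to the new transaction afterwards. A condensed sketch of the pattern, using the same variable names as the patch; this is an illustration, not the actual shrink code:

	/*
	 * Hold both AG header buffers across the transaction roll so
	 * nothing else can take them in between; inode allocation locks
	 * AGI then AGF, so dropping either one here would open an ABBA
	 * deadlock window.
	 */
	xfs_trans_bhold(*tpp, agfbp);	/* stays locked over the commit */
	xfs_trans_bhold(*tpp, agibp);
	error = xfs_trans_roll(tpp);	/* commit; *tpp is now a new transaction */
	if (error)
		return error;
	xfs_trans_bjoin(*tpp, agfbp);	/* rejoin to the new transaction */
	xfs_trans_bjoin(*tpp, agibp);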