From 88591e7f06a42b241ca2a3e3e7067a8f17661947 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Sat, 2 Jul 2022 02:10:52 +1000 Subject: xfs: use the CIL space used counter for emptiness checks In the next patches we are going to make the CIL list itself per-cpu, and so we cannot use list_empty() to check is the list is empty. Replace the list_empty() checks with a flag in the CIL to indicate we have committed at least one transaction to the CIL and hence the CIL is not empty. We need this flag to be an atomic so that we can clear it without holding any locks in the commit fast path, but we also need to be careful to avoid atomic operations in the fast path. Hence we use the fact that test_bit() is not an atomic op to first check if the flag is set and then run the atomic test_and_clear_bit() operation to clear it and steal the initial unit reservation for the CIL context checkpoint. When we are switching to a new context in a push, we place the setting of the XLOG_CIL_EMPTY flag under the xc_push_lock. THis allows all the other places that need to check whether the CIL is empty to use test_bit() and still be serialised correctly with the CIL context swaps that set the bit. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong --- fs/xfs/xfs_log_cil.c | 43 ++++++++++++++++++++++++------------------- fs/xfs/xfs_log_priv.h | 4 ++++ 2 files changed, 28 insertions(+), 19 deletions(-) diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index db6cb7800251..36c0ce77d41b 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -61,7 +61,7 @@ xlog_item_in_current_chkpt( struct xfs_cil *cil, struct xfs_log_item *lip) { - if (list_empty(&lip->li_cil)) + if (test_bit(XLOG_CIL_EMPTY, &cil->xc_flags)) return false; /* @@ -102,6 +102,7 @@ xlog_cil_ctx_switch( struct xfs_cil *cil, struct xfs_cil_ctx *ctx) { + set_bit(XLOG_CIL_EMPTY, &cil->xc_flags); ctx->sequence = ++cil->xc_current_sequence; ctx->cil = cil; cil->xc_ctx = ctx; @@ -468,13 +469,12 @@ xlog_cil_insert_items( list_splice_init(&tp->t_busy, &ctx->busy_extents); /* - * Now transfer enough transaction reservation to the context ticket - * for the checkpoint. The context ticket is special - the unit - * reservation has to grow as well as the current reservation as we - * steal from tickets so we can correctly determine the space used - * during the transaction commit. + * We need to take the CIL checkpoint unit reservation on the first + * commit into the CIL. Test the XLOG_CIL_EMPTY bit first so we don't + * unnecessarily do an atomic op in the fast path here. */ - if (ctx->ticket->t_curr_res == 0) { + if (test_bit(XLOG_CIL_EMPTY, &cil->xc_flags) && + test_and_clear_bit(XLOG_CIL_EMPTY, &cil->xc_flags)) { ctx_res = ctx->ticket->t_unit_res; ctx->ticket->t_curr_res = ctx_res; tp->t_ticket->t_curr_res -= ctx_res; @@ -1054,7 +1054,7 @@ xlog_cil_push_work( * move on to a new sequence number and so we have to be able to push * this sequence again later. */ - if (list_empty(&cil->xc_cil)) { + if (test_bit(XLOG_CIL_EMPTY, &cil->xc_flags)) { cil->xc_push_seq = 0; spin_unlock(&cil->xc_push_lock); goto out_skip; @@ -1235,9 +1235,10 @@ xlog_cil_push_background( /* * The cil won't be empty because we are called while holding the - * context lock so whatever we added to the CIL will still be there + * context lock so whatever we added to the CIL will still be there. */ ASSERT(!list_empty(&cil->xc_cil)); + ASSERT(!test_bit(XLOG_CIL_EMPTY, &cil->xc_flags)); /* * Don't do a background push if we haven't used up all the @@ -1334,7 +1335,8 @@ xlog_cil_push_now( * If the CIL is empty or we've already pushed the sequence then * there's no more work that we need to do. */ - if (list_empty(&cil->xc_cil) || push_seq <= cil->xc_push_seq) { + if (test_bit(XLOG_CIL_EMPTY, &cil->xc_flags) || + push_seq <= cil->xc_push_seq) { spin_unlock(&cil->xc_push_lock); return; } @@ -1352,7 +1354,7 @@ xlog_cil_empty( bool empty = false; spin_lock(&cil->xc_push_lock); - if (list_empty(&cil->xc_cil)) + if (test_bit(XLOG_CIL_EMPTY, &cil->xc_flags)) empty = true; spin_unlock(&cil->xc_push_lock); return empty; @@ -1568,7 +1570,7 @@ restart: * we would have found the context on the committing list. */ if (sequence == cil->xc_current_sequence && - !list_empty(&cil->xc_cil)) { + !test_bit(XLOG_CIL_EMPTY, &cil->xc_flags)) { spin_unlock(&cil->xc_push_lock); goto restart; } @@ -1636,14 +1638,17 @@ void xlog_cil_destroy( struct xlog *log) { - if (log->l_cilp->xc_ctx) { - if (log->l_cilp->xc_ctx->ticket) - xfs_log_ticket_put(log->l_cilp->xc_ctx->ticket); - kmem_free(log->l_cilp->xc_ctx); + struct xfs_cil *cil = log->l_cilp; + + if (cil->xc_ctx) { + if (cil->xc_ctx->ticket) + xfs_log_ticket_put(cil->xc_ctx->ticket); + kmem_free(cil->xc_ctx); } - ASSERT(list_empty(&log->l_cilp->xc_cil)); - destroy_workqueue(log->l_cilp->xc_push_wq); - kmem_free(log->l_cilp); + ASSERT(list_empty(&cil->xc_cil)); + ASSERT(test_bit(XLOG_CIL_EMPTY, &cil->xc_flags)); + destroy_workqueue(cil->xc_push_wq); + kmem_free(cil); } diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 686c01eb3661..8fad33ea2582 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -248,6 +248,7 @@ struct xfs_cil_ctx { */ struct xfs_cil { struct xlog *xc_log; + unsigned long xc_flags; struct list_head xc_cil; spinlock_t xc_cil_lock; struct workqueue_struct *xc_push_wq; @@ -265,6 +266,9 @@ struct xfs_cil { wait_queue_head_t xc_push_wait; /* background push throttle */ } ____cacheline_aligned_in_smp; +/* xc_flags bit values */ +#define XLOG_CIL_EMPTY 1 + /* * The amount of log space we allow the CIL to aggregate is difficult to size. * Whatever we choose, we have to make sure we can get a reservation for the -- cgit v1.2.3 From 12380d237b819bd6cf2183f10b55ab47cdaab5e6 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Sat, 2 Jul 2022 02:11:52 +1000 Subject: xfs: lift init CIL reservation out of xc_cil_lock The xc_cil_lock is the most highly contended lock in XFS now. To start the process of getting rid of it, lift the initial reservation of the CIL log space out from under the xc_cil_lock. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong --- fs/xfs/xfs_log_cil.c | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 36c0ce77d41b..8a83d901e465 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -462,23 +462,19 @@ xlog_cil_insert_items( */ xlog_cil_insert_format_items(log, tp, &len); - spin_lock(&cil->xc_cil_lock); - - /* attach the transaction to the CIL if it has any busy extents */ - if (!list_empty(&tp->t_busy)) - list_splice_init(&tp->t_busy, &ctx->busy_extents); - /* * We need to take the CIL checkpoint unit reservation on the first * commit into the CIL. Test the XLOG_CIL_EMPTY bit first so we don't - * unnecessarily do an atomic op in the fast path here. + * unnecessarily do an atomic op in the fast path here. We don't need to + * hold the xc_cil_lock here to clear the XLOG_CIL_EMPTY bit as we are + * under the xc_ctx_lock here and that needs to be held exclusively to + * reset the XLOG_CIL_EMPTY bit. */ if (test_bit(XLOG_CIL_EMPTY, &cil->xc_flags) && - test_and_clear_bit(XLOG_CIL_EMPTY, &cil->xc_flags)) { + test_and_clear_bit(XLOG_CIL_EMPTY, &cil->xc_flags)) ctx_res = ctx->ticket->t_unit_res; - ctx->ticket->t_curr_res = ctx_res; - tp->t_ticket->t_curr_res -= ctx_res; - } + + spin_lock(&cil->xc_cil_lock); /* do we need space for more log record headers? */ iclog_space = log->l_iclog_size - log->l_iclog_hsize; @@ -488,13 +484,12 @@ xlog_cil_insert_items( /* need to take into account split region headers, too */ split_res *= log->l_iclog_hsize + sizeof(struct xlog_op_header); ctx->ticket->t_unit_res += split_res; - ctx->ticket->t_curr_res += split_res; - tp->t_ticket->t_curr_res -= split_res; - ASSERT(tp->t_ticket->t_curr_res >= len); } - tp->t_ticket->t_curr_res -= len; - tp->t_ticket->t_curr_res += released_space; + tp->t_ticket->t_curr_res -= split_res + ctx_res + len; + ctx->ticket->t_curr_res += split_res + ctx_res; ctx->space_used += len; + + tp->t_ticket->t_curr_res += released_space; ctx->space_used -= released_space; /* @@ -532,6 +527,9 @@ xlog_cil_insert_items( list_move_tail(&lip->li_cil, &cil->xc_cil); } + /* attach the transaction to the CIL if it has any busy extents */ + if (!list_empty(&tp->t_busy)) + list_splice_init(&tp->t_busy, &ctx->busy_extents); spin_unlock(&cil->xc_cil_lock); if (tp->t_ticket->t_curr_res < 0) -- cgit v1.2.3 From 31151cc342dd9cc2c5a5954f3e7b2dcf2fb50f64 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Sat, 2 Jul 2022 02:12:52 +1000 Subject: xfs: rework per-iclog header CIL reservation For every iclog that a CIL push will use up, we need to ensure we have space reserved for the iclog header in each iclog. It is extremely difficult to do this accurately with a per-cpu counter without expensive summing of the counter in every commit. However, we know what the maximum CIL size is going to be because of the hard space limit we have, and hence we know exactly how many iclogs we are going to need to write out the CIL. We are constrained by the requirement that small transactions only have reservation space for a single iclog header built into them. At commit time we don't know how much of the current transaction reservation is made up of iclog header reservations as calculated by xfs_log_calc_unit_res() when the ticket was reserved. As larger reservations have multiple header spaces reserved, we can steal more than one iclog header reservation at a time, but we only steal the exact number needed for the given log vector size delta. As a result, we don't know exactly when we are going to steal iclog header reservations, nor do we know exactly how many we are going to need for a given CIL. To make things simple, start by calculating the worst case number of iclog headers a full CIL push will require. Record this into an atomic variable in the CIL. Then add a byte counter to the log ticket that records exactly how much iclog header space has been reserved in this ticket by xfs_log_calc_unit_res(). This tells us exactly how much space we can steal from the ticket at transaction commit time. Now, at transaction commit time, we can check if the CIL has a full iclog header reservation and, if not, steal the entire reservation the current ticket holds for iclog headers. This minimises the number of times we need to do atomic operations in the fast path, but still guarantees we get all the reservations we need. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong --- fs/xfs/xfs_log.c | 9 ++++++--- fs/xfs/xfs_log_cil.c | 55 +++++++++++++++++++++++++++++++++++++++------------ fs/xfs/xfs_log_priv.h | 20 ++++++++++--------- 3 files changed, 59 insertions(+), 25 deletions(-) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index ae904b21e9cc..e4416e80a25f 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -3406,7 +3406,8 @@ xfs_log_ticket_get( static int xlog_calc_unit_res( struct xlog *log, - int unit_bytes) + int unit_bytes, + int *niclogs) { int iclog_space; uint num_headers; @@ -3486,6 +3487,8 @@ xlog_calc_unit_res( /* roundoff padding for transaction data and one for commit record */ unit_bytes += 2 * log->l_iclog_roundoff; + if (niclogs) + *niclogs = num_headers; return unit_bytes; } @@ -3494,7 +3497,7 @@ xfs_log_calc_unit_res( struct xfs_mount *mp, int unit_bytes) { - return xlog_calc_unit_res(mp->m_log, unit_bytes); + return xlog_calc_unit_res(mp->m_log, unit_bytes, NULL); } /* @@ -3512,7 +3515,7 @@ xlog_ticket_alloc( tic = kmem_cache_zalloc(xfs_log_ticket_cache, GFP_NOFS | __GFP_NOFAIL); - unit_res = xlog_calc_unit_res(log, unit_bytes); + unit_res = xlog_calc_unit_res(log, unit_bytes, &tic->t_iclog_hdrs); atomic_set(&tic->t_ref, 1); tic->t_task = current; diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 8a83d901e465..880ea9536f82 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -44,9 +44,20 @@ xlog_cil_ticket_alloc( * transaction overhead reservation from the first transaction commit. */ tic->t_curr_res = 0; + tic->t_iclog_hdrs = 0; return tic; } +static inline void +xlog_cil_set_iclog_hdr_count(struct xfs_cil *cil) +{ + struct xlog *log = cil->xc_log; + + atomic_set(&cil->xc_iclog_hdrs, + (XLOG_CIL_BLOCKING_SPACE_LIMIT(log) / + (log->l_iclog_size - log->l_iclog_hsize))); +} + /* * Check if the current log item was first committed in this sequence. * We can't rely on just the log item being in the CIL, we have to check @@ -102,6 +113,7 @@ xlog_cil_ctx_switch( struct xfs_cil *cil, struct xfs_cil_ctx *ctx) { + xlog_cil_set_iclog_hdr_count(cil); set_bit(XLOG_CIL_EMPTY, &cil->xc_flags); ctx->sequence = ++cil->xc_current_sequence; ctx->cil = cil; @@ -124,6 +136,7 @@ xlog_cil_init_post_recovery( { log->l_cilp->xc_ctx->ticket = xlog_cil_ticket_alloc(log); log->l_cilp->xc_ctx->sequence = 1; + xlog_cil_set_iclog_hdr_count(log->l_cilp); } static inline int @@ -451,7 +464,6 @@ xlog_cil_insert_items( struct xfs_cil_ctx *ctx = cil->xc_ctx; struct xfs_log_item *lip; int len = 0; - int iclog_space; int iovhdr_res = 0, split_res = 0, ctx_res = 0; ASSERT(tp); @@ -474,19 +486,36 @@ xlog_cil_insert_items( test_and_clear_bit(XLOG_CIL_EMPTY, &cil->xc_flags)) ctx_res = ctx->ticket->t_unit_res; - spin_lock(&cil->xc_cil_lock); - - /* do we need space for more log record headers? */ - iclog_space = log->l_iclog_size - log->l_iclog_hsize; - if (len > 0 && (ctx->space_used / iclog_space != - (ctx->space_used + len) / iclog_space)) { - split_res = (len + iclog_space - 1) / iclog_space; - /* need to take into account split region headers, too */ - split_res *= log->l_iclog_hsize + sizeof(struct xlog_op_header); - ctx->ticket->t_unit_res += split_res; + /* + * Check if we need to steal iclog headers. atomic_read() is not a + * locked atomic operation, so we can check the value before we do any + * real atomic ops in the fast path. If we've already taken the CIL unit + * reservation from this commit, we've already got one iclog header + * space reserved so we have to account for that otherwise we risk + * overrunning the reservation on this ticket. + * + * If the CIL is already at the hard limit, we might need more header + * space that originally reserved. So steal more header space from every + * commit that occurs once we are over the hard limit to ensure the CIL + * push won't run out of reservation space. + * + * This can steal more than we need, but that's OK. + */ + if (atomic_read(&cil->xc_iclog_hdrs) > 0 || + ctx->space_used + len >= XLOG_CIL_BLOCKING_SPACE_LIMIT(log)) { + int split_res = log->l_iclog_hsize + + sizeof(struct xlog_op_header); + if (ctx_res) + ctx_res += split_res * (tp->t_ticket->t_iclog_hdrs - 1); + else + ctx_res = split_res * tp->t_ticket->t_iclog_hdrs; + atomic_sub(tp->t_ticket->t_iclog_hdrs, &cil->xc_iclog_hdrs); } - tp->t_ticket->t_curr_res -= split_res + ctx_res + len; - ctx->ticket->t_curr_res += split_res + ctx_res; + + spin_lock(&cil->xc_cil_lock); + tp->t_ticket->t_curr_res -= ctx_res + len; + ctx->ticket->t_unit_res += ctx_res; + ctx->ticket->t_curr_res += ctx_res; ctx->space_used += len; tp->t_ticket->t_curr_res += released_space; diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 8fad33ea2582..74436482c28d 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -143,15 +143,16 @@ enum xlog_iclog_state { #define XLOG_COVER_OPS 5 typedef struct xlog_ticket { - struct list_head t_queue; /* reserve/write queue */ - struct task_struct *t_task; /* task that owns this ticket */ - xlog_tid_t t_tid; /* transaction identifier : 4 */ - atomic_t t_ref; /* ticket reference count : 4 */ - int t_curr_res; /* current reservation in bytes : 4 */ - int t_unit_res; /* unit reservation in bytes : 4 */ - char t_ocnt; /* original count : 1 */ - char t_cnt; /* current count : 1 */ - uint8_t t_flags; /* properties of reservation : 1 */ + struct list_head t_queue; /* reserve/write queue */ + struct task_struct *t_task; /* task that owns this ticket */ + xlog_tid_t t_tid; /* transaction identifier */ + atomic_t t_ref; /* ticket reference count */ + int t_curr_res; /* current reservation */ + int t_unit_res; /* unit reservation */ + char t_ocnt; /* original unit count */ + char t_cnt; /* current unit count */ + uint8_t t_flags; /* properties of reservation */ + int t_iclog_hdrs; /* iclog hdrs in t_curr_res */ } xlog_ticket_t; /* @@ -249,6 +250,7 @@ struct xfs_cil_ctx { struct xfs_cil { struct xlog *xc_log; unsigned long xc_flags; + atomic_t xc_iclog_hdrs; struct list_head xc_cil; spinlock_t xc_cil_lock; struct workqueue_struct *xc_push_wq; -- cgit v1.2.3 From af1c2146a50b1ffe7e10cae1f7e64ab56b7f8c1f Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Sat, 2 Jul 2022 02:13:52 +1000 Subject: xfs: introduce per-cpu CIL tracking structure The CIL push lock is highly contended on larger machines, becoming a hard bottleneck that about 700,000 transaction commits/s on >16p machines. To address this, start moving the CIL tracking infrastructure to utilise per-CPU structures. We need to track the space used, the amount of log reservation space reserved to write the CIL, the log items in the CIL and the busy extents that need to be completed by the CIL commit. This requires a couple of per-cpu counters, an unordered per-cpu list and a globally ordered per-cpu list. Create a per-cpu structure to hold these and all the management interfaces needed, as well as the hooks to handle hotplug CPUs. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong --- fs/xfs/xfs_log_cil.c | 30 ++++++++++++++++++++++++++++-- fs/xfs/xfs_log_priv.h | 18 ++++++++++++++++++ fs/xfs/xfs_super.c | 1 + 3 files changed, 47 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 880ea9536f82..c6d6322aabaa 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -1617,6 +1617,26 @@ out_shutdown: return 0; } +/* + * Move dead percpu state to the relevant CIL context structures. + * + * We have to lock the CIL context here to ensure that nothing is modifying + * the percpu state, either addition or removal. Both of these are done under + * the CIL context lock, so grabbing that exclusively here will ensure we can + * safely drain the cilpcp for the CPU that is dying. + */ +void +xlog_cil_pcp_dead( + struct xlog *log, + unsigned int cpu) +{ + struct xfs_cil *cil = log->l_cilp; + + down_write(&cil->xc_ctx_lock); + /* move stuff on dead CPU to context */ + up_write(&cil->xc_ctx_lock); +} + /* * Perform initial CIL structure initialisation. */ @@ -1640,6 +1660,11 @@ xlog_cil_init( if (!cil->xc_push_wq) goto out_destroy_cil; + cil->xc_log = log; + cil->xc_pcp = alloc_percpu(struct xlog_cil_pcp); + if (!cil->xc_pcp) + goto out_destroy_wq; + INIT_LIST_HEAD(&cil->xc_cil); INIT_LIST_HEAD(&cil->xc_committing); spin_lock_init(&cil->xc_cil_lock); @@ -1648,14 +1673,14 @@ xlog_cil_init( init_rwsem(&cil->xc_ctx_lock); init_waitqueue_head(&cil->xc_start_wait); init_waitqueue_head(&cil->xc_commit_wait); - cil->xc_log = log; log->l_cilp = cil; ctx = xlog_cil_ctx_alloc(); xlog_cil_ctx_switch(cil, ctx); - return 0; +out_destroy_wq: + destroy_workqueue(cil->xc_push_wq); out_destroy_cil: kmem_free(cil); return -ENOMEM; @@ -1675,6 +1700,7 @@ xlog_cil_destroy( ASSERT(list_empty(&cil->xc_cil)); ASSERT(test_bit(XLOG_CIL_EMPTY, &cil->xc_flags)); + free_percpu(cil->xc_pcp); destroy_workqueue(cil->xc_push_wq); kmem_free(cil); } diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 74436482c28d..70483c78953e 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -231,6 +231,14 @@ struct xfs_cil_ctx { struct work_struct push_work; }; +/* + * Per-cpu CIL tracking items + */ +struct xlog_cil_pcp { + struct list_head busy_extents; + struct list_head log_items; +}; + /* * Committed Item List structure * @@ -266,6 +274,11 @@ struct xfs_cil { wait_queue_head_t xc_start_wait; xfs_csn_t xc_current_sequence; wait_queue_head_t xc_push_wait; /* background push throttle */ + + void __percpu *xc_pcp; /* percpu CIL structures */ +#ifdef CONFIG_HOTPLUG_CPU + struct list_head xc_pcp_list; +#endif } ____cacheline_aligned_in_smp; /* xc_flags bit values */ @@ -688,4 +701,9 @@ xlog_kvmalloc( return p; } +/* + * CIL CPU dead notifier + */ +void xlog_cil_pcp_dead(struct xlog *log, unsigned int cpu); + #endif /* __XFS_LOG_PRIV_H__ */ diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index aa977c7ea370..1e02ec67c3a0 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -2213,6 +2213,7 @@ xfs_cpu_dead( list_for_each_entry_safe(mp, n, &xfs_mount_list, m_mount_list) { spin_unlock(&xfs_mount_list_lock); xfs_inodegc_cpu_dead(mp, cpu); + xlog_cil_pcp_dead(mp->m_log, cpu); spin_lock(&xfs_mount_list_lock); } spin_unlock(&xfs_mount_list_lock); -- cgit v1.2.3 From 7c8ade212120085439eddc4cfddfa29d41c3f426 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 7 Jul 2022 18:50:59 +1000 Subject: xfs: implement percpu cil space used calculation Now that we have the CIL percpu structures in place, implement the space used counter as a per-cpu counter. We have to be really careful now about ensuring that the checks and updates run without arbitrary delays, which means they need to run with pre-emption disabled. We do this by careful placement of the get_cpu_ptr/put_cpu_ptr calls to access the per-cpu structures for that CPU. We need to be able to reliably detect that the CIL has reached the hard limit threshold so we can take extra reservations for the iclog headers when the space used overruns the original reservation. hence we factor out xlog_cil_over_hard_limit() from xlog_cil_push_background(). The global CIL space used is an atomic variable that is backed by per-cpu aggregation to minimise the number of atomic updates we do to the global state in the fast path. While we are under the soft limit, we aggregate only when the per-cpu aggregation is over the proportion of the soft limit assigned to that CPU. This means that all CPUs can use all but one byte of their aggregation threshold and we will not go over the soft limit. Hence once we detect that we've gone over both a per-cpu aggregation threshold and the soft limit, we know that we have only exceeded the soft limit by one per-cpu aggregation threshold. Even if all CPUs hit this at the same time, we can't be over the hard limit, so we can run an aggregation back into the atomic counter at this point and still be under the hard limit. At this point, we will be over the soft limit and hence we'll aggregate into the global atomic used space directly rather than the per-cpu counters, hence providing accurate detection of hard limit excursion for accounting and reservation purposes. Hence we get the best of both worlds - lockless, scalable per-cpu fast path plus accurate, atomic detection of hard limit excursion. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong --- fs/xfs/xfs_log_cil.c | 176 +++++++++++++++++++++++++++++++++++++++++--------- fs/xfs/xfs_log_priv.h | 4 +- 2 files changed, 149 insertions(+), 31 deletions(-) diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index c6d6322aabaa..2d16add7a8d4 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -108,6 +108,64 @@ xlog_cil_ctx_alloc(void) return ctx; } +/* + * Aggregate the CIL per cpu structures into global counts, lists, etc and + * clear the percpu state ready for the next context to use. This is called + * from the push code with the context lock held exclusively, hence nothing else + * will be accessing or modifying the per-cpu counters. + */ +static void +xlog_cil_push_pcp_aggregate( + struct xfs_cil *cil, + struct xfs_cil_ctx *ctx) +{ + struct xlog_cil_pcp *cilpcp; + int cpu; + + for_each_online_cpu(cpu) { + cilpcp = per_cpu_ptr(cil->xc_pcp, cpu); + + /* + * We're in the middle of switching cil contexts. Reset the + * counter we use to detect when the current context is nearing + * full. + */ + cilpcp->space_used = 0; + } +} + +/* + * Aggregate the CIL per-cpu space used counters into the global atomic value. + * This is called when the per-cpu counter aggregation will first pass the soft + * limit threshold so we can switch to atomic counter aggregation for accurate + * detection of hard limit traversal. + */ +static void +xlog_cil_insert_pcp_aggregate( + struct xfs_cil *cil, + struct xfs_cil_ctx *ctx) +{ + struct xlog_cil_pcp *cilpcp; + int cpu; + int count = 0; + + /* Trigger atomic updates then aggregate only for the first caller */ + if (!test_and_clear_bit(XLOG_CIL_PCP_SPACE, &cil->xc_flags)) + return; + + for_each_online_cpu(cpu) { + int old, prev; + + cilpcp = per_cpu_ptr(cil->xc_pcp, cpu); + do { + old = cilpcp->space_used; + prev = cmpxchg(&cilpcp->space_used, old, 0); + } while (old != prev); + count += old; + } + atomic_add(count, &ctx->space_used); +} + static void xlog_cil_ctx_switch( struct xfs_cil *cil, @@ -115,6 +173,7 @@ xlog_cil_ctx_switch( { xlog_cil_set_iclog_hdr_count(cil); set_bit(XLOG_CIL_EMPTY, &cil->xc_flags); + set_bit(XLOG_CIL_PCP_SPACE, &cil->xc_flags); ctx->sequence = ++cil->xc_current_sequence; ctx->cil = cil; cil->xc_ctx = ctx; @@ -447,6 +506,23 @@ insert: } } +/* + * The use of lockless waitqueue_active() requires that the caller has + * serialised itself against the wakeup call in xlog_cil_push_work(). That + * can be done by either holding the push lock or the context lock. + */ +static inline bool +xlog_cil_over_hard_limit( + struct xlog *log, + int32_t space_used) +{ + if (waitqueue_active(&log->l_cilp->xc_push_wait)) + return true; + if (space_used >= XLOG_CIL_BLOCKING_SPACE_LIMIT(log)) + return true; + return false; +} + /* * Insert the log items into the CIL and calculate the difference in space * consumed by the item. Add the space to the checkpoint ticket and calculate @@ -465,6 +541,8 @@ xlog_cil_insert_items( struct xfs_log_item *lip; int len = 0; int iovhdr_res = 0, split_res = 0, ctx_res = 0; + int space_used; + struct xlog_cil_pcp *cilpcp; ASSERT(tp); @@ -474,6 +552,21 @@ xlog_cil_insert_items( */ xlog_cil_insert_format_items(log, tp, &len); + /* + * Subtract the space released by intent cancelation from the space we + * consumed so that we remove it from the CIL space and add it back to + * the current transaction reservation context. + */ + len -= released_space; + + /* + * Grab the per-cpu pointer for the CIL before we start any accounting. + * That ensures that we are running with pre-emption disabled and so we + * can't be scheduled away between split sample/update operations that + * are done without outside locking to serialise them. + */ + cilpcp = get_cpu_ptr(cil->xc_pcp); + /* * We need to take the CIL checkpoint unit reservation on the first * commit into the CIL. Test the XLOG_CIL_EMPTY bit first so we don't @@ -500,10 +593,14 @@ xlog_cil_insert_items( * push won't run out of reservation space. * * This can steal more than we need, but that's OK. + * + * The cil->xc_ctx_lock provides the serialisation necessary for safely + * calling xlog_cil_over_hard_limit() in this context. */ + space_used = atomic_read(&ctx->space_used) + cilpcp->space_used + len; if (atomic_read(&cil->xc_iclog_hdrs) > 0 || - ctx->space_used + len >= XLOG_CIL_BLOCKING_SPACE_LIMIT(log)) { - int split_res = log->l_iclog_hsize + + xlog_cil_over_hard_limit(log, space_used)) { + split_res = log->l_iclog_hsize + sizeof(struct xlog_op_header); if (ctx_res) ctx_res += split_res * (tp->t_ticket->t_iclog_hdrs - 1); @@ -512,29 +609,31 @@ xlog_cil_insert_items( atomic_sub(tp->t_ticket->t_iclog_hdrs, &cil->xc_iclog_hdrs); } - spin_lock(&cil->xc_cil_lock); - tp->t_ticket->t_curr_res -= ctx_res + len; - ctx->ticket->t_unit_res += ctx_res; - ctx->ticket->t_curr_res += ctx_res; - ctx->space_used += len; - - tp->t_ticket->t_curr_res += released_space; - ctx->space_used -= released_space; - /* - * If we've overrun the reservation, dump the tx details before we move - * the log items. Shutdown is imminent... + * Accurately account when over the soft limit, otherwise fold the + * percpu count into the global count if over the per-cpu threshold. */ - if (WARN_ON(tp->t_ticket->t_curr_res < 0)) { - xfs_warn(log->l_mp, "Transaction log reservation overrun:"); - xfs_warn(log->l_mp, - " log items: %d bytes (iov hdrs: %d bytes)", - len, iovhdr_res); - xfs_warn(log->l_mp, " split region headers: %d bytes", - split_res); - xfs_warn(log->l_mp, " ctx ticket: %d bytes", ctx_res); - xlog_print_trans(tp); + if (!test_bit(XLOG_CIL_PCP_SPACE, &cil->xc_flags)) { + atomic_add(len, &ctx->space_used); + } else if (cilpcp->space_used + len > + (XLOG_CIL_SPACE_LIMIT(log) / num_online_cpus())) { + space_used = atomic_add_return(cilpcp->space_used + len, + &ctx->space_used); + cilpcp->space_used = 0; + + /* + * If we just transitioned over the soft limit, we need to + * transition to the global atomic counter. + */ + if (space_used >= XLOG_CIL_SPACE_LIMIT(log)) + xlog_cil_insert_pcp_aggregate(cil, ctx); + } else { + cilpcp->space_used += len; } + put_cpu_ptr(cilpcp); + + spin_lock(&cil->xc_cil_lock); + ctx->ticket->t_curr_res += ctx_res; /* * Now (re-)position everything modified at the tail of the CIL. @@ -542,7 +641,6 @@ xlog_cil_insert_items( * the transaction commit. */ list_for_each_entry(lip, &tp->t_items, li_trans) { - /* Skip items which aren't dirty in this transaction. */ if (!test_bit(XFS_LI_DIRTY, &lip->li_flags)) continue; @@ -561,8 +659,22 @@ xlog_cil_insert_items( list_splice_init(&tp->t_busy, &ctx->busy_extents); spin_unlock(&cil->xc_cil_lock); - if (tp->t_ticket->t_curr_res < 0) + /* + * If we've overrun the reservation, dump the tx details before we move + * the log items. Shutdown is imminent... + */ + tp->t_ticket->t_curr_res -= ctx_res + len; + if (WARN_ON(tp->t_ticket->t_curr_res < 0)) { + xfs_warn(log->l_mp, "Transaction log reservation overrun:"); + xfs_warn(log->l_mp, + " log items: %d bytes (iov hdrs: %d bytes)", + len, iovhdr_res); + xfs_warn(log->l_mp, " split region headers: %d bytes", + split_res); + xfs_warn(log->l_mp, " ctx ticket: %d bytes", ctx_res); + xlog_print_trans(tp); xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); + } } static void @@ -1076,6 +1188,8 @@ xlog_cil_push_work( if (waitqueue_active(&cil->xc_push_wait)) wake_up_all(&cil->xc_push_wait); + xlog_cil_push_pcp_aggregate(cil, ctx); + /* * Check if we've anything to push. If there is nothing, then we don't * move on to a new sequence number and so we have to be able to push @@ -1259,6 +1373,7 @@ xlog_cil_push_background( struct xlog *log) __releases(cil->xc_ctx_lock) { struct xfs_cil *cil = log->l_cilp; + int space_used = atomic_read(&cil->xc_ctx->space_used); /* * The cil won't be empty because we are called while holding the @@ -1271,7 +1386,7 @@ xlog_cil_push_background( * Don't do a background push if we haven't used up all the * space available yet. */ - if (cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log)) { + if (space_used < XLOG_CIL_SPACE_LIMIT(log)) { up_read(&cil->xc_ctx_lock); return; } @@ -1298,12 +1413,11 @@ xlog_cil_push_background( * dipping back down under the hard limit. * * The ctx->xc_push_lock provides the serialisation necessary for safely - * using the lockless waitqueue_active() check in this context. + * calling xlog_cil_over_hard_limit() in this context. */ - if (cil->xc_ctx->space_used >= XLOG_CIL_BLOCKING_SPACE_LIMIT(log) || - waitqueue_active(&cil->xc_push_wait)) { + if (xlog_cil_over_hard_limit(log, space_used)) { trace_xfs_log_cil_wait(log, cil->xc_ctx->ticket); - ASSERT(cil->xc_ctx->space_used < log->l_logsize); + ASSERT(space_used < log->l_logsize); xlog_wait(&cil->xc_push_wait, &cil->xc_push_lock); return; } @@ -1631,9 +1745,11 @@ xlog_cil_pcp_dead( unsigned int cpu) { struct xfs_cil *cil = log->l_cilp; + struct xlog_cil_pcp *cilpcp = per_cpu_ptr(cil->xc_pcp, cpu); down_write(&cil->xc_ctx_lock); - /* move stuff on dead CPU to context */ + atomic_add(cilpcp->space_used, &cil->xc_ctx->space_used); + cilpcp->space_used = 0; up_write(&cil->xc_ctx_lock); } diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 70483c78953e..f4c13704ef8c 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -222,7 +222,7 @@ struct xfs_cil_ctx { xfs_lsn_t commit_lsn; /* chkpt commit record lsn */ struct xlog_in_core *commit_iclog; struct xlog_ticket *ticket; /* chkpt ticket */ - int space_used; /* aggregate size of regions */ + atomic_t space_used; /* aggregate size of regions */ struct list_head busy_extents; /* busy extents in chkpt */ struct xfs_log_vec *lv_chain; /* logvecs being pushed */ struct list_head iclog_entry; @@ -235,6 +235,7 @@ struct xfs_cil_ctx { * Per-cpu CIL tracking items */ struct xlog_cil_pcp { + int32_t space_used; struct list_head busy_extents; struct list_head log_items; }; @@ -283,6 +284,7 @@ struct xfs_cil { /* xc_flags bit values */ #define XLOG_CIL_EMPTY 1 +#define XLOG_CIL_PCP_SPACE 2 /* * The amount of log space we allow the CIL to aggregate is difficult to size. -- cgit v1.2.3 From 1dd2a2c18e314ad89200f8296c86dd4ecd53dea6 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 7 Jul 2022 18:51:59 +1000 Subject: xfs: track CIL ticket reservation in percpu structure To get it out from under the cil spinlock. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong --- fs/xfs/xfs_log_cil.c | 16 ++++++++++++---- fs/xfs/xfs_log_priv.h | 1 + 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 2d16add7a8d4..e38e10082da2 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -125,6 +125,9 @@ xlog_cil_push_pcp_aggregate( for_each_online_cpu(cpu) { cilpcp = per_cpu_ptr(cil->xc_pcp, cpu); + ctx->ticket->t_curr_res += cilpcp->space_reserved; + cilpcp->space_reserved = 0; + /* * We're in the middle of switching cil contexts. Reset the * counter we use to detect when the current context is nearing @@ -608,6 +611,7 @@ xlog_cil_insert_items( ctx_res = split_res * tp->t_ticket->t_iclog_hdrs; atomic_sub(tp->t_ticket->t_iclog_hdrs, &cil->xc_iclog_hdrs); } + cilpcp->space_reserved += ctx_res; /* * Accurately account when over the soft limit, otherwise fold the @@ -632,14 +636,12 @@ xlog_cil_insert_items( } put_cpu_ptr(cilpcp); - spin_lock(&cil->xc_cil_lock); - ctx->ticket->t_curr_res += ctx_res; - /* * Now (re-)position everything modified at the tail of the CIL. * We do this here so we only need to take the CIL lock once during * the transaction commit. */ + spin_lock(&cil->xc_cil_lock); list_for_each_entry(lip, &tp->t_items, li_trans) { /* Skip items which aren't dirty in this transaction. */ if (!test_bit(XFS_LI_DIRTY, &lip->li_flags)) @@ -1746,9 +1748,15 @@ xlog_cil_pcp_dead( { struct xfs_cil *cil = log->l_cilp; struct xlog_cil_pcp *cilpcp = per_cpu_ptr(cil->xc_pcp, cpu); + struct xfs_cil_ctx *ctx; down_write(&cil->xc_ctx_lock); - atomic_add(cilpcp->space_used, &cil->xc_ctx->space_used); + ctx = cil->xc_ctx; + if (ctx->ticket) + ctx->ticket->t_curr_res += cilpcp->space_reserved; + cilpcp->space_reserved = 0; + + atomic_add(cilpcp->space_used, &ctx->space_used); cilpcp->space_used = 0; up_write(&cil->xc_ctx_lock); } diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index f4c13704ef8c..05a5668d8789 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -236,6 +236,7 @@ struct xfs_cil_ctx { */ struct xlog_cil_pcp { int32_t space_used; + uint32_t space_reserved; struct list_head busy_extents; struct list_head log_items; }; -- cgit v1.2.3 From df7a4a2134b0a201c93e96efe4bb2be747f9da9f Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 7 Jul 2022 18:52:59 +1000 Subject: xfs: convert CIL busy extents to per-cpu To get them out from under the CIL lock. This is an unordered list, so we can simply punt it to per-cpu lists during transaction commits and reaggregate it back into a single list during the CIL push work. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong --- fs/xfs/xfs_log_cil.c | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index e38e10082da2..f02a75d5a03e 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -128,6 +128,11 @@ xlog_cil_push_pcp_aggregate( ctx->ticket->t_curr_res += cilpcp->space_reserved; cilpcp->space_reserved = 0; + if (!list_empty(&cilpcp->busy_extents)) { + list_splice_init(&cilpcp->busy_extents, + &ctx->busy_extents); + } + /* * We're in the middle of switching cil contexts. Reset the * counter we use to detect when the current context is nearing @@ -634,6 +639,9 @@ xlog_cil_insert_items( } else { cilpcp->space_used += len; } + /* attach the transaction to the CIL if it has any busy extents */ + if (!list_empty(&tp->t_busy)) + list_splice_init(&tp->t_busy, &cilpcp->busy_extents); put_cpu_ptr(cilpcp); /* @@ -656,9 +664,6 @@ xlog_cil_insert_items( list_move_tail(&lip->li_cil, &cil->xc_cil); } - /* attach the transaction to the CIL if it has any busy extents */ - if (!list_empty(&tp->t_busy)) - list_splice_init(&tp->t_busy, &ctx->busy_extents); spin_unlock(&cil->xc_cil_lock); /* @@ -1756,6 +1761,8 @@ xlog_cil_pcp_dead( ctx->ticket->t_curr_res += cilpcp->space_reserved; cilpcp->space_reserved = 0; + if (!list_empty(&cilpcp->busy_extents)) + list_splice_init(&cilpcp->busy_extents, &ctx->busy_extents); atomic_add(cilpcp->space_used, &ctx->space_used); cilpcp->space_used = 0; up_write(&cil->xc_ctx_lock); @@ -1766,10 +1773,12 @@ xlog_cil_pcp_dead( */ int xlog_cil_init( - struct xlog *log) + struct xlog *log) { - struct xfs_cil *cil; - struct xfs_cil_ctx *ctx; + struct xfs_cil *cil; + struct xfs_cil_ctx *ctx; + struct xlog_cil_pcp *cilpcp; + int cpu; cil = kmem_zalloc(sizeof(*cil), KM_MAYFAIL); if (!cil) @@ -1789,6 +1798,11 @@ xlog_cil_init( if (!cil->xc_pcp) goto out_destroy_wq; + for_each_possible_cpu(cpu) { + cilpcp = per_cpu_ptr(cil->xc_pcp, cpu); + INIT_LIST_HEAD(&cilpcp->busy_extents); + } + INIT_LIST_HEAD(&cil->xc_cil); INIT_LIST_HEAD(&cil->xc_committing); spin_lock_init(&cil->xc_cil_lock); -- cgit v1.2.3 From 016a23388cdcb2740deb1379dc408f21c84efb11 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 7 Jul 2022 18:53:59 +1000 Subject: xfs: Add order IDs to log items in CIL Before we split the ordered CIL up into per cpu lists, we need a mechanism to track the order of the items in the CIL. We need to do this because there are rules around the order in which related items must physically appear in the log even inside a single checkpoint transaction. An example of this is intents - an intent must appear in the log before it's intent done record so that log recovery can cancel the intent correctly. If we have these two records misordered in the CIL, then they will not be recovered correctly by journal replay. We also will not be able to move items to the tail of the CIL list when they are relogged, hence the log items will need some mechanism to allow the correct log item order to be recreated before we write log items to the hournal. Hence we need to have a mechanism for recording global order of transactions in the log items so that we can recover that order from un-ordered per-cpu lists. Do this with a simple monotonic increasing commit counter in the CIL context. Each log item in the transaction gets stamped with the current commit order ID before it is added to the CIL. If the item is already in the CIL, leave it where it is instead of moving it to the tail of the list and instead sort the list before we start the push work. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong --- fs/xfs/xfs_log_cil.c | 39 +++++++++++++++++++++++++++++++-------- fs/xfs/xfs_log_priv.h | 1 + fs/xfs/xfs_trans.h | 1 + 3 files changed, 33 insertions(+), 8 deletions(-) diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index f02a75d5a03e..6bc540898e3a 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -550,6 +550,7 @@ xlog_cil_insert_items( int len = 0; int iovhdr_res = 0, split_res = 0, ctx_res = 0; int space_used; + int order; struct xlog_cil_pcp *cilpcp; ASSERT(tp); @@ -645,23 +646,22 @@ xlog_cil_insert_items( put_cpu_ptr(cilpcp); /* - * Now (re-)position everything modified at the tail of the CIL. + * Now update the order of everything modified in the transaction + * and insert items into the CIL if they aren't already there. * We do this here so we only need to take the CIL lock once during * the transaction commit. */ + order = atomic_inc_return(&ctx->order_id); spin_lock(&cil->xc_cil_lock); list_for_each_entry(lip, &tp->t_items, li_trans) { /* Skip items which aren't dirty in this transaction. */ if (!test_bit(XFS_LI_DIRTY, &lip->li_flags)) continue; - /* - * Only move the item if it isn't already at the tail. This is - * to prevent a transient list_empty() state when reinserting - * an item that is already the only item in the CIL. - */ - if (!list_is_last(&lip->li_cil, &cil->xc_cil)) - list_move_tail(&lip->li_cil, &cil->xc_cil); + lip->li_order_id = order; + if (!list_empty(&lip->li_cil)) + continue; + list_add_tail(&lip->li_cil, &cil->xc_cil); } spin_unlock(&cil->xc_cil_lock); @@ -1082,6 +1082,26 @@ xlog_cil_build_trans_hdr( tic->t_curr_res -= lvhdr->lv_bytes; } +/* + * CIL item reordering compare function. We want to order in ascending ID order, + * but we want to leave items with the same ID in the order they were added to + * the list. This is important for operations like reflink where we log 4 order + * dependent intents in a single transaction when we overwrite an existing + * shared extent with a new shared extent. i.e. BUI(unmap), CUI(drop), + * CUI (inc), BUI(remap)... + */ +static int +xlog_cil_order_cmp( + void *priv, + const struct list_head *a, + const struct list_head *b) +{ + struct xfs_log_item *l1 = container_of(a, struct xfs_log_item, li_cil); + struct xfs_log_item *l2 = container_of(b, struct xfs_log_item, li_cil); + + return l1->li_order_id > l2->li_order_id; +} + /* * Pull all the log vectors off the items in the CIL, and remove the items from * the CIL. We don't need the CIL lock here because it's only needed on the @@ -1101,6 +1121,8 @@ xlog_cil_build_lv_chain( { struct xfs_log_vec *lv = NULL; + list_sort(NULL, &cil->xc_cil, xlog_cil_order_cmp); + while (!list_empty(&cil->xc_cil)) { struct xfs_log_item *item; @@ -1114,6 +1136,7 @@ xlog_cil_build_lv_chain( } list_del_init(&item->li_cil); + item->li_order_id = 0; if (!ctx->lv_chain) ctx->lv_chain = item->li_lv; else diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 05a5668d8789..563145ea0f7d 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -229,6 +229,7 @@ struct xfs_cil_ctx { struct list_head committing; /* ctx committing list */ struct work_struct discard_endio_work; struct work_struct push_work; + atomic_t order_id; }; /* diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 9561f193e7e1..29927ceecf82 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -45,6 +45,7 @@ struct xfs_log_item { struct xfs_log_vec *li_lv; /* active log vector */ struct xfs_log_vec *li_lv_shadow; /* standby vector */ xfs_csn_t li_seq; /* CIL commit seq */ + uint32_t li_order_id; /* CIL commit order */ }; /* -- cgit v1.2.3 From c0fb4765c5086cfd00f1158f5f44e7e1906530ad Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 7 Jul 2022 18:54:59 +1000 Subject: xfs: convert CIL to unordered per cpu lists So that we can remove the cil_lock which is a global serialisation point. We've already got ordering sorted, so all we need to do is treat the CIL list like the busy extent list and reconstruct it before the push starts. This is what we're trying to avoid: - 75.35% 1.83% [kernel] [k] xfs_log_commit_cil - 46.35% xfs_log_commit_cil - 41.54% _raw_spin_lock - 67.30% do_raw_spin_lock 66.96% __pv_queued_spin_lock_slowpath Which happens on a 32p system when running a 32-way 'rm -rf' workload. After this patch: - 20.90% 3.23% [kernel] [k] xfs_log_commit_cil - 17.67% xfs_log_commit_cil - 6.51% xfs_log_ticket_ungrant 1.40% xfs_log_space_wake 2.32% memcpy_erms - 2.18% xfs_buf_item_committing - 2.12% xfs_buf_item_release - 1.03% xfs_buf_unlock 0.96% up 0.72% xfs_buf_rele 1.33% xfs_inode_item_format 1.19% down_read 0.91% up_read 0.76% xfs_buf_item_format - 0.68% kmem_alloc_large - 0.67% kmem_alloc 0.64% __kmalloc 0.50% xfs_buf_item_size It kinda looks like the workload is running out of log space all the time. But all the spinlock contention is gone and the transaction commit rate has gone from 800k/s to 1.3M/s so the amount of real work being done has gone up a *lot*. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong --- fs/xfs/xfs_log_cil.c | 35 ++++++++++++++++------------------- fs/xfs/xfs_log_priv.h | 3 +-- 2 files changed, 17 insertions(+), 21 deletions(-) diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 6bc540898e3a..a0792b8db38b 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -104,6 +104,7 @@ xlog_cil_ctx_alloc(void) ctx = kmem_zalloc(sizeof(*ctx), KM_NOFS); INIT_LIST_HEAD(&ctx->committing); INIT_LIST_HEAD(&ctx->busy_extents); + INIT_LIST_HEAD(&ctx->log_items); INIT_WORK(&ctx->push_work, xlog_cil_push_work); return ctx; } @@ -132,6 +133,8 @@ xlog_cil_push_pcp_aggregate( list_splice_init(&cilpcp->busy_extents, &ctx->busy_extents); } + if (!list_empty(&cilpcp->log_items)) + list_splice_init(&cilpcp->log_items, &ctx->log_items); /* * We're in the middle of switching cil contexts. Reset the @@ -579,10 +582,9 @@ xlog_cil_insert_items( /* * We need to take the CIL checkpoint unit reservation on the first * commit into the CIL. Test the XLOG_CIL_EMPTY bit first so we don't - * unnecessarily do an atomic op in the fast path here. We don't need to - * hold the xc_cil_lock here to clear the XLOG_CIL_EMPTY bit as we are - * under the xc_ctx_lock here and that needs to be held exclusively to - * reset the XLOG_CIL_EMPTY bit. + * unnecessarily do an atomic op in the fast path here. We can clear the + * XLOG_CIL_EMPTY bit as we are under the xc_ctx_lock here and that + * needs to be held exclusively to reset the XLOG_CIL_EMPTY bit. */ if (test_bit(XLOG_CIL_EMPTY, &cil->xc_flags) && test_and_clear_bit(XLOG_CIL_EMPTY, &cil->xc_flags)) @@ -643,7 +645,6 @@ xlog_cil_insert_items( /* attach the transaction to the CIL if it has any busy extents */ if (!list_empty(&tp->t_busy)) list_splice_init(&tp->t_busy, &cilpcp->busy_extents); - put_cpu_ptr(cilpcp); /* * Now update the order of everything modified in the transaction @@ -652,7 +653,6 @@ xlog_cil_insert_items( * the transaction commit. */ order = atomic_inc_return(&ctx->order_id); - spin_lock(&cil->xc_cil_lock); list_for_each_entry(lip, &tp->t_items, li_trans) { /* Skip items which aren't dirty in this transaction. */ if (!test_bit(XFS_LI_DIRTY, &lip->li_flags)) @@ -661,10 +661,9 @@ xlog_cil_insert_items( lip->li_order_id = order; if (!list_empty(&lip->li_cil)) continue; - list_add_tail(&lip->li_cil, &cil->xc_cil); + list_add_tail(&lip->li_cil, &cilpcp->log_items); } - - spin_unlock(&cil->xc_cil_lock); + put_cpu_ptr(cilpcp); /* * If we've overrun the reservation, dump the tx details before we move @@ -1113,7 +1112,6 @@ xlog_cil_order_cmp( */ static void xlog_cil_build_lv_chain( - struct xfs_cil *cil, struct xfs_cil_ctx *ctx, struct list_head *whiteouts, uint32_t *num_iovecs, @@ -1121,12 +1119,12 @@ xlog_cil_build_lv_chain( { struct xfs_log_vec *lv = NULL; - list_sort(NULL, &cil->xc_cil, xlog_cil_order_cmp); + list_sort(NULL, &ctx->log_items, xlog_cil_order_cmp); - while (!list_empty(&cil->xc_cil)) { + while (!list_empty(&ctx->log_items)) { struct xfs_log_item *item; - item = list_first_entry(&cil->xc_cil, + item = list_first_entry(&ctx->log_items, struct xfs_log_item, li_cil); if (test_bit(XFS_LI_WHITEOUT, &item->li_flags)) { @@ -1265,7 +1263,7 @@ xlog_cil_push_work( list_add(&ctx->committing, &cil->xc_committing); spin_unlock(&cil->xc_push_lock); - xlog_cil_build_lv_chain(cil, ctx, &whiteouts, &num_iovecs, &num_bytes); + xlog_cil_build_lv_chain(ctx, &whiteouts, &num_iovecs, &num_bytes); /* * Switch the contexts so we can drop the context lock and move out @@ -1409,7 +1407,6 @@ xlog_cil_push_background( * The cil won't be empty because we are called while holding the * context lock so whatever we added to the CIL will still be there. */ - ASSERT(!list_empty(&cil->xc_cil)); ASSERT(!test_bit(XLOG_CIL_EMPTY, &cil->xc_flags)); /* @@ -1656,7 +1653,7 @@ xlog_cil_flush( * If the CIL is empty, make sure that any previous checkpoint that may * still be in an active iclog is pushed to stable storage. */ - if (list_empty(&log->l_cilp->xc_cil)) + if (test_bit(XLOG_CIL_EMPTY, &log->l_cilp->xc_flags)) xfs_log_force(log->l_mp, 0); } @@ -1784,6 +1781,8 @@ xlog_cil_pcp_dead( ctx->ticket->t_curr_res += cilpcp->space_reserved; cilpcp->space_reserved = 0; + if (!list_empty(&cilpcp->log_items)) + list_splice_init(&cilpcp->log_items, &ctx->log_items); if (!list_empty(&cilpcp->busy_extents)) list_splice_init(&cilpcp->busy_extents, &ctx->busy_extents); atomic_add(cilpcp->space_used, &ctx->space_used); @@ -1824,11 +1823,10 @@ xlog_cil_init( for_each_possible_cpu(cpu) { cilpcp = per_cpu_ptr(cil->xc_pcp, cpu); INIT_LIST_HEAD(&cilpcp->busy_extents); + INIT_LIST_HEAD(&cilpcp->log_items); } - INIT_LIST_HEAD(&cil->xc_cil); INIT_LIST_HEAD(&cil->xc_committing); - spin_lock_init(&cil->xc_cil_lock); spin_lock_init(&cil->xc_push_lock); init_waitqueue_head(&cil->xc_push_wait); init_rwsem(&cil->xc_ctx_lock); @@ -1859,7 +1857,6 @@ xlog_cil_destroy( kmem_free(cil->xc_ctx); } - ASSERT(list_empty(&cil->xc_cil)); ASSERT(test_bit(XLOG_CIL_EMPTY, &cil->xc_flags)); free_percpu(cil->xc_pcp); destroy_workqueue(cil->xc_push_wq); diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 563145ea0f7d..f00f11c18116 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -224,6 +224,7 @@ struct xfs_cil_ctx { struct xlog_ticket *ticket; /* chkpt ticket */ atomic_t space_used; /* aggregate size of regions */ struct list_head busy_extents; /* busy extents in chkpt */ + struct list_head log_items; /* log items in chkpt */ struct xfs_log_vec *lv_chain; /* logvecs being pushed */ struct list_head iclog_entry; struct list_head committing; /* ctx committing list */ @@ -262,8 +263,6 @@ struct xfs_cil { struct xlog *xc_log; unsigned long xc_flags; atomic_t xc_iclog_hdrs; - struct list_head xc_cil; - spinlock_t xc_cil_lock; struct workqueue_struct *xc_push_wq; struct rw_semaphore xc_ctx_lock ____cacheline_aligned_in_smp; -- cgit v1.2.3 From 169248536a2b28e4228ba63772936c1ba979c9c0 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 7 Jul 2022 18:55:59 +1000 Subject: xfs: convert log vector chain to use list heads Because the next change is going to require sorting log vectors, and that requires arbitrary rearrangement of the list which cannot be done easily with a single linked list. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong --- fs/xfs/xfs_log.c | 11 +++++----- fs/xfs/xfs_log.h | 2 +- fs/xfs/xfs_log_cil.c | 54 +++++++++++++++++++++++++++---------------------- fs/xfs/xfs_log_priv.h | 4 ++-- fs/xfs/xfs_trans.c | 4 ++-- fs/xfs/xfs_trans_priv.h | 3 ++- 6 files changed, 43 insertions(+), 35 deletions(-) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index e4416e80a25f..296f6ac43448 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -944,6 +944,8 @@ xlog_write_unmount_record( .lv_niovecs = 1, .lv_iovecp = ®, }; + LIST_HEAD(lv_chain); + list_add(&vec.lv_list, &lv_chain); BUILD_BUG_ON((sizeof(struct xlog_op_header) + sizeof(struct xfs_unmount_log_format)) != @@ -952,7 +954,7 @@ xlog_write_unmount_record( /* account for space used by record data */ ticket->t_curr_res -= sizeof(unmount_rec); - return xlog_write(log, NULL, &vec, ticket, reg.i_len); + return xlog_write(log, NULL, &lv_chain, ticket, reg.i_len); } /* @@ -2471,13 +2473,13 @@ int xlog_write( struct xlog *log, struct xfs_cil_ctx *ctx, - struct xfs_log_vec *log_vector, + struct list_head *lv_chain, struct xlog_ticket *ticket, uint32_t len) { struct xlog_in_core *iclog = NULL; - struct xfs_log_vec *lv = log_vector; + struct xfs_log_vec *lv; uint32_t record_cnt = 0; uint32_t data_cnt = 0; int error = 0; @@ -2505,7 +2507,7 @@ xlog_write( if (ctx) xlog_cil_set_ctx_write_state(ctx, iclog); - while (lv) { + list_for_each_entry(lv, lv_chain, lv_list) { /* * If the entire log vec does not fit in the iclog, punt it to * the partial copy loop which can handle this case. @@ -2526,7 +2528,6 @@ xlog_write( xlog_write_full(lv, ticket, iclog, &log_offset, &len, &record_cnt, &data_cnt); } - lv = lv->lv_next; } ASSERT(len == 0); diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index f3ce046a7d45..e7bf3c780cb4 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -9,7 +9,7 @@ struct xfs_cil_ctx; struct xfs_log_vec { - struct xfs_log_vec *lv_next; /* next lv in build list */ + struct list_head lv_list; /* CIL lv chain ptrs */ int lv_niovecs; /* number of iovecs in lv */ struct xfs_log_iovec *lv_iovecp; /* iovec array */ struct xfs_log_item *lv_item; /* owner */ diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index a0792b8db38b..0e69e855d710 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -105,6 +105,7 @@ xlog_cil_ctx_alloc(void) INIT_LIST_HEAD(&ctx->committing); INIT_LIST_HEAD(&ctx->busy_extents); INIT_LIST_HEAD(&ctx->log_items); + INIT_LIST_HEAD(&ctx->lv_chain); INIT_WORK(&ctx->push_work, xlog_cil_push_work); return ctx; } @@ -338,6 +339,7 @@ xlog_cil_alloc_shadow_bufs( memset(lv, 0, xlog_cil_iovec_space(niovecs)); + INIT_LIST_HEAD(&lv->lv_list); lv->lv_item = lip; lv->lv_size = buf_size; if (ordered) @@ -353,7 +355,6 @@ xlog_cil_alloc_shadow_bufs( else lv->lv_buf_len = 0; lv->lv_bytes = 0; - lv->lv_next = NULL; } /* Ensure the lv is set up according to ->iop_size */ @@ -480,7 +481,6 @@ xlog_cil_insert_format_items( if (lip->li_lv && shadow->lv_size <= lip->li_lv->lv_size) { /* same or smaller, optimise common overwrite case */ lv = lip->li_lv; - lv->lv_next = NULL; if (ordered) goto insert; @@ -685,14 +685,14 @@ xlog_cil_insert_items( static void xlog_cil_free_logvec( - struct xfs_log_vec *log_vector) + struct list_head *lv_chain) { struct xfs_log_vec *lv; - for (lv = log_vector; lv; ) { - struct xfs_log_vec *next = lv->lv_next; + while (!list_empty(lv_chain)) { + lv = list_first_entry(lv_chain, struct xfs_log_vec, lv_list); + list_del_init(&lv->lv_list); kmem_free(lv); - lv = next; } } @@ -792,7 +792,7 @@ xlog_cil_committed( spin_unlock(&ctx->cil->xc_push_lock); } - xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, ctx->lv_chain, + xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, &ctx->lv_chain, ctx->start_lsn, abort); xfs_extent_busy_sort(&ctx->busy_extents); @@ -803,7 +803,7 @@ xlog_cil_committed( list_del(&ctx->committing); spin_unlock(&ctx->cil->xc_push_lock); - xlog_cil_free_logvec(ctx->lv_chain); + xlog_cil_free_logvec(&ctx->lv_chain); if (!list_empty(&ctx->busy_extents)) xlog_discard_busy_extents(mp, ctx); @@ -962,7 +962,6 @@ restart: static int xlog_cil_write_chain( struct xfs_cil_ctx *ctx, - struct xfs_log_vec *chain, uint32_t chain_len) { struct xlog *log = ctx->cil->xc_log; @@ -971,7 +970,7 @@ xlog_cil_write_chain( error = xlog_cil_order_write(ctx->cil, ctx->sequence, _START_RECORD); if (error) return error; - return xlog_write(log, ctx, chain, ctx->ticket, chain_len); + return xlog_write(log, ctx, &ctx->lv_chain, ctx->ticket, chain_len); } /* @@ -1000,6 +999,8 @@ xlog_cil_write_commit_record( .lv_iovecp = ®, }; int error; + LIST_HEAD(lv_chain); + list_add(&vec.lv_list, &lv_chain); if (xlog_is_shutdown(log)) return -EIO; @@ -1010,7 +1011,7 @@ xlog_cil_write_commit_record( /* account for space used by record data */ ctx->ticket->t_curr_res -= reg.i_len; - error = xlog_write(log, ctx, &vec, ctx->ticket, reg.i_len); + error = xlog_write(log, ctx, &lv_chain, ctx->ticket, reg.i_len); if (error) xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); return error; @@ -1076,7 +1077,6 @@ xlog_cil_build_trans_hdr( lvhdr->lv_niovecs = 2; lvhdr->lv_iovecp = &hdr->lhdr[0]; lvhdr->lv_bytes = hdr->lhdr[0].i_len + hdr->lhdr[1].i_len; - lvhdr->lv_next = ctx->lv_chain; tic->t_curr_res -= lvhdr->lv_bytes; } @@ -1117,12 +1117,11 @@ xlog_cil_build_lv_chain( uint32_t *num_iovecs, uint32_t *num_bytes) { - struct xfs_log_vec *lv = NULL; - list_sort(NULL, &ctx->log_items, xlog_cil_order_cmp); while (!list_empty(&ctx->log_items)) { struct xfs_log_item *item; + struct xfs_log_vec *lv; item = list_first_entry(&ctx->log_items, struct xfs_log_item, li_cil); @@ -1133,19 +1132,17 @@ xlog_cil_build_lv_chain( continue; } - list_del_init(&item->li_cil); - item->li_order_id = 0; - if (!ctx->lv_chain) - ctx->lv_chain = item->li_lv; - else - lv->lv_next = item->li_lv; lv = item->li_lv; - item->li_lv = NULL; - *num_iovecs += lv->lv_niovecs; /* we don't write ordered log vectors */ if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED) *num_bytes += lv->lv_bytes; + *num_iovecs += lv->lv_niovecs; + list_add_tail(&lv->lv_list, &ctx->lv_chain); + + list_del_init(&item->li_cil); + item->li_order_id = 0; + item->li_lv = NULL; } } @@ -1189,7 +1186,7 @@ xlog_cil_push_work( int num_bytes = 0; int error = 0; struct xlog_cil_trans_hdr thdr; - struct xfs_log_vec lvhdr = { NULL }; + struct xfs_log_vec lvhdr = {}; xfs_csn_t push_seq; bool push_commit_stable; LIST_HEAD (whiteouts); @@ -1299,11 +1296,20 @@ xlog_cil_push_work( * Build a checkpoint transaction header and write it to the log to * begin the transaction. We need to account for the space used by the * transaction header here as it is not accounted for in xlog_write(). + * Add the lvhdr to the head of the lv chain we pass to xlog_write() so + * it gets written into the iclog first. */ xlog_cil_build_trans_hdr(ctx, &thdr, &lvhdr, num_iovecs); num_bytes += lvhdr.lv_bytes; + list_add(&lvhdr.lv_list, &ctx->lv_chain); - error = xlog_cil_write_chain(ctx, &lvhdr, num_bytes); + /* + * Take the lvhdr back off the lv_chain immediately after calling + * xlog_cil_write_chain() as it should not be passed to log IO + * completion. + */ + error = xlog_cil_write_chain(ctx, num_bytes); + list_del(&lvhdr.lv_list); if (error) goto out_abort_free_ticket; diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index f00f11c18116..d4270a2d4ffb 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -225,7 +225,7 @@ struct xfs_cil_ctx { atomic_t space_used; /* aggregate size of regions */ struct list_head busy_extents; /* busy extents in chkpt */ struct list_head log_items; /* log items in chkpt */ - struct xfs_log_vec *lv_chain; /* logvecs being pushed */ + struct list_head lv_chain; /* logvecs being pushed */ struct list_head iclog_entry; struct list_head committing; /* ctx committing list */ struct work_struct discard_endio_work; @@ -508,7 +508,7 @@ struct xlog_ticket *xlog_ticket_alloc(struct xlog *log, int unit_bytes, void xlog_print_tic_res(struct xfs_mount *mp, struct xlog_ticket *ticket); void xlog_print_trans(struct xfs_trans *); int xlog_write(struct xlog *log, struct xfs_cil_ctx *ctx, - struct xfs_log_vec *log_vector, struct xlog_ticket *tic, + struct list_head *lv_chain, struct xlog_ticket *tic, uint32_t len); void xfs_log_ticket_ungrant(struct xlog *log, struct xlog_ticket *ticket); void xfs_log_ticket_regrant(struct xlog *log, struct xlog_ticket *ticket); diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 82cf0189c0db..ec347717ce78 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -760,7 +760,7 @@ xfs_log_item_batch_insert( void xfs_trans_committed_bulk( struct xfs_ail *ailp, - struct xfs_log_vec *log_vector, + struct list_head *lv_chain, xfs_lsn_t commit_lsn, bool aborted) { @@ -775,7 +775,7 @@ xfs_trans_committed_bulk( spin_unlock(&ailp->ail_lock); /* unpin all the log items */ - for (lv = log_vector; lv; lv = lv->lv_next ) { + list_for_each_entry(lv, lv_chain, lv_list) { struct xfs_log_item *lip = lv->lv_item; xfs_lsn_t item_lsn; diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index f0d79a9050ba..d5400150358e 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h @@ -19,7 +19,8 @@ void xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *); void xfs_trans_del_item(struct xfs_log_item *); void xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp); -void xfs_trans_committed_bulk(struct xfs_ail *ailp, struct xfs_log_vec *lv, +void xfs_trans_committed_bulk(struct xfs_ail *ailp, + struct list_head *lv_chain, xfs_lsn_t commit_lsn, bool aborted); /* * AIL traversal cursor. -- cgit v1.2.3 From 4eb56069cb2835fafe569e27e746e6a4c9735186 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 7 Jul 2022 18:56:08 +1000 Subject: xfs: move CIL ordering to the logvec chain Adding a list_sort() call to the CIL push work while the xc_ctx_lock is held exclusively has resulted in fairly long lock hold times and that stops all front end transaction commits from making progress. We can move the sorting out of the xc_ctx_lock if we can transfer the ordering information to the log vectors as they are detached from the log items and then we can sort the log vectors. With these changes, we can move the list_sort() call to just before we call xlog_write() when we aren't holding any locks at all. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong --- fs/xfs/xfs_log.h | 1 + fs/xfs/xfs_log_cil.c | 16 +++++++++++----- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index e7bf3c780cb4..2728886c2963 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -10,6 +10,7 @@ struct xfs_cil_ctx; struct xfs_log_vec { struct list_head lv_list; /* CIL lv chain ptrs */ + uint32_t lv_order_id; /* chain ordering info */ int lv_niovecs; /* number of iovecs in lv */ struct xfs_log_iovec *lv_iovecp; /* iovec array */ struct xfs_log_item *lv_item; /* owner */ diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 0e69e855d710..8bb251d2b4d3 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -1095,10 +1095,10 @@ xlog_cil_order_cmp( const struct list_head *a, const struct list_head *b) { - struct xfs_log_item *l1 = container_of(a, struct xfs_log_item, li_cil); - struct xfs_log_item *l2 = container_of(b, struct xfs_log_item, li_cil); + struct xfs_log_vec *l1 = container_of(a, struct xfs_log_vec, lv_list); + struct xfs_log_vec *l2 = container_of(b, struct xfs_log_vec, lv_list); - return l1->li_order_id > l2->li_order_id; + return l1->lv_order_id > l2->lv_order_id; } /* @@ -1117,8 +1117,6 @@ xlog_cil_build_lv_chain( uint32_t *num_iovecs, uint32_t *num_bytes) { - list_sort(NULL, &ctx->log_items, xlog_cil_order_cmp); - while (!list_empty(&ctx->log_items)) { struct xfs_log_item *item; struct xfs_log_vec *lv; @@ -1133,6 +1131,7 @@ xlog_cil_build_lv_chain( } lv = item->li_lv; + lv->lv_order_id = item->li_order_id; /* we don't write ordered log vectors */ if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED) @@ -1292,6 +1291,13 @@ xlog_cil_push_work( spin_unlock(&cil->xc_push_lock); up_write(&cil->xc_ctx_lock); + /* + * Sort the log vector chain before we add the transaction headers. + * This ensures we always have the transaction headers at the start + * of the chain. + */ + list_sort(NULL, &ctx->lv_chain, xlog_cil_order_cmp); + /* * Build a checkpoint transaction header and write it to the log to * begin the transaction. We need to account for the space used by the -- cgit v1.2.3 From 1ccb0745a97fb8b38913b39b8ecb1aea39fdbcb0 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 7 Jul 2022 18:56:08 +1000 Subject: xfs: avoid cil push lock if possible Because now it hurts when the CIL fills up. - 37.20% __xfs_trans_commit - 35.84% xfs_log_commit_cil - 19.34% _raw_spin_lock - do_raw_spin_lock 19.01% __pv_queued_spin_lock_slowpath - 4.20% xfs_log_ticket_ungrant 0.90% xfs_log_space_wake Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong --- fs/xfs/xfs_log_cil.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 8bb251d2b4d3..78f8537860df 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -1422,10 +1422,18 @@ xlog_cil_push_background( ASSERT(!test_bit(XLOG_CIL_EMPTY, &cil->xc_flags)); /* - * Don't do a background push if we haven't used up all the - * space available yet. + * We are done if: + * - we haven't used up all the space available yet; or + * - we've already queued up a push; and + * - we're not over the hard limit; and + * - nothing has been over the hard limit. + * + * If so, we don't need to take the push lock as there's nothing to do. */ - if (space_used < XLOG_CIL_SPACE_LIMIT(log)) { + if (space_used < XLOG_CIL_SPACE_LIMIT(log) || + (cil->xc_push_seq == cil->xc_current_sequence && + space_used < XLOG_CIL_BLOCKING_SPACE_LIMIT(log) && + !waitqueue_active(&cil->xc_push_wait))) { up_read(&cil->xc_ctx_lock); return; } -- cgit v1.2.3 From d9f68777b2515452828d97b521ff8e3517b42eb1 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 7 Jul 2022 18:56:09 +1000 Subject: xfs: xlog_sync() manually adjusts grant head space When xlog_sync() rounds off the tail the iclog that is being flushed, it manually subtracts that space from the grant heads. This space is actually reserved by the transaction ticket that covers the xlog_sync() call from xlog_write(), but we don't plumb the ticket down far enough for it to account for the space consumed in the current log ticket. The grant heads are hot, so we really should be accounting this to the ticket is we can, rather than adding thousands of extra grant head updates every CIL commit. Interestingly, this actually indicates a potential log space overrun can occur when we force the log. By the time that xfs_log_force() pushes out an active iclog and consumes the roundoff space, the reservation for that roundoff space has been returned to the grant heads and is no longer covered by a reservation. In theory the roundoff added to log force on an already full log could push the write head past the tail. In practice, the CIL commit that writes to the log and needs the iclog pushed will have reserved space for roundoff, so when it releases the ticket there will still be physical space for the roundoff to be committed to the log, even though it is no longer reserved. This roundoff won't be enough space to allow a transaction to be woken if the log is full, so overruns should not actually occur in practice. That said, it indicates that we should not release the CIL context log ticket until after we've released the commit iclog. It also means that xlog_sync() still needs the direct grant head manipulation if we don't provide it with a ticket. Log forces are rare when we are in fast paths running 1.5 million transactions/s that make the grant heads hot, so let's optimise the hot case and pass CIL log tickets down to the xlog_sync() code. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong --- fs/xfs/xfs_log.c | 35 +++++++++++++++++++++++------------ fs/xfs/xfs_log_cil.c | 20 ++++++++++++++++---- fs/xfs/xfs_log_priv.h | 3 ++- 3 files changed, 41 insertions(+), 17 deletions(-) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 296f6ac43448..498cbb49392b 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -57,7 +57,8 @@ xlog_grant_push_ail( STATIC void xlog_sync( struct xlog *log, - struct xlog_in_core *iclog); + struct xlog_in_core *iclog, + struct xlog_ticket *ticket); #if defined(DEBUG) STATIC void xlog_verify_grant_tail( @@ -567,7 +568,8 @@ xlog_state_shutdown_callbacks( int xlog_state_release_iclog( struct xlog *log, - struct xlog_in_core *iclog) + struct xlog_in_core *iclog, + struct xlog_ticket *ticket) { xfs_lsn_t tail_lsn; bool last_ref; @@ -614,7 +616,7 @@ xlog_state_release_iclog( trace_xlog_iclog_syncing(iclog, _RET_IP_); spin_unlock(&log->l_icloglock); - xlog_sync(log, iclog); + xlog_sync(log, iclog, ticket); spin_lock(&log->l_icloglock); return 0; } @@ -881,7 +883,7 @@ xlog_force_iclog( iclog->ic_flags |= XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA; if (iclog->ic_state == XLOG_STATE_ACTIVE) xlog_state_switch_iclogs(iclog->ic_log, iclog, 0); - return xlog_state_release_iclog(iclog->ic_log, iclog); + return xlog_state_release_iclog(iclog->ic_log, iclog, NULL); } /* @@ -2027,7 +2029,8 @@ xlog_calc_iclog_size( STATIC void xlog_sync( struct xlog *log, - struct xlog_in_core *iclog) + struct xlog_in_core *iclog, + struct xlog_ticket *ticket) { unsigned int count; /* byte count of bwrite */ unsigned int roundoff; /* roundoff to BB or stripe */ @@ -2039,12 +2042,20 @@ xlog_sync( count = xlog_calc_iclog_size(log, iclog, &roundoff); - /* move grant heads by roundoff in sync */ - xlog_grant_add_space(log, &log->l_reserve_head.grant, roundoff); - xlog_grant_add_space(log, &log->l_write_head.grant, roundoff); + /* + * If we have a ticket, account for the roundoff via the ticket + * reservation to avoid touching the hot grant heads needlessly. + * Otherwise, we have to move grant heads directly. + */ + if (ticket) { + ticket->t_curr_res -= roundoff; + } else { + xlog_grant_add_space(log, &log->l_reserve_head.grant, roundoff); + xlog_grant_add_space(log, &log->l_write_head.grant, roundoff); + } /* put cycle number in every block */ - xlog_pack_data(log, iclog, roundoff); + xlog_pack_data(log, iclog, roundoff); /* real byte length */ size = iclog->ic_offset; @@ -2277,7 +2288,7 @@ xlog_write_get_more_iclog_space( spin_lock(&log->l_icloglock); ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC); xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt); - error = xlog_state_release_iclog(log, iclog); + error = xlog_state_release_iclog(log, iclog, ticket); spin_unlock(&log->l_icloglock); if (error) return error; @@ -2539,7 +2550,7 @@ xlog_write( */ spin_lock(&log->l_icloglock); xlog_state_finish_copy(log, iclog, record_cnt, 0); - error = xlog_state_release_iclog(log, iclog); + error = xlog_state_release_iclog(log, iclog, ticket); spin_unlock(&log->l_icloglock); return error; @@ -2959,7 +2970,7 @@ restart: * reference to the iclog. */ if (!atomic_add_unless(&iclog->ic_refcnt, -1, 1)) - error = xlog_state_release_iclog(log, iclog); + error = xlog_state_release_iclog(log, iclog, ticket); spin_unlock(&log->l_icloglock); if (error) return error; diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 78f8537860df..eccbfb99e894 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -1189,6 +1189,7 @@ xlog_cil_push_work( xfs_csn_t push_seq; bool push_commit_stable; LIST_HEAD (whiteouts); + struct xlog_ticket *ticket; new_ctx = xlog_cil_ctx_alloc(); new_ctx->ticket = xlog_cil_ticket_alloc(log); @@ -1323,7 +1324,14 @@ xlog_cil_push_work( if (error) goto out_abort_free_ticket; - xfs_log_ticket_ungrant(log, ctx->ticket); + /* + * Grab the ticket from the ctx so we can ungrant it after releasing the + * commit_iclog. The ctx may be freed by the time we return from + * releasing the commit_iclog (i.e. checkpoint has been completed and + * callback run) so we can't reference the ctx after the call to + * xlog_state_release_iclog(). + */ + ticket = ctx->ticket; /* * If the checkpoint spans multiple iclogs, wait for all previous iclogs @@ -1373,12 +1381,14 @@ xlog_cil_push_work( if (push_commit_stable && ctx->commit_iclog->ic_state == XLOG_STATE_ACTIVE) xlog_state_switch_iclogs(log, ctx->commit_iclog, 0); - xlog_state_release_iclog(log, ctx->commit_iclog); + ticket = ctx->ticket; + xlog_state_release_iclog(log, ctx->commit_iclog, ticket); /* Not safe to reference ctx now! */ spin_unlock(&log->l_icloglock); xlog_cil_cleanup_whiteouts(&whiteouts); + xfs_log_ticket_ungrant(log, ticket); return; out_skip: @@ -1388,17 +1398,19 @@ out_skip: return; out_abort_free_ticket: - xfs_log_ticket_ungrant(log, ctx->ticket); ASSERT(xlog_is_shutdown(log)); xlog_cil_cleanup_whiteouts(&whiteouts); if (!ctx->commit_iclog) { + xfs_log_ticket_ungrant(log, ctx->ticket); xlog_cil_committed(ctx); return; } spin_lock(&log->l_icloglock); - xlog_state_release_iclog(log, ctx->commit_iclog); + ticket = ctx->ticket; + xlog_state_release_iclog(log, ctx->commit_iclog, ticket); /* Not safe to reference ctx now! */ spin_unlock(&log->l_icloglock); + xfs_log_ticket_ungrant(log, ticket); } /* diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index d4270a2d4ffb..1bd2963e8fbd 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -515,7 +515,8 @@ void xfs_log_ticket_regrant(struct xlog *log, struct xlog_ticket *ticket); void xlog_state_switch_iclogs(struct xlog *log, struct xlog_in_core *iclog, int eventual_size); -int xlog_state_release_iclog(struct xlog *log, struct xlog_in_core *iclog); +int xlog_state_release_iclog(struct xlog *log, struct xlog_in_core *iclog, + struct xlog_ticket *ticket); /* * When we crack an atomic LSN, we sample it first so that the value will not -- cgit v1.2.3 From 51a117edff133a1ea8cb0fcbc599b8d5a34414e9 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 7 Jul 2022 18:56:09 +1000 Subject: xfs: expanding delayed logging design with background material I wrote up a description of how transactions, space reservations and relogging work together in response to a question for background material on the delayed logging design. Add this to the existing document for ease of future reference. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong --- .../filesystems/xfs-delayed-logging-design.rst | 361 ++++++++++++++++++--- 1 file changed, 322 insertions(+), 39 deletions(-) diff --git a/Documentation/filesystems/xfs-delayed-logging-design.rst b/Documentation/filesystems/xfs-delayed-logging-design.rst index 464405d2801e..4ef419f54663 100644 --- a/Documentation/filesystems/xfs-delayed-logging-design.rst +++ b/Documentation/filesystems/xfs-delayed-logging-design.rst @@ -1,29 +1,314 @@ .. SPDX-License-Identifier: GPL-2.0 -========================== -XFS Delayed Logging Design -========================== - -Introduction to Re-logging in XFS -================================= - -XFS logging is a combination of logical and physical logging. Some objects, -such as inodes and dquots, are logged in logical format where the details -logged are made up of the changes to in-core structures rather than on-disk -structures. Other objects - typically buffers - have their physical changes -logged. The reason for these differences is to reduce the amount of log space -required for objects that are frequently logged. Some parts of inodes are more -frequently logged than others, and inodes are typically more frequently logged -than any other object (except maybe the superblock buffer) so keeping the -amount of metadata logged low is of prime importance. - -The reason that this is such a concern is that XFS allows multiple separate -modifications to a single object to be carried in the log at any given time. -This allows the log to avoid needing to flush each change to disk before -recording a new change to the object. XFS does this via a method called -"re-logging". Conceptually, this is quite simple - all it requires is that any -new change to the object is recorded with a *new copy* of all the existing -changes in the new transaction that is written to the log. +================== +XFS Logging Design +================== + +Preamble +======== + +This document describes the design and algorithms that the XFS journalling +subsystem is based on. This document describes the design and algorithms that +the XFS journalling subsystem is based on so that readers may familiarize +themselves with the general concepts of how transaction processing in XFS works. + +We begin with an overview of transactions in XFS, followed by describing how +transaction reservations are structured and accounted, and then move into how we +guarantee forwards progress for long running transactions with finite initial +reservations bounds. At this point we need to explain how relogging works. With +the basic concepts covered, the design of the delayed logging mechanism is +documented. + + +Introduction +============ + +XFS uses Write Ahead Logging for ensuring changes to the filesystem metadata +are atomic and recoverable. For reasons of space and time efficiency, the +logging mechanisms are varied and complex, combining intents, logical and +physical logging mechanisms to provide the necessary recovery guarantees the +filesystem requires. + +Some objects, such as inodes and dquots, are logged in logical format where the +details logged are made up of the changes to in-core structures rather than +on-disk structures. Other objects - typically buffers - have their physical +changes logged. Long running atomic modifications have individual changes +chained together by intents, ensuring that journal recovery can restart and +finish an operation that was only partially done when the system stopped +functioning. + +The reason for these differences is to keep the amount of log space and CPU time +required to process objects being modified as small as possible and hence the +logging overhead as low as possible. Some items are very frequently modified, +and some parts of objects are more frequently modified than others, so keeping +the overhead of metadata logging low is of prime importance. + +The method used to log an item or chain modifications together isn't +particularly important in the scope of this document. It suffices to know that +the method used for logging a particular object or chaining modifications +together are different and are dependent on the object and/or modification being +performed. The logging subsystem only cares that certain specific rules are +followed to guarantee forwards progress and prevent deadlocks. + + +Transactions in XFS +=================== + +XFS has two types of high level transactions, defined by the type of log space +reservation they take. These are known as "one shot" and "permanent" +transactions. Permanent transaction reservations can take reservations that span +commit boundaries, whilst "one shot" transactions are for a single atomic +modification. + +The type and size of reservation must be matched to the modification taking +place. This means that permanent transactions can be used for one-shot +modifications, but one-shot reservations cannot be used for permanent +transactions. + +In the code, a one-shot transaction pattern looks somewhat like this:: + + tp = xfs_trans_alloc() + + + + xfs_trans_commit(tp); + +As items are modified in the transaction, the dirty regions in those items are +tracked via the transaction handle. Once the transaction is committed, all +resources joined to it are released, along with the remaining unused reservation +space that was taken at the transaction allocation time. + +In contrast, a permanent transaction is made up of multiple linked individual +transactions, and the pattern looks like this:: + + tp = xfs_trans_alloc() + xfs_ilock(ip, XFS_ILOCK_EXCL) + + loop { + xfs_trans_ijoin(tp, 0); + + xfs_trans_log_inode(tp, ip); + xfs_trans_roll(&tp); + } + + xfs_trans_commit(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + +While this might look similar to a one-shot transaction, there is an important +difference: xfs_trans_roll() performs a specific operation that links two +transactions together:: + + ntp = xfs_trans_dup(tp); + xfs_trans_commit(tp); + xfs_log_reserve(ntp); + +This results in a series of "rolling transactions" where the inode is locked +across the entire chain of transactions. Hence while this series of rolling +transactions is running, nothing else can read from or write to the inode and +this provides a mechanism for complex changes to appear atomic from an external +observer's point of view. + +It is important to note that a series of rolling transactions in a permanent +transaction does not form an atomic change in the journal. While each +individual modification is atomic, the chain is *not atomic*. If we crash half +way through, then recovery will only replay up to the last transactional +modification the loop made that was committed to the journal. + +This affects long running permanent transactions in that it is not possible to +predict how much of a long running operation will actually be recovered because +there is no guarantee of how much of the operation reached stale storage. Hence +if a long running operation requires multiple transactions to fully complete, +the high level operation must use intents and deferred operations to guarantee +recovery can complete the operation once the first transactions is persisted in +the on-disk journal. + + +Transactions are Asynchronous +============================= + +In XFS, all high level transactions are asynchronous by default. This means that +xfs_trans_commit() does not guarantee that the modification has been committed +to stable storage when it returns. Hence when a system crashes, not all the +completed transactions will be replayed during recovery. + +However, the logging subsystem does provide global ordering guarantees, such +that if a specific change is seen after recovery, all metadata modifications +that were committed prior to that change will also be seen. + +For single shot operations that need to reach stable storage immediately, or +ensuring that a long running permanent transaction is fully committed once it is +complete, we can explicitly tag a transaction as synchronous. This will trigger +a "log force" to flush the outstanding committed transactions to stable storage +in the journal and wait for that to complete. + +Synchronous transactions are rarely used, however, because they limit logging +throughput to the IO latency limitations of the underlying storage. Instead, we +tend to use log forces to ensure modifications are on stable storage only when +a user operation requires a synchronisation point to occur (e.g. fsync). + + +Transaction Reservations +======================== + +It has been mentioned a number of times now that the logging subsystem needs to +provide a forwards progress guarantee so that no modification ever stalls +because it can't be written to the journal due to a lack of space in the +journal. This is achieved by the transaction reservations that are made when +a transaction is first allocated. For permanent transactions, these reservations +are maintained as part of the transaction rolling mechanism. + +A transaction reservation provides a guarantee that there is physical log space +available to write the modification into the journal before we start making +modifications to objects and items. As such, the reservation needs to be large +enough to take into account the amount of metadata that the change might need to +log in the worst case. This means that if we are modifying a btree in the +transaction, we have to reserve enough space to record a full leaf-to-root split +of the btree. As such, the reservations are quite complex because we have to +take into account all the hidden changes that might occur. + +For example, a user data extent allocation involves allocating an extent from +free space, which modifies the free space trees. That's two btrees. Inserting +the extent into the inode's extent map might require a split of the extent map +btree, which requires another allocation that can modify the free space trees +again. Then we might have to update reverse mappings, which modifies yet +another btree which might require more space. And so on. Hence the amount of +metadata that a "simple" operation can modify can be quite large. + +This "worst case" calculation provides us with the static "unit reservation" +for the transaction that is calculated at mount time. We must guarantee that the +log has this much space available before the transaction is allowed to proceed +so that when we come to write the dirty metadata into the log we don't run out +of log space half way through the write. + +For one-shot transactions, a single unit space reservation is all that is +required for the transaction to proceed. For permanent transactions, however, we +also have a "log count" that affects the size of the reservation that is to be +made. + +While a permanent transaction can get by with a single unit of space +reservation, it is somewhat inefficient to do this as it requires the +transaction rolling mechanism to re-reserve space on every transaction roll. We +know from the implementation of the permanent transactions how many transaction +rolls are likely for the common modifications that need to be made. + +For example, and inode allocation is typically two transactions - one to +physically allocate a free inode chunk on disk, and another to allocate an inode +from an inode chunk that has free inodes in it. Hence for an inode allocation +transaction, we might set the reservation log count to a value of 2 to indicate +that the common/fast path transaction will commit two linked transactions in a +chain. Each time a permanent transaction rolls, it consumes an entire unit +reservation. + +Hence when the permanent transaction is first allocated, the log space +reservation is increases from a single unit reservation to multiple unit +reservations. That multiple is defined by the reservation log count, and this +means we can roll the transaction multiple times before we have to re-reserve +log space when we roll the transaction. This ensures that the common +modifications we make only need to reserve log space once. + +If the log count for a permanent transaction reaches zero, then it needs to +re-reserve physical space in the log. This is somewhat complex, and requires +an understanding of how the log accounts for space that has been reserved. + + +Log Space Accounting +==================== + +The position in the log is typically referred to as a Log Sequence Number (LSN). +The log is circular, so the positions in the log are defined by the combination +of a cycle number - the number of times the log has been overwritten - and the +offset into the log. A LSN carries the cycle in the upper 32 bits and the +offset in the lower 32 bits. The offset is in units of "basic blocks" (512 +bytes). Hence we can do realtively simple LSN based math to keep track of +available space in the log. + +Log space accounting is done via a pair of constructs called "grant heads". The +position of the grant heads is an absolute value, so the amount of space +available in the log is defined by the distance between the position of the +grant head and the current log tail. That is, how much space can be +reserved/consumed before the grant heads would fully wrap the log and overtake +the tail position. + +The first grant head is the "reserve" head. This tracks the byte count of the +reservations currently held by active transactions. It is a purely in-memory +accounting of the space reservation and, as such, actually tracks byte offsets +into the log rather than basic blocks. Hence it technically isn't using LSNs to +represent the log position, but it is still treated like a split {cycle,offset} +tuple for the purposes of tracking reservation space. + +The reserve grant head is used to accurately account for exact transaction +reservations amounts and the exact byte count that modifications actually make +and need to write into the log. The reserve head is used to prevent new +transactions from taking new reservations when the head reaches the current +tail. It will block new reservations in a FIFO queue and as the log tail moves +forward it will wake them in order once sufficient space is available. This FIFO +mechanism ensures no transaction is starved of resources when log space +shortages occur. + +The other grant head is the "write" head. Unlike the reserve head, this grant +head contains an LSN and it tracks the physical space usage in the log. While +this might sound like it is accounting the same state as the reserve grant head +- and it mostly does track exactly the same location as the reserve grant head - +there are critical differences in behaviour between them that provides the +forwards progress guarantees that rolling permanent transactions require. + +These differences when a permanent transaction is rolled and the internal "log +count" reaches zero and the initial set of unit reservations have been +exhausted. At this point, we still require a log space reservation to continue +the next transaction in the sequeunce, but we have none remaining. We cannot +sleep during the transaction commit process waiting for new log space to become +available, as we may end up on the end of the FIFO queue and the items we have +locked while we sleep could end up pinning the tail of the log before there is +enough free space in the log to fulfil all of the pending reservations and +then wake up transaction commit in progress. + +To take a new reservation without sleeping requires us to be able to take a +reservation even if there is no reservation space currently available. That is, +we need to be able to *overcommit* the log reservation space. As has already +been detailed, we cannot overcommit physical log space. However, the reserve +grant head does not track physical space - it only accounts for the amount of +reservations we currently have outstanding. Hence if the reserve head passes +over the tail of the log all it means is that new reservations will be throttled +immediately and remain throttled until the log tail is moved forward far enough +to remove the overcommit and start taking new reservations. In other words, we +can overcommit the reserve head without violating the physical log head and tail +rules. + +As a result, permanent transactions only "regrant" reservation space during +xfs_trans_commit() calls, while the physical log space reservation - tracked by +the write head - is then reserved separately by a call to xfs_log_reserve() +after the commit completes. Once the commit completes, we can sleep waiting for +physical log space to be reserved from the write grant head, but only if one +critical rule has been observed:: + + Code using permanent reservations must always log the items they hold + locked across each transaction they roll in the chain. + +"Re-logging" the locked items on every transaction roll ensures that the items +attached to the transaction chain being rolled are always relocated to the +physical head of the log and so do not pin the tail of the log. If a locked item +pins the tail of the log when we sleep on the write reservation, then we will +deadlock the log as we cannot take the locks needed to write back that item and +move the tail of the log forwards to free up write grant space. Re-logging the +locked items avoids this deadlock and guarantees that the log reservation we are +making cannot self-deadlock. + +If all rolling transactions obey this rule, then they can all make forwards +progress independently because nothing will block the progress of the log +tail moving forwards and hence ensuring that write grant space is always +(eventually) made available to permanent transactions no matter how many times +they roll. + + +Re-logging Explained +==================== + +XFS allows multiple separate modifications to a single object to be carried in +the log at any given time. This allows the log to avoid needing to flush each +change to disk before recording a new change to the object. XFS does this via a +method called "re-logging". Conceptually, this is quite simple - all it requires +is that any new change to the object is recorded with a *new copy* of all the +existing changes in the new transaction that is written to the log. That is, if we have a sequence of changes A through to F, and the object was written to disk after change D, we would see in the log the following series @@ -42,16 +327,13 @@ transaction:: In other words, each time an object is relogged, the new transaction contains the aggregation of all the previous changes currently held only in the log. -This relogging technique also allows objects to be moved forward in the log so -that an object being relogged does not prevent the tail of the log from ever -moving forward. This can be seen in the table above by the changing -(increasing) LSN of each subsequent transaction - the LSN is effectively a -direct encoding of the location in the log of the transaction. +This relogging technique allows objects to be moved forward in the log so that +an object being relogged does not prevent the tail of the log from ever moving +forward. This can be seen in the table above by the changing (increasing) LSN +of each subsequent transaction, and it's the technique that allows us to +implement long-running, multiple-commit permanent transactions. -This relogging is also used to implement long-running, multiple-commit -transactions. These transaction are known as rolling transactions, and require -a special log reservation known as a permanent transaction reservation. A -typical example of a rolling transaction is the removal of extents from an +A typical example of a rolling transaction is the removal of extents from an inode which can only be done at a rate of two extents per transaction because of reservation size limitations. Hence a rolling extent removal transaction keeps relogging the inode and btree buffers as they get modified in each @@ -67,12 +349,13 @@ the log over and over again. Worse is the fact that objects tend to get dirtier as they get relogged, so each subsequent transaction is writing more metadata into the log. -Another feature of the XFS transaction subsystem is that most transactions are -asynchronous. That is, they don't commit to disk until either a log buffer is -filled (a log buffer can hold multiple transactions) or a synchronous operation -forces the log buffers holding the transactions to disk. This means that XFS is -doing aggregation of transactions in memory - batching them, if you like - to -minimise the impact of the log IO on transaction throughput. +It should now also be obvious how relogging and asynchronous transactions go +hand in hand. That is, transactions don't get written to the physical journal +until either a log buffer is filled (a log buffer can hold multiple +transactions) or a synchronous operation forces the log buffers holding the +transactions to disk. This means that XFS is doing aggregation of transactions +in memory - batching them, if you like - to minimise the impact of the log IO on +transaction throughput. The limitation on asynchronous transaction throughput is the number and size of log buffers made available by the log manager. By default there are 8 log -- cgit v1.2.3 From c6aee2481419b638a5257adbd3ffd33b11c59fa8 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 7 Jul 2022 19:07:09 +1000 Subject: xfs: make last AG grow/shrink perag centric Because the perag must exist for these operations, look it up as part of the common shrink operations and pass it instead of the mount/agno pair. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_ag.c | 51 +++++++++++++++++++++----------------------------- fs/xfs/libxfs/xfs_ag.h | 11 +++++------ fs/xfs/xfs_fsops.c | 11 +++++++---- fs/xfs/xfs_ioctl.c | 8 +++++++- 4 files changed, 40 insertions(+), 41 deletions(-) diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index 3e920cf1b454..4bb316d89e57 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -761,11 +761,11 @@ xfs_ag_init_headers( int xfs_ag_shrink_space( - struct xfs_mount *mp, + struct xfs_perag *pag, struct xfs_trans **tpp, - xfs_agnumber_t agno, xfs_extlen_t delta) { + struct xfs_mount *mp = pag->pag_mount; struct xfs_alloc_arg args = { .tp = *tpp, .mp = mp, @@ -782,14 +782,14 @@ xfs_ag_shrink_space( xfs_agblock_t aglen; int error, err2; - ASSERT(agno == mp->m_sb.sb_agcount - 1); - error = xfs_ialloc_read_agi(mp, *tpp, agno, &agibp); + ASSERT(pag->pag_agno == mp->m_sb.sb_agcount - 1); + error = xfs_ialloc_read_agi(mp, *tpp, pag->pag_agno, &agibp); if (error) return error; agi = agibp->b_addr; - error = xfs_alloc_read_agf(mp, *tpp, agno, 0, &agfbp); + error = xfs_alloc_read_agf(mp, *tpp, pag->pag_agno, 0, &agfbp); if (error) return error; @@ -801,13 +801,14 @@ xfs_ag_shrink_space( if (delta >= aglen) return -EINVAL; - args.fsbno = XFS_AGB_TO_FSB(mp, agno, aglen - delta); + args.fsbno = XFS_AGB_TO_FSB(mp, pag->pag_agno, aglen - delta); /* * Make sure that the last inode cluster cannot overlap with the new * end of the AG, even if it's sparse. */ - error = xfs_ialloc_check_shrink(*tpp, agno, agibp, aglen - delta); + error = xfs_ialloc_check_shrink(*tpp, pag->pag_agno, agibp, + aglen - delta); if (error) return error; @@ -883,9 +884,8 @@ resv_err: */ int xfs_ag_extend_space( - struct xfs_mount *mp, + struct xfs_perag *pag, struct xfs_trans *tp, - struct aghdr_init_data *id, xfs_extlen_t len) { struct xfs_buf *bp; @@ -893,23 +893,20 @@ xfs_ag_extend_space( struct xfs_agf *agf; int error; - /* - * Change the agi length. - */ - error = xfs_ialloc_read_agi(mp, tp, id->agno, &bp); + ASSERT(pag->pag_agno == pag->pag_mount->m_sb.sb_agcount - 1); + + error = xfs_ialloc_read_agi(pag->pag_mount, tp, pag->pag_agno, &bp); if (error) return error; agi = bp->b_addr; be32_add_cpu(&agi->agi_length, len); - ASSERT(id->agno == mp->m_sb.sb_agcount - 1 || - be32_to_cpu(agi->agi_length) == mp->m_sb.sb_agblocks); xfs_ialloc_log_agi(tp, bp, XFS_AGI_LENGTH); /* * Change agf length. */ - error = xfs_alloc_read_agf(mp, tp, id->agno, 0, &bp); + error = xfs_alloc_read_agf(pag->pag_mount, tp, pag->pag_agno, 0, &bp); if (error) return error; @@ -924,13 +921,12 @@ xfs_ag_extend_space( * XFS_RMAP_OINFO_SKIP_UPDATE is used here to tell the rmap btree that * this doesn't actually exist in the rmap btree. */ - error = xfs_rmap_free(tp, bp, bp->b_pag, - be32_to_cpu(agf->agf_length) - len, + error = xfs_rmap_free(tp, bp, pag, be32_to_cpu(agf->agf_length) - len, len, &XFS_RMAP_OINFO_SKIP_UPDATE); if (error) return error; - return xfs_free_extent(tp, XFS_AGB_TO_FSB(mp, id->agno, + return xfs_free_extent(tp, XFS_AGB_TO_FSB(pag->pag_mount, pag->pag_agno, be32_to_cpu(agf->agf_length) - len), len, &XFS_RMAP_OINFO_SKIP_UPDATE, XFS_AG_RESV_NONE); @@ -939,34 +935,29 @@ xfs_ag_extend_space( /* Retrieve AG geometry. */ int xfs_ag_get_geometry( - struct xfs_mount *mp, - xfs_agnumber_t agno, + struct xfs_perag *pag, struct xfs_ag_geometry *ageo) { struct xfs_buf *agi_bp; struct xfs_buf *agf_bp; struct xfs_agi *agi; struct xfs_agf *agf; - struct xfs_perag *pag; unsigned int freeblks; int error; - if (agno >= mp->m_sb.sb_agcount) - return -EINVAL; - /* Lock the AG headers. */ - error = xfs_ialloc_read_agi(mp, NULL, agno, &agi_bp); + error = xfs_ialloc_read_agi(pag->pag_mount, NULL, pag->pag_agno, + &agi_bp); if (error) return error; - error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agf_bp); + error = xfs_alloc_read_agf(pag->pag_mount, NULL, pag->pag_agno, 0, + &agf_bp); if (error) goto out_agi; - pag = agi_bp->b_pag; - /* Fill out form. */ memset(ageo, 0, sizeof(*ageo)); - ageo->ag_number = agno; + ageo->ag_number = pag->pag_agno; agi = agi_bp->b_addr; ageo->ag_icount = be32_to_cpu(agi->agi_count); diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h index e411d51c2589..1132cda9a92f 100644 --- a/fs/xfs/libxfs/xfs_ag.h +++ b/fs/xfs/libxfs/xfs_ag.h @@ -168,11 +168,10 @@ struct aghdr_init_data { }; int xfs_ag_init_headers(struct xfs_mount *mp, struct aghdr_init_data *id); -int xfs_ag_shrink_space(struct xfs_mount *mp, struct xfs_trans **tpp, - xfs_agnumber_t agno, xfs_extlen_t delta); -int xfs_ag_extend_space(struct xfs_mount *mp, struct xfs_trans *tp, - struct aghdr_init_data *id, xfs_extlen_t len); -int xfs_ag_get_geometry(struct xfs_mount *mp, xfs_agnumber_t agno, - struct xfs_ag_geometry *ageo); +int xfs_ag_shrink_space(struct xfs_perag *pag, struct xfs_trans **tpp, + xfs_extlen_t delta); +int xfs_ag_extend_space(struct xfs_perag *pag, struct xfs_trans *tp, + xfs_extlen_t len); +int xfs_ag_get_geometry(struct xfs_perag *pag, struct xfs_ag_geometry *ageo); #endif /* __LIBXFS_AG_H */ diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index d4a77c53f94b..7be4d83d5884 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -41,6 +41,7 @@ xfs_resizefs_init_new_ags( xfs_agnumber_t oagcount, xfs_agnumber_t nagcount, xfs_rfsblock_t delta, + struct xfs_perag *last_pag, bool *lastag_extended) { struct xfs_mount *mp = tp->t_mountp; @@ -73,7 +74,7 @@ xfs_resizefs_init_new_ags( if (delta) { *lastag_extended = true; - error = xfs_ag_extend_space(mp, tp, id, delta); + error = xfs_ag_extend_space(last_pag, tp, delta); } return error; } @@ -96,6 +97,7 @@ xfs_growfs_data_private( xfs_agnumber_t oagcount; struct xfs_trans *tp; struct aghdr_init_data id = {}; + struct xfs_perag *last_pag; nb = in->newblocks; error = xfs_sb_validate_fsb_count(&mp->m_sb, nb); @@ -128,7 +130,6 @@ xfs_growfs_data_private( return -EINVAL; oagcount = mp->m_sb.sb_agcount; - /* allocate the new per-ag structures */ if (nagcount > oagcount) { error = xfs_initialize_perag(mp, nagcount, &nagimax); @@ -145,15 +146,17 @@ xfs_growfs_data_private( if (error) return error; + last_pag = xfs_perag_get(mp, oagcount - 1); if (delta > 0) { error = xfs_resizefs_init_new_ags(tp, &id, oagcount, nagcount, - delta, &lastag_extended); + delta, last_pag, &lastag_extended); } else { xfs_warn_mount(mp, XFS_OPSTATE_WARNED_SHRINK, "EXPERIMENTAL online shrink feature in use. Use at your own risk!"); - error = xfs_ag_shrink_space(mp, &tp, nagcount - 1, -delta); + error = xfs_ag_shrink_space(last_pag, &tp, -delta); } + xfs_perag_put(last_pag); if (error) goto out_trans_cancel; diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 0d67ff8a8961..f21581cd8a70 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -955,6 +955,7 @@ xfs_ioc_ag_geometry( struct xfs_mount *mp, void __user *arg) { + struct xfs_perag *pag; struct xfs_ag_geometry ageo; int error; @@ -965,7 +966,12 @@ xfs_ioc_ag_geometry( if (memchr_inv(&ageo.ag_reserved, 0, sizeof(ageo.ag_reserved))) return -EINVAL; - error = xfs_ag_get_geometry(mp, ageo.ag_number, &ageo); + pag = xfs_perag_get(mp, ageo.ag_number); + if (!pag) + return -EINVAL; + + error = xfs_ag_get_geometry(pag, &ageo); + xfs_perag_put(pag); if (error) return error; -- cgit v1.2.3 From a95fee40e3d433d8fabff7c02e75f7c2c2e54400 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 7 Jul 2022 19:07:16 +1000 Subject: xfs: kill xfs_ialloc_pagi_init() This is just a basic wrapper around xfs_ialloc_read_agi(), which can be entirely handled by xfs_ialloc_read_agi() by passing a NULL agibpp.... Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_ag.c | 3 ++- fs/xfs/libxfs/xfs_ialloc.c | 39 ++++++++++++++------------------------- fs/xfs/libxfs/xfs_ialloc.h | 10 ---------- 3 files changed, 16 insertions(+), 36 deletions(-) diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index 4bb316d89e57..c203dc883f5d 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -128,9 +128,10 @@ xfs_initialize_perag_data( if (error) return error; - error = xfs_ialloc_pagi_init(mp, NULL, index); + error = xfs_ialloc_read_agi(mp, NULL, index, NULL); if (error) return error; + pag = xfs_perag_get(mp, index); ifree += pag->pagi_freecount; ialloc += pag->pagi_count; diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index bf2f4bc89193..cefac2a1ba0c 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -1610,7 +1610,7 @@ xfs_dialloc_good_ag( return false; if (!pag->pagi_init) { - error = xfs_ialloc_pagi_init(mp, tp, pag->pag_agno); + error = xfs_ialloc_read_agi(mp, tp, pag->pag_agno, NULL); if (error) return false; } @@ -2593,25 +2593,30 @@ xfs_read_agi( return 0; } +/* + * Read in the agi and initialise the per-ag data. If the caller supplies a + * @agibpp, return the locked AGI buffer to them, otherwise release it. + */ int xfs_ialloc_read_agi( struct xfs_mount *mp, /* file system mount structure */ struct xfs_trans *tp, /* transaction pointer */ xfs_agnumber_t agno, /* allocation group number */ - struct xfs_buf **bpp) /* allocation group hdr buf */ + struct xfs_buf **agibpp) { + struct xfs_buf *agibp; struct xfs_agi *agi; /* allocation group header */ struct xfs_perag *pag; /* per allocation group data */ int error; trace_xfs_ialloc_read_agi(mp, agno); - error = xfs_read_agi(mp, tp, agno, bpp); + error = xfs_read_agi(mp, tp, agno, &agibp); if (error) return error; - agi = (*bpp)->b_addr; - pag = (*bpp)->b_pag; + agi = agibp->b_addr; + pag = agibp->b_pag; if (!pag->pagi_init) { pag->pagi_freecount = be32_to_cpu(agi->agi_freecount); pag->pagi_count = be32_to_cpu(agi->agi_count); @@ -2624,26 +2629,10 @@ xfs_ialloc_read_agi( */ ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) || xfs_is_shutdown(mp)); - return 0; -} - -/* - * Read in the agi to initialise the per-ag data in the mount structure - */ -int -xfs_ialloc_pagi_init( - xfs_mount_t *mp, /* file system mount structure */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_agnumber_t agno) /* allocation group number */ -{ - struct xfs_buf *bp = NULL; - int error; - - error = xfs_ialloc_read_agi(mp, tp, agno, &bp); - if (error) - return error; - if (bp) - xfs_trans_brelse(tp, bp); + if (agibpp) + *agibpp = agibp; + else + xfs_trans_brelse(tp, agibp); return 0; } diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h index a7705b6a1fd3..1ff42bf1e4b3 100644 --- a/fs/xfs/libxfs/xfs_ialloc.h +++ b/fs/xfs/libxfs/xfs_ialloc.h @@ -72,16 +72,6 @@ xfs_ialloc_read_agi( xfs_agnumber_t agno, /* allocation group number */ struct xfs_buf **bpp); /* allocation group hdr buf */ -/* - * Read in the allocation group header to initialise the per-ag data - * in the mount structure - */ -int -xfs_ialloc_pagi_init( - struct xfs_mount *mp, /* file system mount structure */ - struct xfs_trans *tp, /* transaction pointer */ - xfs_agnumber_t agno); /* allocation group number */ - /* * Lookup a record by ino in the btree given by cur. */ -- cgit v1.2.3 From 99b13c7f0bd35dd3cf2cacb61beb4557dc2b6f9b Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 7 Jul 2022 19:07:24 +1000 Subject: xfs: pass perag to xfs_ialloc_read_agi() xfs_ialloc_read_agi() initialises the perag if it hasn't been done yet, so it makes sense to pass it the perag rather than pull a reference from the buffer. This allows callers to be per-ag centric rather than passing mount/agno pairs everywhere. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_ag.c | 22 ++++++++++++---------- fs/xfs/libxfs/xfs_ialloc.c | 23 ++++++++++------------- fs/xfs/libxfs/xfs_ialloc.h | 7 ++----- fs/xfs/libxfs/xfs_ialloc_btree.c | 9 ++++----- fs/xfs/scrub/common.c | 2 +- fs/xfs/scrub/fscounters.c | 2 +- fs/xfs/scrub/repair.c | 2 +- 7 files changed, 31 insertions(+), 36 deletions(-) diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index c203dc883f5d..a16c985940d3 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -128,11 +128,13 @@ xfs_initialize_perag_data( if (error) return error; - error = xfs_ialloc_read_agi(mp, NULL, index, NULL); - if (error) + pag = xfs_perag_get(mp, index); + error = xfs_ialloc_read_agi(pag, NULL, NULL); + if (error) { + xfs_perag_put(pag); return error; + } - pag = xfs_perag_get(mp, index); ifree += pag->pagi_freecount; ialloc += pag->pagi_count; bfree += pag->pagf_freeblks; @@ -784,7 +786,7 @@ xfs_ag_shrink_space( int error, err2; ASSERT(pag->pag_agno == mp->m_sb.sb_agcount - 1); - error = xfs_ialloc_read_agi(mp, *tpp, pag->pag_agno, &agibp); + error = xfs_ialloc_read_agi(pag, *tpp, &agibp); if (error) return error; @@ -817,7 +819,7 @@ xfs_ag_shrink_space( * Disable perag reservations so it doesn't cause the allocation request * to fail. We'll reestablish reservation before we return. */ - error = xfs_ag_resv_free(agibp->b_pag); + error = xfs_ag_resv_free(pag); if (error) return error; @@ -846,7 +848,7 @@ xfs_ag_shrink_space( be32_add_cpu(&agi->agi_length, -delta); be32_add_cpu(&agf->agf_length, -delta); - err2 = xfs_ag_resv_init(agibp->b_pag, *tpp); + err2 = xfs_ag_resv_init(pag, *tpp); if (err2) { be32_add_cpu(&agi->agi_length, delta); be32_add_cpu(&agf->agf_length, delta); @@ -870,8 +872,9 @@ xfs_ag_shrink_space( xfs_ialloc_log_agi(*tpp, agibp, XFS_AGI_LENGTH); xfs_alloc_log_agf(*tpp, agfbp, XFS_AGF_LENGTH); return 0; + resv_init_out: - err2 = xfs_ag_resv_init(agibp->b_pag, *tpp); + err2 = xfs_ag_resv_init(pag, *tpp); if (!err2) return error; resv_err: @@ -896,7 +899,7 @@ xfs_ag_extend_space( ASSERT(pag->pag_agno == pag->pag_mount->m_sb.sb_agcount - 1); - error = xfs_ialloc_read_agi(pag->pag_mount, tp, pag->pag_agno, &bp); + error = xfs_ialloc_read_agi(pag, tp, &bp); if (error) return error; @@ -947,8 +950,7 @@ xfs_ag_get_geometry( int error; /* Lock the AG headers. */ - error = xfs_ialloc_read_agi(pag->pag_mount, NULL, pag->pag_agno, - &agi_bp); + error = xfs_ialloc_read_agi(pag, NULL, &agi_bp); if (error) return error; error = xfs_alloc_read_agf(pag->pag_mount, NULL, pag->pag_agno, 0, diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index cefac2a1ba0c..a7259404377d 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -1610,7 +1610,7 @@ xfs_dialloc_good_ag( return false; if (!pag->pagi_init) { - error = xfs_ialloc_read_agi(mp, tp, pag->pag_agno, NULL); + error = xfs_ialloc_read_agi(pag, tp, NULL); if (error) return false; } @@ -1679,7 +1679,7 @@ xfs_dialloc_try_ag( * Then read in the AGI buffer and recheck with the AGI buffer * lock held. */ - error = xfs_ialloc_read_agi(pag->pag_mount, *tpp, pag->pag_agno, &agbp); + error = xfs_ialloc_read_agi(pag, *tpp, &agbp); if (error) return error; @@ -2169,7 +2169,7 @@ xfs_difree( /* * Get the allocation group header. */ - error = xfs_ialloc_read_agi(mp, tp, pag->pag_agno, &agbp); + error = xfs_ialloc_read_agi(pag, tp, &agbp); if (error) { xfs_warn(mp, "%s: xfs_ialloc_read_agi() returned error %d.", __func__, error); @@ -2215,7 +2215,7 @@ xfs_imap_lookup( int error; int i; - error = xfs_ialloc_read_agi(mp, tp, pag->pag_agno, &agbp); + error = xfs_ialloc_read_agi(pag, tp, &agbp); if (error) { xfs_alert(mp, "%s: xfs_ialloc_read_agi() returned error %d, agno %d", @@ -2599,24 +2599,21 @@ xfs_read_agi( */ int xfs_ialloc_read_agi( - struct xfs_mount *mp, /* file system mount structure */ - struct xfs_trans *tp, /* transaction pointer */ - xfs_agnumber_t agno, /* allocation group number */ + struct xfs_perag *pag, + struct xfs_trans *tp, struct xfs_buf **agibpp) { struct xfs_buf *agibp; - struct xfs_agi *agi; /* allocation group header */ - struct xfs_perag *pag; /* per allocation group data */ + struct xfs_agi *agi; int error; - trace_xfs_ialloc_read_agi(mp, agno); + trace_xfs_ialloc_read_agi(pag->pag_mount, pag->pag_agno); - error = xfs_read_agi(mp, tp, agno, &agibp); + error = xfs_read_agi(pag->pag_mount, tp, pag->pag_agno, &agibp); if (error) return error; agi = agibp->b_addr; - pag = agibp->b_pag; if (!pag->pagi_init) { pag->pagi_freecount = be32_to_cpu(agi->agi_freecount); pag->pagi_count = be32_to_cpu(agi->agi_count); @@ -2628,7 +2625,7 @@ xfs_ialloc_read_agi( * we are in the middle of a forced shutdown. */ ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) || - xfs_is_shutdown(mp)); + xfs_is_shutdown(pag->pag_mount)); if (agibpp) *agibpp = agibp; else diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h index 1ff42bf1e4b3..72cb33170d9f 100644 --- a/fs/xfs/libxfs/xfs_ialloc.h +++ b/fs/xfs/libxfs/xfs_ialloc.h @@ -66,11 +66,8 @@ xfs_ialloc_log_agi( * Read in the allocation group header (inode allocation section) */ int /* error */ -xfs_ialloc_read_agi( - struct xfs_mount *mp, /* file system mount structure */ - struct xfs_trans *tp, /* transaction pointer */ - xfs_agnumber_t agno, /* allocation group number */ - struct xfs_buf **bpp); /* allocation group hdr buf */ +xfs_ialloc_read_agi(struct xfs_perag *pag, struct xfs_trans *tp, + struct xfs_buf **agibpp); /* * Lookup a record by ino in the btree given by cur. diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index b2ad2fdc40f5..aa4367a0a0de 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -722,7 +722,7 @@ xfs_inobt_cur( ASSERT(*agi_bpp == NULL); ASSERT(*curpp == NULL); - error = xfs_ialloc_read_agi(mp, tp, pag->pag_agno, agi_bpp); + error = xfs_ialloc_read_agi(pag, tp, agi_bpp); if (error) return error; @@ -757,16 +757,15 @@ xfs_inobt_count_blocks( /* Read finobt block count from AGI header. */ static int xfs_finobt_read_blocks( - struct xfs_mount *mp, - struct xfs_trans *tp, struct xfs_perag *pag, + struct xfs_trans *tp, xfs_extlen_t *tree_blocks) { struct xfs_buf *agbp; struct xfs_agi *agi; int error; - error = xfs_ialloc_read_agi(mp, tp, pag->pag_agno, &agbp); + error = xfs_ialloc_read_agi(pag, tp, &agbp); if (error) return error; @@ -794,7 +793,7 @@ xfs_finobt_calc_reserves( return 0; if (xfs_has_inobtcounts(mp)) - error = xfs_finobt_read_blocks(mp, tp, pag, &tree_len); + error = xfs_finobt_read_blocks(pag, tp, &tree_len); else error = xfs_inobt_count_blocks(mp, tp, pag, XFS_BTNUM_FINO, &tree_len); diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index 97b54ac3075f..62997791694a 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -416,7 +416,7 @@ xchk_ag_read_headers( if (!sa->pag) return -ENOENT; - error = xfs_ialloc_read_agi(mp, sc->tp, agno, &sa->agi_bp); + error = xfs_ialloc_read_agi(sa->pag, sc->tp, &sa->agi_bp); if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGI)) return error; diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c index 48a6cbdf95d0..bd06a184c81c 100644 --- a/fs/xfs/scrub/fscounters.c +++ b/fs/xfs/scrub/fscounters.c @@ -78,7 +78,7 @@ xchk_fscount_warmup( continue; /* Lock both AG headers. */ - error = xfs_ialloc_read_agi(mp, sc->tp, agno, &agi_bp); + error = xfs_ialloc_read_agi(pag, sc->tp, &agi_bp); if (error) break; error = xfs_alloc_read_agf(mp, sc->tp, agno, 0, &agf_bp); diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index 1e7b6b209ee8..14acf1df3dd3 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -199,7 +199,7 @@ xrep_calc_ag_resblks( icount = pag->pagi_count; } else { /* Try to get the actual counters from disk. */ - error = xfs_ialloc_read_agi(mp, NULL, sm->sm_agno, &bp); + error = xfs_ialloc_read_agi(pag, NULL, &bp); if (!error) { icount = pag->pagi_count; xfs_buf_relse(bp); -- cgit v1.2.3 From 76b47e528e3a27a3bf3b3f9153aad9435e03be8c Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 7 Jul 2022 19:07:32 +1000 Subject: xfs: kill xfs_alloc_pagf_init() Trivial wrapper around xfs_alloc_read_agf(), can be easily replaced by passing a NULL agfbp to xfs_alloc_read_agf(). Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_ag.c | 2 +- fs/xfs/libxfs/xfs_ag_resv.c | 2 +- fs/xfs/libxfs/xfs_alloc.c | 37 ++++++++++++------------------------- fs/xfs/libxfs/xfs_alloc.h | 10 ---------- fs/xfs/libxfs/xfs_bmap.c | 3 ++- fs/xfs/libxfs/xfs_ialloc.c | 2 +- fs/xfs/xfs_filestream.c | 4 ++-- 7 files changed, 19 insertions(+), 41 deletions(-) diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index a16c985940d3..2ed15839fb3e 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -124,7 +124,7 @@ xfs_initialize_perag_data( * all the information we need and populates the * per-ag structures for us. */ - error = xfs_alloc_pagf_init(mp, NULL, index, 0); + error = xfs_alloc_read_agf(mp, NULL, index, 0, NULL); if (error) return error; diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c index fe94058d4e9e..ce28bf8f72dc 100644 --- a/fs/xfs/libxfs/xfs_ag_resv.c +++ b/fs/xfs/libxfs/xfs_ag_resv.c @@ -322,7 +322,7 @@ out: * address. */ if (has_resv) { - error2 = xfs_alloc_pagf_init(mp, tp, pag->pag_agno, 0); + error2 = xfs_alloc_read_agf(mp, tp, pag->pag_agno, 0, NULL); if (error2) return error2; diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index d3f2886fdc08..f7853ab7b962 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -2867,25 +2867,6 @@ xfs_alloc_log_agf( xfs_trans_log_buf(tp, bp, (uint)first, (uint)last); } -/* - * Interface for inode allocation to force the pag data to be initialized. - */ -int /* error */ -xfs_alloc_pagf_init( - xfs_mount_t *mp, /* file system mount structure */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_agnumber_t agno, /* allocation group number */ - int flags) /* XFS_ALLOC_FLAGS_... */ -{ - struct xfs_buf *bp; - int error; - - error = xfs_alloc_read_agf(mp, tp, agno, flags, &bp); - if (!error) - xfs_trans_brelse(tp, bp); - return error; -} - /* * Put the block on the freelist for the allocation group. */ @@ -3095,7 +3076,9 @@ xfs_read_agf( } /* - * Read in the allocation group header (free/alloc section). + * Read in the allocation group header (free/alloc section) and initialise the + * perag structure if necessary. If the caller provides @agfbpp, then return the + * locked buffer to the caller, otherwise free it. */ int /* error */ xfs_alloc_read_agf( @@ -3103,8 +3086,9 @@ xfs_alloc_read_agf( struct xfs_trans *tp, /* transaction pointer */ xfs_agnumber_t agno, /* allocation group number */ int flags, /* XFS_ALLOC_FLAG_... */ - struct xfs_buf **bpp) /* buffer for the ag freelist header */ + struct xfs_buf **agfbpp) { + struct xfs_buf *agfbp; struct xfs_agf *agf; /* ag freelist header */ struct xfs_perag *pag; /* per allocation group data */ int error; @@ -3118,13 +3102,12 @@ xfs_alloc_read_agf( ASSERT(agno != NULLAGNUMBER); error = xfs_read_agf(mp, tp, agno, (flags & XFS_ALLOC_FLAG_TRYLOCK) ? XBF_TRYLOCK : 0, - bpp); + &agfbp); if (error) return error; - ASSERT(!(*bpp)->b_error); - agf = (*bpp)->b_addr; - pag = (*bpp)->b_pag; + agf = agfbp->b_addr; + pag = agfbp->b_pag; if (!pag->pagf_init) { pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks); pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks); @@ -3165,6 +3148,10 @@ xfs_alloc_read_agf( be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi])); } #endif + if (agfbpp) + *agfbpp = agfbp; + else + xfs_trans_brelse(tp, agfbp); return 0; } diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index 84ca09b2223f..96d5301a5c8b 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -123,16 +123,6 @@ xfs_alloc_log_agf( struct xfs_buf *bp, /* buffer for a.g. freelist header */ uint32_t fields);/* mask of fields to be logged (XFS_AGF_...) */ -/* - * Interface for inode allocation to force the pag data to be initialized. - */ -int /* error */ -xfs_alloc_pagf_init( - struct xfs_mount *mp, /* file system mount structure */ - struct xfs_trans *tp, /* transaction pointer */ - xfs_agnumber_t agno, /* allocation group number */ - int flags); /* XFS_ALLOC_FLAGS_... */ - /* * Put the block on the freelist for the allocation group. */ diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 6833110d1bd4..a76d5894641b 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -3185,7 +3185,8 @@ xfs_bmap_longest_free_extent( pag = xfs_perag_get(mp, ag); if (!pag->pagf_init) { - error = xfs_alloc_pagf_init(mp, tp, ag, XFS_ALLOC_FLAG_TRYLOCK); + error = xfs_alloc_read_agf(mp, tp, ag, XFS_ALLOC_FLAG_TRYLOCK, + NULL); if (error) { /* Couldn't lock the AGF, so skip this AG. */ if (error == -EAGAIN) { diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index a7259404377d..8e252207b131 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -1621,7 +1621,7 @@ xfs_dialloc_good_ag( return false; if (!pag->pagf_init) { - error = xfs_alloc_pagf_init(mp, tp, pag->pag_agno, flags); + error = xfs_alloc_read_agf(mp, tp, pag->pag_agno, flags, NULL); if (error) return false; } diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index be9bcf8a1f99..6b09a30f8d06 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c @@ -126,7 +126,7 @@ xfs_filestream_pick_ag( pag = xfs_perag_get(mp, ag); if (!pag->pagf_init) { - err = xfs_alloc_pagf_init(mp, NULL, ag, trylock); + err = xfs_alloc_read_agf(mp, NULL, ag, trylock, NULL); if (err) { if (err != -EAGAIN) { xfs_perag_put(pag); @@ -181,7 +181,7 @@ next_ag: if (ag != startag) continue; - /* Allow sleeping in xfs_alloc_pagf_init() on the 2nd pass. */ + /* Allow sleeping in xfs_alloc_read_agf() on the 2nd pass. */ if (trylock != 0) { trylock = 0; continue; -- cgit v1.2.3 From 08d3e84feeb8cb8e20d54f659446b98fe17913aa Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 7 Jul 2022 19:07:40 +1000 Subject: xfs: pass perag to xfs_alloc_read_agf() xfs_alloc_read_agf() initialises the perag if it hasn't been done yet, so it makes sense to pass it the perag rather than pull a reference from the buffer. This allows callers to be per-ag centric rather than passing mount/agno pairs everywhere. Whilst modifying the xfs_reflink_find_shared() function definition, declare it static and remove the extern declaration as it is an internal function only these days. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_ag.c | 20 ++++++++----------- fs/xfs/libxfs/xfs_ag_resv.c | 2 +- fs/xfs/libxfs/xfs_alloc.c | 31 +++++++++++++---------------- fs/xfs/libxfs/xfs_alloc.h | 13 ++----------- fs/xfs/libxfs/xfs_bmap.c | 2 +- fs/xfs/libxfs/xfs_ialloc.c | 2 +- fs/xfs/libxfs/xfs_refcount.c | 6 +++--- fs/xfs/libxfs/xfs_refcount_btree.c | 2 +- fs/xfs/libxfs/xfs_rmap_btree.c | 2 +- fs/xfs/scrub/agheader_repair.c | 6 ++---- fs/xfs/scrub/bmap.c | 2 +- fs/xfs/scrub/common.c | 2 +- fs/xfs/scrub/fscounters.c | 2 +- fs/xfs/scrub/repair.c | 5 +++-- fs/xfs/xfs_discard.c | 2 +- fs/xfs/xfs_extfree_item.c | 6 +++++- fs/xfs/xfs_filestream.c | 2 +- fs/xfs/xfs_fsmap.c | 3 +-- fs/xfs/xfs_reflink.c | 40 +++++++++++++++++++++----------------- fs/xfs/xfs_reflink.h | 3 --- 20 files changed, 70 insertions(+), 83 deletions(-) diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index 2ed15839fb3e..1fa172501b3d 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -120,16 +120,13 @@ xfs_initialize_perag_data( for (index = 0; index < agcount; index++) { /* - * read the agf, then the agi. This gets us - * all the information we need and populates the - * per-ag structures for us. + * Read the AGF and AGI buffers to populate the per-ag + * structures for us. */ - error = xfs_alloc_read_agf(mp, NULL, index, 0, NULL); - if (error) - return error; - pag = xfs_perag_get(mp, index); - error = xfs_ialloc_read_agi(pag, NULL, NULL); + error = xfs_alloc_read_agf(pag, NULL, 0, NULL); + if (!error) + error = xfs_ialloc_read_agi(pag, NULL, NULL); if (error) { xfs_perag_put(pag); return error; @@ -792,7 +789,7 @@ xfs_ag_shrink_space( agi = agibp->b_addr; - error = xfs_alloc_read_agf(mp, *tpp, pag->pag_agno, 0, &agfbp); + error = xfs_alloc_read_agf(pag, *tpp, 0, &agfbp); if (error) return error; @@ -910,7 +907,7 @@ xfs_ag_extend_space( /* * Change agf length. */ - error = xfs_alloc_read_agf(pag->pag_mount, tp, pag->pag_agno, 0, &bp); + error = xfs_alloc_read_agf(pag, tp, 0, &bp); if (error) return error; @@ -953,8 +950,7 @@ xfs_ag_get_geometry( error = xfs_ialloc_read_agi(pag, NULL, &agi_bp); if (error) return error; - error = xfs_alloc_read_agf(pag->pag_mount, NULL, pag->pag_agno, 0, - &agf_bp); + error = xfs_alloc_read_agf(pag, NULL, 0, &agf_bp); if (error) goto out_agi; diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c index ce28bf8f72dc..5af123d13a63 100644 --- a/fs/xfs/libxfs/xfs_ag_resv.c +++ b/fs/xfs/libxfs/xfs_ag_resv.c @@ -322,7 +322,7 @@ out: * address. */ if (has_resv) { - error2 = xfs_alloc_read_agf(mp, tp, pag->pag_agno, 0, NULL); + error2 = xfs_alloc_read_agf(pag, tp, 0, NULL); if (error2) return error2; diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index f7853ab7b962..6912c4efc61e 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -2609,7 +2609,7 @@ xfs_alloc_fix_freelist( ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); if (!pag->pagf_init) { - error = xfs_alloc_read_agf(mp, tp, args->agno, flags, &agbp); + error = xfs_alloc_read_agf(pag, tp, flags, &agbp); if (error) { /* Couldn't lock the AGF so skip this AG. */ if (error == -EAGAIN) @@ -2639,7 +2639,7 @@ xfs_alloc_fix_freelist( * Can fail if we're not blocking on locks, and it's held. */ if (!agbp) { - error = xfs_alloc_read_agf(mp, tp, args->agno, flags, &agbp); + error = xfs_alloc_read_agf(pag, tp, flags, &agbp); if (error) { /* Couldn't lock the AGF so skip this AG. */ if (error == -EAGAIN) @@ -3080,34 +3080,30 @@ xfs_read_agf( * perag structure if necessary. If the caller provides @agfbpp, then return the * locked buffer to the caller, otherwise free it. */ -int /* error */ +int xfs_alloc_read_agf( - struct xfs_mount *mp, /* mount point structure */ - struct xfs_trans *tp, /* transaction pointer */ - xfs_agnumber_t agno, /* allocation group number */ - int flags, /* XFS_ALLOC_FLAG_... */ + struct xfs_perag *pag, + struct xfs_trans *tp, + int flags, struct xfs_buf **agfbpp) { struct xfs_buf *agfbp; - struct xfs_agf *agf; /* ag freelist header */ - struct xfs_perag *pag; /* per allocation group data */ + struct xfs_agf *agf; int error; int allocbt_blks; - trace_xfs_alloc_read_agf(mp, agno); + trace_xfs_alloc_read_agf(pag->pag_mount, pag->pag_agno); /* We don't support trylock when freeing. */ ASSERT((flags & (XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK)) != (XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK)); - ASSERT(agno != NULLAGNUMBER); - error = xfs_read_agf(mp, tp, agno, + error = xfs_read_agf(pag->pag_mount, tp, pag->pag_agno, (flags & XFS_ALLOC_FLAG_TRYLOCK) ? XBF_TRYLOCK : 0, &agfbp); if (error) return error; agf = agfbp->b_addr; - pag = agfbp->b_pag; if (!pag->pagf_init) { pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks); pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks); @@ -3121,7 +3117,7 @@ xfs_alloc_read_agf( be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAPi]); pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level); pag->pagf_init = 1; - pag->pagf_agflreset = xfs_agfl_needs_reset(mp, agf); + pag->pagf_agflreset = xfs_agfl_needs_reset(pag->pag_mount, agf); /* * Update the in-core allocbt counter. Filter out the rmapbt @@ -3131,13 +3127,14 @@ xfs_alloc_read_agf( * counter only tracks non-root blocks. */ allocbt_blks = pag->pagf_btreeblks; - if (xfs_has_rmapbt(mp)) + if (xfs_has_rmapbt(pag->pag_mount)) allocbt_blks -= be32_to_cpu(agf->agf_rmap_blocks) - 1; if (allocbt_blks > 0) - atomic64_add(allocbt_blks, &mp->m_allocbt_blks); + atomic64_add(allocbt_blks, + &pag->pag_mount->m_allocbt_blks); } #ifdef DEBUG - else if (!xfs_is_shutdown(mp)) { + else if (!xfs_is_shutdown(pag->pag_mount)) { ASSERT(pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks)); ASSERT(pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks)); ASSERT(pag->pagf_flcount == be32_to_cpu(agf->agf_flcount)); diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index 96d5301a5c8b..b8cf5beb26d4 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -134,17 +134,6 @@ xfs_alloc_put_freelist( xfs_agblock_t bno, /* block being freed */ int btreeblk); /* owner was a AGF btree */ -/* - * Read in the allocation group header (free/alloc section). - */ -int /* error */ -xfs_alloc_read_agf( - struct xfs_mount *mp, /* mount point structure */ - struct xfs_trans *tp, /* transaction pointer */ - xfs_agnumber_t agno, /* allocation group number */ - int flags, /* XFS_ALLOC_FLAG_... */ - struct xfs_buf **bpp); /* buffer for the ag freelist header */ - /* * Allocate an extent (variable-size). */ @@ -198,6 +187,8 @@ xfs_alloc_get_rec( int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, int flags, struct xfs_buf **bpp); +int xfs_alloc_read_agf(struct xfs_perag *pag, struct xfs_trans *tp, int flags, + struct xfs_buf **agfbpp); int xfs_alloc_read_agfl(struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, struct xfs_buf **bpp); int xfs_free_agfl_block(struct xfs_trans *, xfs_agnumber_t, xfs_agblock_t, diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index a76d5894641b..88828fcf0453 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -3185,7 +3185,7 @@ xfs_bmap_longest_free_extent( pag = xfs_perag_get(mp, ag); if (!pag->pagf_init) { - error = xfs_alloc_read_agf(mp, tp, ag, XFS_ALLOC_FLAG_TRYLOCK, + error = xfs_alloc_read_agf(pag, tp, XFS_ALLOC_FLAG_TRYLOCK, NULL); if (error) { /* Couldn't lock the AGF, so skip this AG. */ diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 8e252207b131..dfa8061f65d9 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -1621,7 +1621,7 @@ xfs_dialloc_good_ag( return false; if (!pag->pagf_init) { - error = xfs_alloc_read_agf(mp, tp, pag->pag_agno, flags, NULL); + error = xfs_alloc_read_agf(pag, tp, flags, NULL); if (error) return false; } diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 97e9e6020596..098dac888c22 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -1177,8 +1177,8 @@ xfs_refcount_finish_one( *pcur = NULL; } if (rcur == NULL) { - error = xfs_alloc_read_agf(tp->t_mountp, tp, pag->pag_agno, - XFS_ALLOC_FLAG_FREEING, &agbp); + error = xfs_alloc_read_agf(pag, tp, XFS_ALLOC_FLAG_FREEING, + &agbp); if (error) goto out_drop; @@ -1710,7 +1710,7 @@ xfs_refcount_recover_cow_leftovers( if (error) return error; - error = xfs_alloc_read_agf(mp, tp, pag->pag_agno, 0, &agbp); + error = xfs_alloc_read_agf(pag, tp, 0, &agbp); if (error) goto out_trans; cur = xfs_refcountbt_init_cursor(mp, tp, agbp, pag); diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c index d14c1720b0fb..1063234df34a 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.c +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -493,7 +493,7 @@ xfs_refcountbt_calc_reserves( if (!xfs_has_reflink(mp)) return 0; - error = xfs_alloc_read_agf(mp, tp, pag->pag_agno, 0, &agbp); + error = xfs_alloc_read_agf(pag, tp, 0, &agbp); if (error) return error; diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index 69e104d0277f..d6d45992fe7b 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -652,7 +652,7 @@ xfs_rmapbt_calc_reserves( if (!xfs_has_rmapbt(mp)) return 0; - error = xfs_alloc_read_agf(mp, tp, pag->pag_agno, 0, &agbp); + error = xfs_alloc_read_agf(pag, tp, 0, &agbp); if (error) return error; diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c index 6da7f2ca77de..230bdfe36e80 100644 --- a/fs/xfs/scrub/agheader_repair.c +++ b/fs/xfs/scrub/agheader_repair.c @@ -666,8 +666,7 @@ xrep_agfl( * nothing wrong with the AGF, but all the AG header repair functions * have this chicken-and-egg problem. */ - error = xfs_alloc_read_agf(mp, sc->tp, sc->sa.pag->pag_agno, 0, - &agf_bp); + error = xfs_alloc_read_agf(sc->sa.pag, sc->tp, 0, &agf_bp); if (error) return error; @@ -742,8 +741,7 @@ xrep_agi_find_btrees( int error; /* Read the AGF. */ - error = xfs_alloc_read_agf(mp, sc->tp, sc->sa.pag->pag_agno, 0, - &agf_bp); + error = xfs_alloc_read_agf(sc->sa.pag, sc->tp, 0, &agf_bp); if (error) return error; diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index 285995ba3947..9353fd060525 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -540,7 +540,7 @@ xchk_bmap_check_ag_rmaps( struct xfs_buf *agf; int error; - error = xfs_alloc_read_agf(sc->mp, sc->tp, pag->pag_agno, 0, &agf); + error = xfs_alloc_read_agf(pag, sc->tp, 0, &agf); if (error) return error; diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index 62997791694a..cd7d4ebd240b 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -420,7 +420,7 @@ xchk_ag_read_headers( if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGI)) return error; - error = xfs_alloc_read_agf(mp, sc->tp, agno, 0, &sa->agf_bp); + error = xfs_alloc_read_agf(sa->pag, sc->tp, 0, &sa->agf_bp); if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGF)) return error; diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c index bd06a184c81c..6a6f8fe7f87c 100644 --- a/fs/xfs/scrub/fscounters.c +++ b/fs/xfs/scrub/fscounters.c @@ -81,7 +81,7 @@ xchk_fscount_warmup( error = xfs_ialloc_read_agi(pag, sc->tp, &agi_bp); if (error) break; - error = xfs_alloc_read_agf(mp, sc->tp, agno, 0, &agf_bp); + error = xfs_alloc_read_agf(pag, sc->tp, 0, &agf_bp); if (error) break; diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index 14acf1df3dd3..1c66f7ee6282 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -207,7 +207,7 @@ xrep_calc_ag_resblks( } /* Now grab the block counters from the AGF. */ - error = xfs_alloc_read_agf(mp, NULL, sm->sm_agno, 0, &bp); + error = xfs_alloc_read_agf(pag, NULL, 0, &bp); if (error) { aglen = xfs_ag_block_count(mp, sm->sm_agno); freelen = aglen; @@ -543,6 +543,7 @@ xrep_reap_block( agno = XFS_FSB_TO_AGNO(sc->mp, fsbno); agbno = XFS_FSB_TO_AGBNO(sc->mp, fsbno); + ASSERT(agno == sc->sa.pag->pag_agno); /* * If we are repairing per-inode metadata, we need to read in the AGF @@ -550,7 +551,7 @@ xrep_reap_block( * the AGF buffer that the setup functions already grabbed. */ if (sc->ip) { - error = xfs_alloc_read_agf(sc->mp, sc->tp, agno, 0, &agf_bp); + error = xfs_alloc_read_agf(sc->sa.pag, sc->tp, 0, &agf_bp); if (error) return error; } else { diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index c6fe3f6ebb6b..bfc829c07f03 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -45,7 +45,7 @@ xfs_trim_extents( */ xfs_log_force(mp, XFS_LOG_SYNC); - error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp); + error = xfs_alloc_read_agf(pag, NULL, 0, &agbp); if (error) goto out_put_perag; agf = agbp->b_addr; diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 765be054dffe..0d0a0b37d8c5 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -11,6 +11,7 @@ #include "xfs_bit.h" #include "xfs_shared.h" #include "xfs_mount.h" +#include "xfs_ag.h" #include "xfs_defer.h" #include "xfs_trans.h" #include "xfs_trans_priv.h" @@ -551,6 +552,7 @@ xfs_agfl_free_finish_item( xfs_agnumber_t agno; xfs_agblock_t agbno; uint next_extent; + struct xfs_perag *pag; free = container_of(item, struct xfs_extent_free_item, xefi_list); ASSERT(free->xefi_blockcount == 1); @@ -560,9 +562,11 @@ xfs_agfl_free_finish_item( trace_xfs_agfl_free_deferred(mp, agno, 0, agbno, free->xefi_blockcount); - error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp); + pag = xfs_perag_get(mp, agno); + error = xfs_alloc_read_agf(pag, tp, 0, &agbp); if (!error) error = xfs_free_agfl_block(tp, agno, agbno, agbp, &oinfo); + xfs_perag_put(pag); /* * Mark the transaction dirty, even on error. This ensures the diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index 6b09a30f8d06..34b21a29c39b 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c @@ -126,7 +126,7 @@ xfs_filestream_pick_ag( pag = xfs_perag_get(mp, ag); if (!pag->pagf_init) { - err = xfs_alloc_read_agf(mp, NULL, ag, trylock, NULL); + err = xfs_alloc_read_agf(pag, NULL, trylock, NULL); if (err) { if (err != -EAGAIN) { xfs_perag_put(pag); diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index bb23199f65c3..d8337274c74d 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -642,8 +642,7 @@ __xfs_getfsmap_datadev( info->agf_bp = NULL; } - error = xfs_alloc_read_agf(mp, tp, pag->pag_agno, 0, - &info->agf_bp); + error = xfs_alloc_read_agf(pag, tp, 0, &info->agf_bp); if (error) break; diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index e7a7c00d93be..81994f4706de 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -125,11 +125,10 @@ * shared blocks. If there are no shared extents, fbno and flen will * be set to NULLAGBLOCK and 0, respectively. */ -int +static int xfs_reflink_find_shared( - struct xfs_mount *mp, + struct xfs_perag *pag, struct xfs_trans *tp, - xfs_agnumber_t agno, xfs_agblock_t agbno, xfs_extlen_t aglen, xfs_agblock_t *fbno, @@ -140,11 +139,11 @@ xfs_reflink_find_shared( struct xfs_btree_cur *cur; int error; - error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp); + error = xfs_alloc_read_agf(pag, tp, 0, &agbp); if (error) return error; - cur = xfs_refcountbt_init_cursor(mp, tp, agbp, agbp->b_pag); + cur = xfs_refcountbt_init_cursor(pag->pag_mount, tp, agbp, pag); error = xfs_refcount_find_shared(cur, agbno, aglen, fbno, flen, find_end_of_shared); @@ -171,7 +170,8 @@ xfs_reflink_trim_around_shared( struct xfs_bmbt_irec *irec, bool *shared) { - xfs_agnumber_t agno; + struct xfs_mount *mp = ip->i_mount; + struct xfs_perag *pag; xfs_agblock_t agbno; xfs_extlen_t aglen; xfs_agblock_t fbno; @@ -186,12 +186,13 @@ xfs_reflink_trim_around_shared( trace_xfs_reflink_trim_around_shared(ip, irec); - agno = XFS_FSB_TO_AGNO(ip->i_mount, irec->br_startblock); - agbno = XFS_FSB_TO_AGBNO(ip->i_mount, irec->br_startblock); + pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, irec->br_startblock)); + agbno = XFS_FSB_TO_AGBNO(mp, irec->br_startblock); aglen = irec->br_blockcount; - error = xfs_reflink_find_shared(ip->i_mount, NULL, agno, agbno, - aglen, &fbno, &flen, true); + error = xfs_reflink_find_shared(pag, NULL, agbno, aglen, &fbno, &flen, + true); + xfs_perag_put(pag); if (error) return error; @@ -1420,11 +1421,6 @@ xfs_reflink_inode_has_shared_extents( struct xfs_bmbt_irec got; struct xfs_mount *mp = ip->i_mount; struct xfs_ifork *ifp; - xfs_agnumber_t agno; - xfs_agblock_t agbno; - xfs_extlen_t aglen; - xfs_agblock_t rbno; - xfs_extlen_t rlen; struct xfs_iext_cursor icur; bool found; int error; @@ -1437,17 +1433,25 @@ xfs_reflink_inode_has_shared_extents( *has_shared = false; found = xfs_iext_lookup_extent(ip, ifp, 0, &icur, &got); while (found) { + struct xfs_perag *pag; + xfs_agblock_t agbno; + xfs_extlen_t aglen; + xfs_agblock_t rbno; + xfs_extlen_t rlen; + if (isnullstartblock(got.br_startblock) || got.br_state != XFS_EXT_NORM) goto next; - agno = XFS_FSB_TO_AGNO(mp, got.br_startblock); + + pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, got.br_startblock)); agbno = XFS_FSB_TO_AGBNO(mp, got.br_startblock); aglen = got.br_blockcount; - - error = xfs_reflink_find_shared(mp, tp, agno, agbno, aglen, + error = xfs_reflink_find_shared(pag, tp, agbno, aglen, &rbno, &rlen, false); + xfs_perag_put(pag); if (error) return error; + /* Is there still a shared block here? */ if (rbno != NULLAGBLOCK) { *has_shared = true; diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h index bea65f2fe657..65c5dfe17ecf 100644 --- a/fs/xfs/xfs_reflink.h +++ b/fs/xfs/xfs_reflink.h @@ -16,9 +16,6 @@ static inline bool xfs_is_cow_inode(struct xfs_inode *ip) return xfs_is_reflink_inode(ip) || xfs_is_always_cow_inode(ip); } -extern int xfs_reflink_find_shared(struct xfs_mount *mp, struct xfs_trans *tp, - xfs_agnumber_t agno, xfs_agblock_t agbno, xfs_extlen_t aglen, - xfs_agblock_t *fbno, xfs_extlen_t *flen, bool find_maximal); extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip, struct xfs_bmbt_irec *irec, bool *shared); int xfs_bmap_trim_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap, -- cgit v1.2.3 From 61021deb1faa5b2b913bf0ad76e2769276160b04 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 7 Jul 2022 19:07:47 +1000 Subject: xfs: pass perag to xfs_read_agi We have the perag in most palces we call xfs_read_agi, so pass the perag instead of a mount/agno pair. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_ialloc.c | 21 ++++++++++----------- fs/xfs/libxfs/xfs_ialloc.h | 10 +++------- fs/xfs/xfs_inode.c | 14 ++++++++------ fs/xfs/xfs_log_recover.c | 38 +++++++++++++++++++------------------- 4 files changed, 40 insertions(+), 43 deletions(-) diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index dfa8061f65d9..55757b990ac6 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -2571,25 +2571,24 @@ const struct xfs_buf_ops xfs_agi_buf_ops = { */ int xfs_read_agi( - struct xfs_mount *mp, /* file system mount structure */ - struct xfs_trans *tp, /* transaction pointer */ - xfs_agnumber_t agno, /* allocation group number */ - struct xfs_buf **bpp) /* allocation group hdr buf */ + struct xfs_perag *pag, + struct xfs_trans *tp, + struct xfs_buf **agibpp) { + struct xfs_mount *mp = pag->pag_mount; int error; - trace_xfs_read_agi(mp, agno); + trace_xfs_read_agi(pag->pag_mount, pag->pag_agno); - ASSERT(agno != NULLAGNUMBER); error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, - XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), 0, bpp, &xfs_agi_buf_ops); + XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGI_DADDR(mp)), + XFS_FSS_TO_BB(mp, 1), 0, agibpp, &xfs_agi_buf_ops); if (error) return error; if (tp) - xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_AGI_BUF); + xfs_trans_buf_set_type(tp, *agibpp, XFS_BLFT_AGI_BUF); - xfs_buf_set_ref(*bpp, XFS_AGI_REF); + xfs_buf_set_ref(*agibpp, XFS_AGI_REF); return 0; } @@ -2609,7 +2608,7 @@ xfs_ialloc_read_agi( trace_xfs_ialloc_read_agi(pag->pag_mount, pag->pag_agno); - error = xfs_read_agi(pag->pag_mount, tp, pag->pag_agno, &agibp); + error = xfs_read_agi(pag, tp, &agibp); if (error) return error; diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h index 72cb33170d9f..9bbbca6ac4ed 100644 --- a/fs/xfs/libxfs/xfs_ialloc.h +++ b/fs/xfs/libxfs/xfs_ialloc.h @@ -62,11 +62,9 @@ xfs_ialloc_log_agi( struct xfs_buf *bp, /* allocation group header buffer */ uint32_t fields); /* bitmask of fields to log */ -/* - * Read in the allocation group header (inode allocation section) - */ -int /* error */ -xfs_ialloc_read_agi(struct xfs_perag *pag, struct xfs_trans *tp, +int xfs_read_agi(struct xfs_perag *pag, struct xfs_trans *tp, + struct xfs_buf **agibpp); +int xfs_ialloc_read_agi(struct xfs_perag *pag, struct xfs_trans *tp, struct xfs_buf **agibpp); /* @@ -89,8 +87,6 @@ int xfs_ialloc_inode_init(struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, xfs_agblock_t agbno, xfs_agblock_t length, unsigned int gen); -int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp, - xfs_agnumber_t agno, struct xfs_buf **bpp); union xfs_btree_rec; void xfs_inobt_btrec_to_irec(struct xfs_mount *mp, diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 3e1c62ffa4f7..29dfc997420f 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2143,7 +2143,7 @@ xfs_iunlink( pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); /* Get the agi buffer first. It ensures lock ordering on the list. */ - error = xfs_read_agi(mp, tp, pag->pag_agno, &agibp); + error = xfs_read_agi(pag, tp, &agibp); if (error) goto out; agi = agibp->b_addr; @@ -2328,7 +2328,7 @@ xfs_iunlink_remove( trace_xfs_iunlink_remove(ip); /* Get the agi buffer first. It ensures lock ordering on the list. */ - error = xfs_read_agi(mp, tp, pag->pag_agno, &agibp); + error = xfs_read_agi(pag, tp, &agibp); if (error) return error; agi = agibp->b_addr; @@ -3229,11 +3229,13 @@ retry: if (inodes[i] == wip || (inodes[i] == target_ip && (VFS_I(target_ip)->i_nlink == 1 || src_is_directory))) { - struct xfs_buf *bp; - xfs_agnumber_t agno; + struct xfs_perag *pag; + struct xfs_buf *bp; - agno = XFS_INO_TO_AGNO(mp, inodes[i]->i_ino); - error = xfs_read_agi(mp, tp, agno, &bp); + pag = xfs_perag_get(mp, + XFS_INO_TO_AGNO(mp, inodes[i]->i_ino)); + error = xfs_read_agi(pag, tp, &bp); + xfs_perag_put(pag); if (error) goto out_trans_cancel; } diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 5f7e4e6e33ce..38aae3409c96 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2629,21 +2629,21 @@ xlog_recover_cancel_intents( */ STATIC void xlog_recover_clear_agi_bucket( - xfs_mount_t *mp, - xfs_agnumber_t agno, - int bucket) + struct xfs_perag *pag, + int bucket) { - xfs_trans_t *tp; - xfs_agi_t *agi; - struct xfs_buf *agibp; - int offset; - int error; + struct xfs_mount *mp = pag->pag_mount; + struct xfs_trans *tp; + struct xfs_agi *agi; + struct xfs_buf *agibp; + int offset; + int error; error = xfs_trans_alloc(mp, &M_RES(mp)->tr_clearagi, 0, 0, 0, &tp); if (error) goto out_error; - error = xfs_read_agi(mp, tp, agno, &agibp); + error = xfs_read_agi(pag, tp, &agibp); if (error) goto out_abort; @@ -2662,14 +2662,14 @@ xlog_recover_clear_agi_bucket( out_abort: xfs_trans_cancel(tp); out_error: - xfs_warn(mp, "%s: failed to clear agi %d. Continuing.", __func__, agno); + xfs_warn(mp, "%s: failed to clear agi %d. Continuing.", __func__, + pag->pag_agno); return; } STATIC xfs_agino_t xlog_recover_process_one_iunlink( - struct xfs_mount *mp, - xfs_agnumber_t agno, + struct xfs_perag *pag, xfs_agino_t agino, int bucket) { @@ -2679,15 +2679,15 @@ xlog_recover_process_one_iunlink( xfs_ino_t ino; int error; - ino = XFS_AGINO_TO_INO(mp, agno, agino); - error = xfs_iget(mp, NULL, ino, 0, 0, &ip); + ino = XFS_AGINO_TO_INO(pag->pag_mount, pag->pag_agno, agino); + error = xfs_iget(pag->pag_mount, NULL, ino, 0, 0, &ip); if (error) goto fail; /* * Get the on disk inode to find the next inode in the bucket. */ - error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &ibp); + error = xfs_imap_to_bp(pag->pag_mount, NULL, &ip->i_imap, &ibp); if (error) goto fail_iput; dip = xfs_buf_offset(ibp, ip->i_imap.im_boffset); @@ -2714,7 +2714,7 @@ xlog_recover_process_one_iunlink( * Call xlog_recover_clear_agi_bucket() to perform a transaction to * clear the inode pointer in the bucket. */ - xlog_recover_clear_agi_bucket(mp, agno, bucket); + xlog_recover_clear_agi_bucket(pag, bucket); return NULLAGINO; } @@ -2755,7 +2755,7 @@ xlog_recover_process_iunlinks( int error; for_each_perag(mp, agno, pag) { - error = xfs_read_agi(mp, NULL, pag->pag_agno, &agibp); + error = xfs_read_agi(pag, NULL, &agibp); if (error) { /* * AGI is b0rked. Don't process it. @@ -2780,8 +2780,8 @@ xlog_recover_process_iunlinks( for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) { agino = be32_to_cpu(agi->agi_unlinked[bucket]); while (agino != NULLAGINO) { - agino = xlog_recover_process_one_iunlink(mp, - pag->pag_agno, agino, bucket); + agino = xlog_recover_process_one_iunlink(pag, + agino, bucket); cond_resched(); } } -- cgit v1.2.3 From fa044ae70c64343b07277256952d22a0dc05b319 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 7 Jul 2022 19:07:54 +1000 Subject: xfs: pass perag to xfs_read_agf We have the perag in most places we call xfs_read_agf, so pass the perag instead of a mount/agno pair. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_alloc.c | 26 ++++++++++++-------------- fs/xfs/libxfs/xfs_alloc.h | 4 ++-- 2 files changed, 14 insertions(+), 16 deletions(-) diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 6912c4efc61e..478dd81e4590 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -3051,27 +3051,25 @@ const struct xfs_buf_ops xfs_agf_buf_ops = { /* * Read in the allocation group header (free/alloc section). */ -int /* error */ +int xfs_read_agf( - struct xfs_mount *mp, /* mount point structure */ - struct xfs_trans *tp, /* transaction pointer */ - xfs_agnumber_t agno, /* allocation group number */ - int flags, /* XFS_BUF_ */ - struct xfs_buf **bpp) /* buffer for the ag freelist header */ + struct xfs_perag *pag, + struct xfs_trans *tp, + int flags, + struct xfs_buf **agfbpp) { - int error; + struct xfs_mount *mp = pag->pag_mount; + int error; - trace_xfs_read_agf(mp, agno); + trace_xfs_read_agf(pag->pag_mount, pag->pag_agno); - ASSERT(agno != NULLAGNUMBER); error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, - XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), flags, bpp, &xfs_agf_buf_ops); + XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGF_DADDR(mp)), + XFS_FSS_TO_BB(mp, 1), flags, agfbpp, &xfs_agf_buf_ops); if (error) return error; - ASSERT(!(*bpp)->b_error); - xfs_buf_set_ref(*bpp, XFS_AGF_REF); + xfs_buf_set_ref(*agfbpp, XFS_AGF_REF); return 0; } @@ -3097,7 +3095,7 @@ xfs_alloc_read_agf( /* We don't support trylock when freeing. */ ASSERT((flags & (XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK)) != (XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK)); - error = xfs_read_agf(pag->pag_mount, tp, pag->pag_agno, + error = xfs_read_agf(pag, tp, (flags & XFS_ALLOC_FLAG_TRYLOCK) ? XBF_TRYLOCK : 0, &agfbp); if (error) diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index b8cf5beb26d4..06e69fe9c957 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -185,8 +185,8 @@ xfs_alloc_get_rec( xfs_extlen_t *len, /* output: length of extent */ int *stat); /* output: success/failure */ -int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp, - xfs_agnumber_t agno, int flags, struct xfs_buf **bpp); +int xfs_read_agf(struct xfs_perag *pag, struct xfs_trans *tp, int flags, + struct xfs_buf **agfbpp); int xfs_alloc_read_agf(struct xfs_perag *pag, struct xfs_trans *tp, int flags, struct xfs_buf **agfbpp); int xfs_alloc_read_agfl(struct xfs_mount *mp, struct xfs_trans *tp, -- cgit v1.2.3 From 49f0d84ec1db5bd46dcf3796fc792fce74ff25a3 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 7 Jul 2022 19:08:01 +1000 Subject: xfs: pass perag to xfs_alloc_get_freelist It's available in all callers, so pass it in so that the perag can be passed further down the stack. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_alloc.c | 8 ++++---- fs/xfs/libxfs/xfs_alloc.h | 13 ++----------- fs/xfs/libxfs/xfs_alloc_btree.c | 6 +++--- fs/xfs/libxfs/xfs_rmap_btree.c | 2 +- fs/xfs/scrub/repair.c | 6 +++--- 5 files changed, 13 insertions(+), 22 deletions(-) diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 478dd81e4590..c0a2c73ec1cf 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -1075,7 +1075,8 @@ xfs_alloc_ag_vextent_small( be32_to_cpu(agf->agf_flcount) <= args->minleft) goto out; - error = xfs_alloc_get_freelist(args->tp, args->agbp, &fbno, 0); + error = xfs_alloc_get_freelist(args->pag, args->tp, args->agbp, + &fbno, 0); if (error) goto error; if (fbno == NULLAGBLOCK) @@ -2697,7 +2698,7 @@ xfs_alloc_fix_freelist( else targs.oinfo = XFS_RMAP_OINFO_AG; while (!(flags & XFS_ALLOC_FLAG_NOSHRINK) && pag->pagf_flcount > need) { - error = xfs_alloc_get_freelist(tp, agbp, &bno, 0); + error = xfs_alloc_get_freelist(pag, tp, agbp, &bno, 0); if (error) goto out_agbp_relse; @@ -2767,6 +2768,7 @@ out_no_agbp: */ int xfs_alloc_get_freelist( + struct xfs_perag *pag, struct xfs_trans *tp, struct xfs_buf *agbp, xfs_agblock_t *bnop, @@ -2779,7 +2781,6 @@ xfs_alloc_get_freelist( int error; uint32_t logflags; struct xfs_mount *mp = tp->t_mountp; - struct xfs_perag *pag; /* * Freelist is empty, give up. @@ -2807,7 +2808,6 @@ xfs_alloc_get_freelist( if (be32_to_cpu(agf->agf_flfirst) == xfs_agfl_size(mp)) agf->agf_flfirst = 0; - pag = agbp->b_pag; ASSERT(!pag->pagf_agflreset); be32_add_cpu(&agf->agf_flcount, -1); pag->pagf_flcount--; diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index 06e69fe9c957..6349f0e5f93d 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -95,6 +95,8 @@ xfs_extlen_t xfs_alloc_longest_free_extent(struct xfs_perag *pag, xfs_extlen_t need, xfs_extlen_t reserved); unsigned int xfs_alloc_min_freelist(struct xfs_mount *mp, struct xfs_perag *pag); +int xfs_alloc_get_freelist(struct xfs_perag *pag, struct xfs_trans *tp, + struct xfs_buf *agfbp, xfs_agblock_t *bnop, int btreeblk); /* * Compute and fill in value of m_alloc_maxlevels. @@ -103,17 +105,6 @@ void xfs_alloc_compute_maxlevels( struct xfs_mount *mp); /* file system mount structure */ -/* - * Get a block from the freelist. - * Returns with the buffer for the block gotten. - */ -int /* error */ -xfs_alloc_get_freelist( - struct xfs_trans *tp, /* transaction pointer */ - struct xfs_buf *agbp, /* buffer containing the agf structure */ - xfs_agblock_t *bnop, /* block address retrieved from freelist */ - int btreeblk); /* destination is a AGF btree */ - /* * Log the given fields from the agf structure. */ diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index 8c9f73cc0bee..a2ead80afb39 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -60,8 +60,8 @@ xfs_allocbt_alloc_block( xfs_agblock_t bno; /* Allocate the new block from the freelist. If we can't, give up. */ - error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_ag.agbp, - &bno, 1); + error = xfs_alloc_get_freelist(cur->bc_ag.pag, cur->bc_tp, + cur->bc_ag.agbp, &bno, 1); if (error) return error; @@ -71,7 +71,7 @@ xfs_allocbt_alloc_block( } atomic64_inc(&cur->bc_mp->m_allocbt_blks); - xfs_extent_busy_reuse(cur->bc_mp, cur->bc_ag.agbp->b_pag, bno, 1, false); + xfs_extent_busy_reuse(cur->bc_mp, cur->bc_ag.pag, bno, 1, false); new->s = cpu_to_be32(bno); diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index d6d45992fe7b..fbbbeda1b06d 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -90,7 +90,7 @@ xfs_rmapbt_alloc_block( xfs_agblock_t bno; /* Allocate the new block from the freelist. If we can't, give up. */ - error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_ag.agbp, + error = xfs_alloc_get_freelist(pag, cur->bc_tp, cur->bc_ag.agbp, &bno, 1); if (error) return error; diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index 1c66f7ee6282..cd6c92b070f8 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -300,13 +300,13 @@ xrep_alloc_ag_block( switch (resv) { case XFS_AG_RESV_AGFL: case XFS_AG_RESV_RMAPBT: - error = xfs_alloc_get_freelist(sc->tp, sc->sa.agf_bp, &bno, 1); + error = xfs_alloc_get_freelist(sc->sa.pag, sc->tp, + sc->sa.agf_bp, &bno, 1); if (error) return error; if (bno == NULLAGBLOCK) return -ENOSPC; - xfs_extent_busy_reuse(sc->mp, sc->sa.pag, bno, - 1, false); + xfs_extent_busy_reuse(sc->mp, sc->sa.pag, bno, 1, false); *fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno, bno); if (resv == XFS_AG_RESV_RMAPBT) xfs_ag_resv_rmapbt_alloc(sc->mp, sc->sa.pag->pag_agno); -- cgit v1.2.3 From 8c392eb27f7a98e403658d066e387c7b1c604f2b Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 7 Jul 2022 19:08:08 +1000 Subject: xfs: pass perag to xfs_alloc_put_freelist It's available in all callers, so pass it in so that the perag can be passed further down the stack. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_alloc.c | 5 ++--- fs/xfs/libxfs/xfs_alloc.h | 14 +++----------- fs/xfs/libxfs/xfs_alloc_btree.c | 3 ++- fs/xfs/libxfs/xfs_rmap_btree.c | 2 +- fs/xfs/scrub/repair.c | 4 ++-- 5 files changed, 10 insertions(+), 18 deletions(-) diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index c0a2c73ec1cf..9ab0bd07d1d8 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -2742,7 +2742,7 @@ xfs_alloc_fix_freelist( * Put each allocated block on the list. */ for (bno = targs.agbno; bno < targs.agbno + targs.len; bno++) { - error = xfs_alloc_put_freelist(tp, agbp, + error = xfs_alloc_put_freelist(pag, tp, agbp, agflbp, bno, 0); if (error) goto out_agflbp_relse; @@ -2872,6 +2872,7 @@ xfs_alloc_log_agf( */ int xfs_alloc_put_freelist( + struct xfs_perag *pag, struct xfs_trans *tp, struct xfs_buf *agbp, struct xfs_buf *agflbp, @@ -2880,7 +2881,6 @@ xfs_alloc_put_freelist( { struct xfs_mount *mp = tp->t_mountp; struct xfs_agf *agf = agbp->b_addr; - struct xfs_perag *pag; __be32 *blockp; int error; uint32_t logflags; @@ -2894,7 +2894,6 @@ xfs_alloc_put_freelist( if (be32_to_cpu(agf->agf_fllast) == xfs_agfl_size(mp)) agf->agf_fllast = 0; - pag = agbp->b_pag; ASSERT(!pag->pagf_agflreset); be32_add_cpu(&agf->agf_flcount, 1); pag->pagf_flcount++; diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index 6349f0e5f93d..d32a70a28c32 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -97,6 +97,9 @@ unsigned int xfs_alloc_min_freelist(struct xfs_mount *mp, struct xfs_perag *pag); int xfs_alloc_get_freelist(struct xfs_perag *pag, struct xfs_trans *tp, struct xfs_buf *agfbp, xfs_agblock_t *bnop, int btreeblk); +int xfs_alloc_put_freelist(struct xfs_perag *pag, struct xfs_trans *tp, + struct xfs_buf *agfbp, struct xfs_buf *agflbp, + xfs_agblock_t bno, int btreeblk); /* * Compute and fill in value of m_alloc_maxlevels. @@ -114,17 +117,6 @@ xfs_alloc_log_agf( struct xfs_buf *bp, /* buffer for a.g. freelist header */ uint32_t fields);/* mask of fields to be logged (XFS_AGF_...) */ -/* - * Put the block on the freelist for the allocation group. - */ -int /* error */ -xfs_alloc_put_freelist( - struct xfs_trans *tp, /* transaction pointer */ - struct xfs_buf *agbp, /* buffer for a.g. freelist header */ - struct xfs_buf *agflbp,/* buffer for a.g. free block array */ - xfs_agblock_t bno, /* block being freed */ - int btreeblk); /* owner was a AGF btree */ - /* * Allocate an extent (variable-size). */ diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index a2ead80afb39..549a3cba0234 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -89,7 +89,8 @@ xfs_allocbt_free_block( int error; bno = xfs_daddr_to_agbno(cur->bc_mp, xfs_buf_daddr(bp)); - error = xfs_alloc_put_freelist(cur->bc_tp, agbp, NULL, bno, 1); + error = xfs_alloc_put_freelist(cur->bc_ag.pag, cur->bc_tp, agbp, NULL, + bno, 1); if (error) return error; diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index fbbbeda1b06d..1ae14d0c831c 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -129,7 +129,7 @@ xfs_rmapbt_free_block( bno, 1); be32_add_cpu(&agf->agf_rmap_blocks, -1); xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS); - error = xfs_alloc_put_freelist(cur->bc_tp, agbp, NULL, bno, 1); + error = xfs_alloc_put_freelist(pag, cur->bc_tp, agbp, NULL, bno, 1); if (error) return error; diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index cd6c92b070f8..c983b76e070f 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -516,8 +516,8 @@ xrep_put_freelist( return error; /* Put the block on the AGFL. */ - error = xfs_alloc_put_freelist(sc->tp, sc->sa.agf_bp, sc->sa.agfl_bp, - agbno, 0); + error = xfs_alloc_put_freelist(sc->sa.pag, sc->tp, sc->sa.agf_bp, + sc->sa.agfl_bp, agbno, 0); if (error) return error; xfs_extent_busy_insert(sc->tp, sc->sa.pag, agbno, 1, -- cgit v1.2.3 From cec7bb7d58fa0e644f8cec46b081bf5427c1a0f8 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 7 Jul 2022 19:08:15 +1000 Subject: xfs: pass perag to xfs_alloc_read_agfl We have the perag in most places we call xfs_alloc_read_agfl, so pass the perag instead of a mount/agno pair. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_alloc.c | 31 ++++++++++++++++--------------- fs/xfs/libxfs/xfs_alloc.h | 4 ++-- fs/xfs/scrub/agheader_repair.c | 2 +- fs/xfs/scrub/common.c | 2 +- 4 files changed, 20 insertions(+), 19 deletions(-) diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 9ab0bd07d1d8..ea9452950647 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -703,20 +703,19 @@ const struct xfs_buf_ops xfs_agfl_buf_ops = { /* * Read in the allocation group free block array. */ -int /* error */ +int xfs_alloc_read_agfl( - xfs_mount_t *mp, /* mount point structure */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_agnumber_t agno, /* allocation group number */ - struct xfs_buf **bpp) /* buffer for the ag free block array */ + struct xfs_perag *pag, + struct xfs_trans *tp, + struct xfs_buf **bpp) { - struct xfs_buf *bp; /* return value */ - int error; + struct xfs_mount *mp = pag->pag_mount; + struct xfs_buf *bp; + int error; - ASSERT(agno != NULLAGNUMBER); error = xfs_trans_read_buf( mp, tp, mp->m_ddev_targp, - XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)), + XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGFL_DADDR(mp)), XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_agfl_buf_ops); if (error) return error; @@ -2713,7 +2712,7 @@ xfs_alloc_fix_freelist( targs.alignment = targs.minlen = targs.prod = 1; targs.type = XFS_ALLOCTYPE_THIS_AG; targs.pag = pag; - error = xfs_alloc_read_agfl(mp, tp, targs.agno, &agflbp); + error = xfs_alloc_read_agfl(pag, tp, &agflbp); if (error) goto out_agbp_relse; @@ -2792,8 +2791,7 @@ xfs_alloc_get_freelist( /* * Read the array of free blocks. */ - error = xfs_alloc_read_agfl(mp, tp, be32_to_cpu(agf->agf_seqno), - &agflbp); + error = xfs_alloc_read_agfl(pag, tp, &agflbp); if (error) return error; @@ -2887,9 +2885,12 @@ xfs_alloc_put_freelist( __be32 *agfl_bno; int startoff; - if (!agflbp && (error = xfs_alloc_read_agfl(mp, tp, - be32_to_cpu(agf->agf_seqno), &agflbp))) - return error; + if (!agflbp) { + error = xfs_alloc_read_agfl(pag, tp, &agflbp); + if (error) + return error; + } + be32_add_cpu(&agf->agf_fllast, 1); if (be32_to_cpu(agf->agf_fllast) == xfs_agfl_size(mp)) agf->agf_fllast = 0; diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index d32a70a28c32..2c3f762dfb58 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -172,8 +172,8 @@ int xfs_read_agf(struct xfs_perag *pag, struct xfs_trans *tp, int flags, struct xfs_buf **agfbpp); int xfs_alloc_read_agf(struct xfs_perag *pag, struct xfs_trans *tp, int flags, struct xfs_buf **agfbpp); -int xfs_alloc_read_agfl(struct xfs_mount *mp, struct xfs_trans *tp, - xfs_agnumber_t agno, struct xfs_buf **bpp); +int xfs_alloc_read_agfl(struct xfs_perag *pag, struct xfs_trans *tp, + struct xfs_buf **bpp); int xfs_free_agfl_block(struct xfs_trans *, xfs_agnumber_t, xfs_agblock_t, struct xfs_buf *, struct xfs_owner_info *); int xfs_alloc_fix_freelist(struct xfs_alloc_arg *args, int flags); diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c index 230bdfe36e80..10ac1118a595 100644 --- a/fs/xfs/scrub/agheader_repair.c +++ b/fs/xfs/scrub/agheader_repair.c @@ -405,7 +405,7 @@ xrep_agf( * btrees rooted in the AGF. If the AGFL contents are obviously bad * then we'll bail out. */ - error = xfs_alloc_read_agfl(mp, sc->tp, sc->sa.pag->pag_agno, &agfl_bp); + error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp); if (error) return error; diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index cd7d4ebd240b..9bbbf20f401b 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -424,7 +424,7 @@ xchk_ag_read_headers( if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGF)) return error; - error = xfs_alloc_read_agfl(mp, sc->tp, agno, &sa->agfl_bp); + error = xfs_alloc_read_agfl(sa->pag, sc->tp, &sa->agfl_bp); if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGFL)) return error; -- cgit v1.2.3 From 0800169e3e2c97a033e8b7f3d1e6c689e0d71a19 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 7 Jul 2022 19:13:02 +1000 Subject: xfs: Pre-calculate per-AG agbno geometry There is a lot of overhead in functions like xfs_verify_agbno() that repeatedly calculate the geometry limits of an AG. These can be pre-calculated as they are static and the verification context has a per-ag context it can quickly reference. In the case of xfs_verify_agbno(), we now always have a perag context handy, so we can store the AG length and the minimum valid block in the AG in the perag. This means we don't have to calculate it on every call and it can be inlined in callers if we move it to xfs_ag.h. Move xfs_ag_block_count() to xfs_ag.c because it's really a per-ag function and not an XFS type function. We need a little bit of rework that is specific to xfs_initialise_perag() to allow growfs to calculate the new perag sizes before we've updated the primary superblock during the grow (chicken/egg situation). Note that we leave the original xfs_verify_agbno in place in xfs_types.c as a static function as other callers in that file do not have per-ag contexts so still need to go the long way. It's been renamed to xfs_verify_agno_agbno() to indicate it takes both an agno and an agbno to differentiate it from new function. Future commits will make similar changes for other per-ag geometry validation functions. Further: $ size --totals fs/xfs/built-in.a text data bss dec hex filename before 1483006 329588 572 1813166 1baaae (TOTALS) after 1482185 329588 572 1812345 1ba779 (TOTALS) This rework reduces the binary size by ~820 bytes, indicating that much less work is being done to bounds check the agbno values against on per-ag geometry information. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_ag.c | 40 +++++++++++++++++++++++++++++++++++++++- fs/xfs/libxfs/xfs_ag.h | 21 ++++++++++++++++++++- fs/xfs/libxfs/xfs_alloc.c | 9 +++++---- fs/xfs/libxfs/xfs_btree.c | 25 ++++++++++--------------- fs/xfs/libxfs/xfs_refcount.c | 13 ++++++------- fs/xfs/libxfs/xfs_rmap.c | 8 ++++---- fs/xfs/libxfs/xfs_types.c | 18 +++--------------- fs/xfs/libxfs/xfs_types.h | 3 --- fs/xfs/scrub/agheader.c | 19 +++++++++---------- fs/xfs/scrub/agheader_repair.c | 7 ++----- fs/xfs/scrub/alloc.c | 7 +++---- fs/xfs/scrub/ialloc.c | 6 +++--- fs/xfs/scrub/refcount.c | 7 +++---- fs/xfs/scrub/rmap.c | 6 +++--- fs/xfs/xfs_fsops.c | 2 +- fs/xfs/xfs_log_recover.c | 3 ++- fs/xfs/xfs_mount.c | 3 ++- 17 files changed, 115 insertions(+), 82 deletions(-) diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index 1fa172501b3d..8f3e6ee85c34 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -201,10 +201,35 @@ xfs_free_perag( } } +/* Find the size of the AG, in blocks. */ +static xfs_agblock_t +__xfs_ag_block_count( + struct xfs_mount *mp, + xfs_agnumber_t agno, + xfs_agnumber_t agcount, + xfs_rfsblock_t dblocks) +{ + ASSERT(agno < agcount); + + if (agno < agcount - 1) + return mp->m_sb.sb_agblocks; + return dblocks - (agno * mp->m_sb.sb_agblocks); +} + +xfs_agblock_t +xfs_ag_block_count( + struct xfs_mount *mp, + xfs_agnumber_t agno) +{ + return __xfs_ag_block_count(mp, agno, mp->m_sb.sb_agcount, + mp->m_sb.sb_dblocks); +} + int xfs_initialize_perag( struct xfs_mount *mp, xfs_agnumber_t agcount, + xfs_rfsblock_t dblocks, xfs_agnumber_t *maxagi) { struct xfs_perag *pag; @@ -270,6 +295,13 @@ xfs_initialize_perag( /* first new pag is fully initialized */ if (first_initialised == NULLAGNUMBER) first_initialised = index; + + /* + * Pre-calculated geometry + */ + pag->block_count = __xfs_ag_block_count(mp, index, agcount, + dblocks); + pag->min_block = XFS_AGFL_BLOCK(mp); } index = xfs_set_inode_alloc(mp, agcount); @@ -927,10 +959,16 @@ xfs_ag_extend_space( if (error) return error; - return xfs_free_extent(tp, XFS_AGB_TO_FSB(pag->pag_mount, pag->pag_agno, + error = xfs_free_extent(tp, XFS_AGB_TO_FSB(pag->pag_mount, pag->pag_agno, be32_to_cpu(agf->agf_length) - len), len, &XFS_RMAP_OINFO_SKIP_UPDATE, XFS_AG_RESV_NONE); + if (error) + return error; + + /* Update perag geometry */ + pag->block_count = be32_to_cpu(agf->agf_length); + return 0; } /* Retrieve AG geometry. */ diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h index 1132cda9a92f..77640f1409fd 100644 --- a/fs/xfs/libxfs/xfs_ag.h +++ b/fs/xfs/libxfs/xfs_ag.h @@ -67,6 +67,10 @@ struct xfs_perag { /* for rcu-safe freeing */ struct rcu_head rcu_head; + /* Precalculated geometry info */ + xfs_agblock_t block_count; + xfs_agblock_t min_block; + #ifdef __KERNEL__ /* -- kernel only structures below this line -- */ @@ -107,7 +111,7 @@ struct xfs_perag { }; int xfs_initialize_perag(struct xfs_mount *mp, xfs_agnumber_t agcount, - xfs_agnumber_t *maxagi); + xfs_rfsblock_t dcount, xfs_agnumber_t *maxagi); int xfs_initialize_perag_data(struct xfs_mount *mp, xfs_agnumber_t agno); void xfs_free_perag(struct xfs_mount *mp); @@ -116,6 +120,21 @@ struct xfs_perag *xfs_perag_get_tag(struct xfs_mount *mp, xfs_agnumber_t agno, unsigned int tag); void xfs_perag_put(struct xfs_perag *pag); +/* + * Per-ag geometry infomation and validation + */ +xfs_agblock_t xfs_ag_block_count(struct xfs_mount *mp, xfs_agnumber_t agno); + +static inline bool +xfs_verify_agbno(struct xfs_perag *pag, xfs_agblock_t agbno) +{ + if (agbno >= pag->block_count) + return false; + if (agbno <= pag->min_block) + return false; + return true; +} + /* * Perag iteration APIs */ diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index ea9452950647..41557c430cb6 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -248,7 +248,7 @@ xfs_alloc_get_rec( int *stat) /* output: success/failure */ { struct xfs_mount *mp = cur->bc_mp; - xfs_agnumber_t agno = cur->bc_ag.pag->pag_agno; + struct xfs_perag *pag = cur->bc_ag.pag; union xfs_btree_rec *rec; int error; @@ -263,11 +263,11 @@ xfs_alloc_get_rec( goto out_bad_rec; /* check for valid extent range, including overflow */ - if (!xfs_verify_agbno(mp, agno, *bno)) + if (!xfs_verify_agbno(pag, *bno)) goto out_bad_rec; if (*bno > *bno + *len) goto out_bad_rec; - if (!xfs_verify_agbno(mp, agno, *bno + *len - 1)) + if (!xfs_verify_agbno(pag, *bno + *len - 1)) goto out_bad_rec; return 0; @@ -275,7 +275,8 @@ xfs_alloc_get_rec( out_bad_rec: xfs_warn(mp, "%s Freespace BTree record corruption in AG %d detected!", - cur->bc_btnum == XFS_BTNUM_BNO ? "Block" : "Size", agno); + cur->bc_btnum == XFS_BTNUM_BNO ? "Block" : "Size", + pag->pag_agno); xfs_warn(mp, "start block 0x%x block count 0x%x", *bno, *len); return -EFSCORRUPTED; diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 2eecc49fc1b2..06ab364d2de3 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -91,10 +91,9 @@ xfs_btree_check_lblock_siblings( static inline xfs_failaddr_t xfs_btree_check_sblock_siblings( - struct xfs_mount *mp, + struct xfs_perag *pag, struct xfs_btree_cur *cur, int level, - xfs_agnumber_t agno, xfs_agblock_t agbno, __be32 dsibling) { @@ -110,7 +109,7 @@ xfs_btree_check_sblock_siblings( if (!xfs_btree_check_sptr(cur, sibling, level + 1)) return __this_address; } else { - if (!xfs_verify_agbno(mp, agno, sibling)) + if (!xfs_verify_agbno(pag, sibling)) return __this_address; } return NULL; @@ -195,11 +194,11 @@ __xfs_btree_check_sblock( struct xfs_buf *bp) { struct xfs_mount *mp = cur->bc_mp; + struct xfs_perag *pag = cur->bc_ag.pag; xfs_btnum_t btnum = cur->bc_btnum; int crc = xfs_has_crc(mp); xfs_failaddr_t fa; xfs_agblock_t agbno = NULLAGBLOCK; - xfs_agnumber_t agno = NULLAGNUMBER; if (crc) { if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid)) @@ -217,16 +216,14 @@ __xfs_btree_check_sblock( cur->bc_ops->get_maxrecs(cur, level)) return __this_address; - if (bp) { + if (bp) agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp)); - agno = xfs_daddr_to_agno(mp, xfs_buf_daddr(bp)); - } - fa = xfs_btree_check_sblock_siblings(mp, cur, level, agno, agbno, + fa = xfs_btree_check_sblock_siblings(pag, cur, level, agbno, block->bb_u.s.bb_leftsib); if (!fa) - fa = xfs_btree_check_sblock_siblings(mp, cur, level, agno, - agbno, block->bb_u.s.bb_rightsib); + fa = xfs_btree_check_sblock_siblings(pag, cur, level, agbno, + block->bb_u.s.bb_rightsib); return fa; } @@ -288,7 +285,7 @@ xfs_btree_check_sptr( { if (level <= 0) return false; - return xfs_verify_agbno(cur->bc_mp, cur->bc_ag.pag->pag_agno, agbno); + return xfs_verify_agbno(cur->bc_ag.pag, agbno); } /* @@ -4595,7 +4592,6 @@ xfs_btree_sblock_verify( { struct xfs_mount *mp = bp->b_mount; struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); - xfs_agnumber_t agno; xfs_agblock_t agbno; xfs_failaddr_t fa; @@ -4604,12 +4600,11 @@ xfs_btree_sblock_verify( return __this_address; /* sibling pointer verification */ - agno = xfs_daddr_to_agno(mp, xfs_buf_daddr(bp)); agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp)); - fa = xfs_btree_check_sblock_siblings(mp, NULL, -1, agno, agbno, + fa = xfs_btree_check_sblock_siblings(bp->b_pag, NULL, -1, agbno, block->bb_u.s.bb_leftsib); if (!fa) - fa = xfs_btree_check_sblock_siblings(mp, NULL, -1, agno, agbno, + fa = xfs_btree_check_sblock_siblings(bp->b_pag, NULL, -1, agbno, block->bb_u.s.bb_rightsib); return fa; } diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 098dac888c22..64b910caafaa 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -111,7 +111,7 @@ xfs_refcount_get_rec( int *stat) { struct xfs_mount *mp = cur->bc_mp; - xfs_agnumber_t agno = cur->bc_ag.pag->pag_agno; + struct xfs_perag *pag = cur->bc_ag.pag; union xfs_btree_rec *rec; int error; xfs_agblock_t realstart; @@ -121,8 +121,6 @@ xfs_refcount_get_rec( return error; xfs_refcount_btrec_to_irec(rec, irec); - - agno = cur->bc_ag.pag->pag_agno; if (irec->rc_blockcount == 0 || irec->rc_blockcount > MAXREFCEXTLEN) goto out_bad_rec; @@ -137,22 +135,23 @@ xfs_refcount_get_rec( } /* check for valid extent range, including overflow */ - if (!xfs_verify_agbno(mp, agno, realstart)) + if (!xfs_verify_agbno(pag, realstart)) goto out_bad_rec; if (realstart > realstart + irec->rc_blockcount) goto out_bad_rec; - if (!xfs_verify_agbno(mp, agno, realstart + irec->rc_blockcount - 1)) + if (!xfs_verify_agbno(pag, realstart + irec->rc_blockcount - 1)) goto out_bad_rec; if (irec->rc_refcount == 0 || irec->rc_refcount > MAXREFCOUNT) goto out_bad_rec; - trace_xfs_refcount_get(cur->bc_mp, cur->bc_ag.pag->pag_agno, irec); + trace_xfs_refcount_get(cur->bc_mp, pag->pag_agno, irec); return 0; out_bad_rec: xfs_warn(mp, - "Refcount BTree record corruption in AG %d detected!", agno); + "Refcount BTree record corruption in AG %d detected!", + pag->pag_agno); xfs_warn(mp, "Start block 0x%x, block count 0x%x, references 0x%x", irec->rc_startblock, irec->rc_blockcount, irec->rc_refcount); diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index 2845019d31da..094dfc897ebc 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -215,7 +215,7 @@ xfs_rmap_get_rec( int *stat) { struct xfs_mount *mp = cur->bc_mp; - xfs_agnumber_t agno = cur->bc_ag.pag->pag_agno; + struct xfs_perag *pag = cur->bc_ag.pag; union xfs_btree_rec *rec; int error; @@ -235,12 +235,12 @@ xfs_rmap_get_rec( goto out_bad_rec; } else { /* check for valid extent range, including overflow */ - if (!xfs_verify_agbno(mp, agno, irec->rm_startblock)) + if (!xfs_verify_agbno(pag, irec->rm_startblock)) goto out_bad_rec; if (irec->rm_startblock > irec->rm_startblock + irec->rm_blockcount) goto out_bad_rec; - if (!xfs_verify_agbno(mp, agno, + if (!xfs_verify_agbno(pag, irec->rm_startblock + irec->rm_blockcount - 1)) goto out_bad_rec; } @@ -254,7 +254,7 @@ xfs_rmap_get_rec( out_bad_rec: xfs_warn(mp, "Reverse Mapping BTree record corruption in AG %d detected!", - agno); + pag->pag_agno); xfs_warn(mp, "Owner 0x%llx, flags 0x%x, start block 0x%x block count 0x%x", irec->rm_owner, irec->rm_flags, irec->rm_startblock, diff --git a/fs/xfs/libxfs/xfs_types.c b/fs/xfs/libxfs/xfs_types.c index e810d23f2d97..b3c6b0274e95 100644 --- a/fs/xfs/libxfs/xfs_types.c +++ b/fs/xfs/libxfs/xfs_types.c @@ -13,25 +13,13 @@ #include "xfs_mount.h" #include "xfs_ag.h" -/* Find the size of the AG, in blocks. */ -inline xfs_agblock_t -xfs_ag_block_count( - struct xfs_mount *mp, - xfs_agnumber_t agno) -{ - ASSERT(agno < mp->m_sb.sb_agcount); - - if (agno < mp->m_sb.sb_agcount - 1) - return mp->m_sb.sb_agblocks; - return mp->m_sb.sb_dblocks - (agno * mp->m_sb.sb_agblocks); -} /* * Verify that an AG block number pointer neither points outside the AG * nor points at static metadata. */ -inline bool -xfs_verify_agbno( +static inline bool +xfs_verify_agno_agbno( struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno) @@ -59,7 +47,7 @@ xfs_verify_fsbno( if (agno >= mp->m_sb.sb_agcount) return false; - return xfs_verify_agbno(mp, agno, XFS_FSB_TO_AGBNO(mp, fsbno)); + return xfs_verify_agno_agbno(mp, agno, XFS_FSB_TO_AGBNO(mp, fsbno)); } /* diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h index 373f64a492a4..ccf61afb959d 100644 --- a/fs/xfs/libxfs/xfs_types.h +++ b/fs/xfs/libxfs/xfs_types.h @@ -179,9 +179,6 @@ enum xfs_ag_resv_type { */ struct xfs_mount; -xfs_agblock_t xfs_ag_block_count(struct xfs_mount *mp, xfs_agnumber_t agno); -bool xfs_verify_agbno(struct xfs_mount *mp, xfs_agnumber_t agno, - xfs_agblock_t agbno); bool xfs_verify_fsbno(struct xfs_mount *mp, xfs_fsblock_t fsbno); bool xfs_verify_fsbext(struct xfs_mount *mp, xfs_fsblock_t fsbno, xfs_fsblock_t len); diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c index 90aebfe9dc5f..181bba5f9b8f 100644 --- a/fs/xfs/scrub/agheader.c +++ b/fs/xfs/scrub/agheader.c @@ -541,16 +541,16 @@ xchk_agf( /* Check the AG length */ eoag = be32_to_cpu(agf->agf_length); - if (eoag != xfs_ag_block_count(mp, agno)) + if (eoag != pag->block_count) xchk_block_set_corrupt(sc, sc->sa.agf_bp); /* Check the AGF btree roots and levels */ agbno = be32_to_cpu(agf->agf_roots[XFS_BTNUM_BNO]); - if (!xfs_verify_agbno(mp, agno, agbno)) + if (!xfs_verify_agbno(pag, agbno)) xchk_block_set_corrupt(sc, sc->sa.agf_bp); agbno = be32_to_cpu(agf->agf_roots[XFS_BTNUM_CNT]); - if (!xfs_verify_agbno(mp, agno, agbno)) + if (!xfs_verify_agbno(pag, agbno)) xchk_block_set_corrupt(sc, sc->sa.agf_bp); level = be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]); @@ -563,7 +563,7 @@ xchk_agf( if (xfs_has_rmapbt(mp)) { agbno = be32_to_cpu(agf->agf_roots[XFS_BTNUM_RMAP]); - if (!xfs_verify_agbno(mp, agno, agbno)) + if (!xfs_verify_agbno(pag, agbno)) xchk_block_set_corrupt(sc, sc->sa.agf_bp); level = be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]); @@ -573,7 +573,7 @@ xchk_agf( if (xfs_has_reflink(mp)) { agbno = be32_to_cpu(agf->agf_refcount_root); - if (!xfs_verify_agbno(mp, agno, agbno)) + if (!xfs_verify_agbno(pag, agbno)) xchk_block_set_corrupt(sc, sc->sa.agf_bp); level = be32_to_cpu(agf->agf_refcount_level); @@ -639,9 +639,8 @@ xchk_agfl_block( { struct xchk_agfl_info *sai = priv; struct xfs_scrub *sc = sai->sc; - xfs_agnumber_t agno = sc->sa.pag->pag_agno; - if (xfs_verify_agbno(mp, agno, agbno) && + if (xfs_verify_agbno(sc->sa.pag, agbno) && sai->nr_entries < sai->sz_entries) sai->entries[sai->nr_entries++] = agbno; else @@ -871,12 +870,12 @@ xchk_agi( /* Check the AG length */ eoag = be32_to_cpu(agi->agi_length); - if (eoag != xfs_ag_block_count(mp, agno)) + if (eoag != pag->block_count) xchk_block_set_corrupt(sc, sc->sa.agi_bp); /* Check btree roots and levels */ agbno = be32_to_cpu(agi->agi_root); - if (!xfs_verify_agbno(mp, agno, agbno)) + if (!xfs_verify_agbno(pag, agbno)) xchk_block_set_corrupt(sc, sc->sa.agi_bp); level = be32_to_cpu(agi->agi_level); @@ -885,7 +884,7 @@ xchk_agi( if (xfs_has_finobt(mp)) { agbno = be32_to_cpu(agi->agi_free_root); - if (!xfs_verify_agbno(mp, agno, agbno)) + if (!xfs_verify_agbno(pag, agbno)) xchk_block_set_corrupt(sc, sc->sa.agi_bp); level = be32_to_cpu(agi->agi_free_level); diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c index 10ac1118a595..ba012a5da0bf 100644 --- a/fs/xfs/scrub/agheader_repair.c +++ b/fs/xfs/scrub/agheader_repair.c @@ -106,7 +106,7 @@ xrep_agf_check_agfl_block( { struct xfs_scrub *sc = priv; - if (!xfs_verify_agbno(mp, sc->sa.pag->pag_agno, agbno)) + if (!xfs_verify_agbno(sc->sa.pag, agbno)) return -EFSCORRUPTED; return 0; } @@ -130,10 +130,7 @@ xrep_check_btree_root( struct xfs_scrub *sc, struct xrep_find_ag_btree *fab) { - struct xfs_mount *mp = sc->mp; - xfs_agnumber_t agno = sc->sm->sm_agno; - - return xfs_verify_agbno(mp, agno, fab->root) && + return xfs_verify_agbno(sc->sa.pag, fab->root) && fab->height <= fab->maxlevels; } diff --git a/fs/xfs/scrub/alloc.c b/fs/xfs/scrub/alloc.c index 87518e1292f8..ab427b4d7fe0 100644 --- a/fs/xfs/scrub/alloc.c +++ b/fs/xfs/scrub/alloc.c @@ -93,8 +93,7 @@ xchk_allocbt_rec( struct xchk_btree *bs, const union xfs_btree_rec *rec) { - struct xfs_mount *mp = bs->cur->bc_mp; - xfs_agnumber_t agno = bs->cur->bc_ag.pag->pag_agno; + struct xfs_perag *pag = bs->cur->bc_ag.pag; xfs_agblock_t bno; xfs_extlen_t len; @@ -102,8 +101,8 @@ xchk_allocbt_rec( len = be32_to_cpu(rec->alloc.ar_blockcount); if (bno + len <= bno || - !xfs_verify_agbno(mp, agno, bno) || - !xfs_verify_agbno(mp, agno, bno + len - 1)) + !xfs_verify_agbno(pag, bno) || + !xfs_verify_agbno(pag, bno + len - 1)) xchk_btree_set_corrupt(bs->sc, bs->cur, 0); xchk_allocbt_xref(bs->sc, bno, len); diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c index 00848ee542fb..b80a54be8634 100644 --- a/fs/xfs/scrub/ialloc.c +++ b/fs/xfs/scrub/ialloc.c @@ -104,13 +104,13 @@ xchk_iallocbt_chunk( xfs_extlen_t len) { struct xfs_mount *mp = bs->cur->bc_mp; - xfs_agnumber_t agno = bs->cur->bc_ag.pag->pag_agno; + struct xfs_perag *pag = bs->cur->bc_ag.pag; xfs_agblock_t bno; bno = XFS_AGINO_TO_AGBNO(mp, agino); if (bno + len <= bno || - !xfs_verify_agbno(mp, agno, bno) || - !xfs_verify_agbno(mp, agno, bno + len - 1)) + !xfs_verify_agbno(pag, bno) || + !xfs_verify_agbno(pag, bno + len - 1)) xchk_btree_set_corrupt(bs->sc, bs->cur, 0); xchk_iallocbt_chunk_xref(bs->sc, irec, agino, bno, len); diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c index 2744eecdbaf0..3f82a1a1f390 100644 --- a/fs/xfs/scrub/refcount.c +++ b/fs/xfs/scrub/refcount.c @@ -332,9 +332,8 @@ xchk_refcountbt_rec( struct xchk_btree *bs, const union xfs_btree_rec *rec) { - struct xfs_mount *mp = bs->cur->bc_mp; xfs_agblock_t *cow_blocks = bs->private; - xfs_agnumber_t agno = bs->cur->bc_ag.pag->pag_agno; + struct xfs_perag *pag = bs->cur->bc_ag.pag; xfs_agblock_t bno; xfs_extlen_t len; xfs_nlink_t refcount; @@ -354,8 +353,8 @@ xchk_refcountbt_rec( /* Check the extent. */ bno &= ~XFS_REFC_COW_START; if (bno + len <= bno || - !xfs_verify_agbno(mp, agno, bno) || - !xfs_verify_agbno(mp, agno, bno + len - 1)) + !xfs_verify_agbno(pag, bno) || + !xfs_verify_agbno(pag, bno + len - 1)) xchk_btree_set_corrupt(bs->sc, bs->cur, 0); if (refcount == 0) diff --git a/fs/xfs/scrub/rmap.c b/fs/xfs/scrub/rmap.c index 8dae0345c7df..229826b2e1c0 100644 --- a/fs/xfs/scrub/rmap.c +++ b/fs/xfs/scrub/rmap.c @@ -92,7 +92,7 @@ xchk_rmapbt_rec( { struct xfs_mount *mp = bs->cur->bc_mp; struct xfs_rmap_irec irec; - xfs_agnumber_t agno = bs->cur->bc_ag.pag->pag_agno; + struct xfs_perag *pag = bs->cur->bc_ag.pag; bool non_inode; bool is_unwritten; bool is_bmbt; @@ -121,8 +121,8 @@ xchk_rmapbt_rec( * Otherwise we must point somewhere past the static metadata * but before the end of the FS. Run the regular check. */ - if (!xfs_verify_agbno(mp, agno, irec.rm_startblock) || - !xfs_verify_agbno(mp, agno, irec.rm_startblock + + if (!xfs_verify_agbno(pag, irec.rm_startblock) || + !xfs_verify_agbno(pag, irec.rm_startblock + irec.rm_blockcount - 1)) xchk_btree_set_corrupt(bs->sc, bs->cur, 0); } diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 7be4d83d5884..5fe9af24dfcd 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -132,7 +132,7 @@ xfs_growfs_data_private( oagcount = mp->m_sb.sb_agcount; /* allocate the new per-ag structures */ if (nagcount > oagcount) { - error = xfs_initialize_perag(mp, nagcount, &nagimax); + error = xfs_initialize_perag(mp, nagcount, nb, &nagimax); if (error) return error; } else if (nagcount < oagcount) { diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 38aae3409c96..3e8c62c6c2b1 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -3313,7 +3313,8 @@ xlog_do_recover( /* re-initialise in-core superblock and geometry structures */ mp->m_features |= xfs_sb_version_to_features(sbp); xfs_reinit_percpu_counters(mp); - error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi); + error = xfs_initialize_perag(mp, sbp->sb_agcount, sbp->sb_dblocks, + &mp->m_maxagi); if (error) { xfs_warn(mp, "Failed post-recovery per-ag init: %d", error); return error; diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index daa8d29c46b4..f10c88cee116 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -778,7 +778,8 @@ xfs_mountfs( /* * Allocate and initialize the per-ag data. */ - error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi); + error = xfs_initialize_perag(mp, sbp->sb_agcount, mp->m_sb.sb_dblocks, + &mp->m_maxagi); if (error) { xfs_warn(mp, "Failed per-ag init: %d", error); goto out_free_dir; -- cgit v1.2.3 From 2d6ca8321c354e1cb6f6b1963c4f7bd053d2e272 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 7 Jul 2022 19:13:10 +1000 Subject: xfs: Pre-calculate per-AG agino geometry There is a lot of overhead in functions like xfs_verify_agino() that repeatedly calculate the geometry limits of an AG. These can be pre-calculated as they are static and the verification context has a per-ag context it can quickly reference. In the case of xfs_verify_agino(), we now always have a perag context handy, so we can store the minimum and maximum agino values in the AG in the perag. This means we don't have to calculate it on every call and it can be inlined in callers if we move it to xfs_ag.h. xfs_verify_agino_or_null() gets the same perag treatment. xfs_agino_range() is moved to xfs_ag.c as it's not really a type function, and it's use is largely restricted as the first and last aginos can be grabbed straight from the perag in most cases. Note that we leave the original xfs_verify_agino in place in xfs_types.c as a static function as other callers in that file do not have per-ag contexts so still need to go the long way. It's been renamed to xfs_verify_agno_agino() to indicate it takes both an agno and an agino to differentiate it from new function. $ size --totals fs/xfs/built-in.a text data bss dec hex filename before 1482185 329588 572 1812345 1ba779 (TOTALS) after 1481937 329588 572 1812097 1ba681 (TOTALS) Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_ag.c | 39 ++++++++++++++++++++++++++++++ fs/xfs/libxfs/xfs_ag.h | 30 +++++++++++++++++++++++ fs/xfs/libxfs/xfs_ialloc.c | 6 ++--- fs/xfs/libxfs/xfs_inode_buf.c | 5 ++-- fs/xfs/libxfs/xfs_types.c | 55 ++++--------------------------------------- fs/xfs/libxfs/xfs_types.h | 6 ----- fs/xfs/scrub/agheader.c | 6 ++--- fs/xfs/scrub/ialloc.c | 6 ++--- fs/xfs/scrub/repair.c | 9 +++---- fs/xfs/xfs_inode.c | 14 +++++------ 10 files changed, 95 insertions(+), 81 deletions(-) diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index 8f3e6ee85c34..d1a9163d4a48 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -225,6 +225,41 @@ xfs_ag_block_count( mp->m_sb.sb_dblocks); } +/* Calculate the first and last possible inode number in an AG. */ +static void +__xfs_agino_range( + struct xfs_mount *mp, + xfs_agblock_t eoag, + xfs_agino_t *first, + xfs_agino_t *last) +{ + xfs_agblock_t bno; + + /* + * Calculate the first inode, which will be in the first + * cluster-aligned block after the AGFL. + */ + bno = round_up(XFS_AGFL_BLOCK(mp) + 1, M_IGEO(mp)->cluster_align); + *first = XFS_AGB_TO_AGINO(mp, bno); + + /* + * Calculate the last inode, which will be at the end of the + * last (aligned) cluster that can be allocated in the AG. + */ + bno = round_down(eoag, M_IGEO(mp)->cluster_align); + *last = XFS_AGB_TO_AGINO(mp, bno) - 1; +} + +void +xfs_agino_range( + struct xfs_mount *mp, + xfs_agnumber_t agno, + xfs_agino_t *first, + xfs_agino_t *last) +{ + return __xfs_agino_range(mp, xfs_ag_block_count(mp, agno), first, last); +} + int xfs_initialize_perag( struct xfs_mount *mp, @@ -302,6 +337,8 @@ xfs_initialize_perag( pag->block_count = __xfs_ag_block_count(mp, index, agcount, dblocks); pag->min_block = XFS_AGFL_BLOCK(mp); + __xfs_agino_range(mp, pag->block_count, &pag->agino_min, + &pag->agino_max); } index = xfs_set_inode_alloc(mp, agcount); @@ -968,6 +1005,8 @@ xfs_ag_extend_space( /* Update perag geometry */ pag->block_count = be32_to_cpu(agf->agf_length); + __xfs_agino_range(pag->pag_mount, pag->block_count, &pag->agino_min, + &pag->agino_max); return 0; } diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h index 77640f1409fd..bb9e91bd38e2 100644 --- a/fs/xfs/libxfs/xfs_ag.h +++ b/fs/xfs/libxfs/xfs_ag.h @@ -70,6 +70,8 @@ struct xfs_perag { /* Precalculated geometry info */ xfs_agblock_t block_count; xfs_agblock_t min_block; + xfs_agino_t agino_min; + xfs_agino_t agino_max; #ifdef __KERNEL__ /* -- kernel only structures below this line -- */ @@ -124,6 +126,8 @@ void xfs_perag_put(struct xfs_perag *pag); * Per-ag geometry infomation and validation */ xfs_agblock_t xfs_ag_block_count(struct xfs_mount *mp, xfs_agnumber_t agno); +void xfs_agino_range(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agino_t *first, xfs_agino_t *last); static inline bool xfs_verify_agbno(struct xfs_perag *pag, xfs_agblock_t agbno) @@ -135,6 +139,32 @@ xfs_verify_agbno(struct xfs_perag *pag, xfs_agblock_t agbno) return true; } +/* + * Verify that an AG inode number pointer neither points outside the AG + * nor points at static metadata. + */ +static inline bool +xfs_verify_agino(struct xfs_perag *pag, xfs_agino_t agino) +{ + if (agino < pag->agino_min) + return false; + if (agino > pag->agino_max) + return false; + return true; +} + +/* + * Verify that an AG inode number pointer neither points outside the AG + * nor points at static metadata, or is NULLAGINO. + */ +static inline bool +xfs_verify_agino_or_null(struct xfs_perag *pag, xfs_agino_t agino) +{ + if (agino == NULLAGINO) + return true; + return xfs_verify_agino(pag, agino); +} + /* * Perag iteration APIs */ diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 55757b990ac6..39ad3b7af502 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -105,7 +105,6 @@ xfs_inobt_get_rec( int *stat) { struct xfs_mount *mp = cur->bc_mp; - xfs_agnumber_t agno = cur->bc_ag.pag->pag_agno; union xfs_btree_rec *rec; int error; uint64_t realfree; @@ -116,7 +115,7 @@ xfs_inobt_get_rec( xfs_inobt_btrec_to_irec(mp, rec, irec); - if (!xfs_verify_agino(mp, agno, irec->ir_startino)) + if (!xfs_verify_agino(cur->bc_ag.pag, irec->ir_startino)) goto out_bad_rec; if (irec->ir_count < XFS_INODES_PER_HOLEMASK_BIT || irec->ir_count > XFS_INODES_PER_CHUNK) @@ -137,7 +136,8 @@ xfs_inobt_get_rec( out_bad_rec: xfs_warn(mp, "%s Inode BTree record corruption in AG %d detected!", - cur->bc_btnum == XFS_BTNUM_INO ? "Used" : "Free", agno); + cur->bc_btnum == XFS_BTNUM_INO ? "Used" : "Free", + cur->bc_ag.pag->pag_agno); xfs_warn(mp, "start inode 0x%x, count 0x%x, free 0x%x freemask 0x%llx, holemask 0x%x", irec->ir_startino, irec->ir_count, irec->ir_freecount, diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 3b1b63f9d886..3a12bd3c7c97 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -10,6 +10,7 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" +#include "xfs_ag.h" #include "xfs_inode.h" #include "xfs_errortag.h" #include "xfs_error.h" @@ -41,14 +42,12 @@ xfs_inode_buf_verify( bool readahead) { struct xfs_mount *mp = bp->b_mount; - xfs_agnumber_t agno; int i; int ni; /* * Validate the magic number and version of every inode in the buffer */ - agno = xfs_daddr_to_agno(mp, xfs_buf_daddr(bp)); ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock; for (i = 0; i < ni; i++) { struct xfs_dinode *dip; @@ -59,7 +58,7 @@ xfs_inode_buf_verify( unlinked_ino = be32_to_cpu(dip->di_next_unlinked); di_ok = xfs_verify_magic16(bp, dip->di_magic) && xfs_dinode_good_version(mp, dip->di_version) && - xfs_verify_agino_or_null(mp, agno, unlinked_ino); + xfs_verify_agino_or_null(bp->b_pag, unlinked_ino); if (unlikely(XFS_TEST_ERROR(!di_ok, mp, XFS_ERRTAG_ITOBP_INOTOBP))) { if (readahead) { diff --git a/fs/xfs/libxfs/xfs_types.c b/fs/xfs/libxfs/xfs_types.c index b3c6b0274e95..5c2765934732 100644 --- a/fs/xfs/libxfs/xfs_types.c +++ b/fs/xfs/libxfs/xfs_types.c @@ -73,40 +73,12 @@ xfs_verify_fsbext( XFS_FSB_TO_AGNO(mp, fsbno + len - 1); } -/* Calculate the first and last possible inode number in an AG. */ -inline void -xfs_agino_range( - struct xfs_mount *mp, - xfs_agnumber_t agno, - xfs_agino_t *first, - xfs_agino_t *last) -{ - xfs_agblock_t bno; - xfs_agblock_t eoag; - - eoag = xfs_ag_block_count(mp, agno); - - /* - * Calculate the first inode, which will be in the first - * cluster-aligned block after the AGFL. - */ - bno = round_up(XFS_AGFL_BLOCK(mp) + 1, M_IGEO(mp)->cluster_align); - *first = XFS_AGB_TO_AGINO(mp, bno); - - /* - * Calculate the last inode, which will be at the end of the - * last (aligned) cluster that can be allocated in the AG. - */ - bno = round_down(eoag, M_IGEO(mp)->cluster_align); - *last = XFS_AGB_TO_AGINO(mp, bno) - 1; -} - /* * Verify that an AG inode number pointer neither points outside the AG * nor points at static metadata. */ -inline bool -xfs_verify_agino( +static inline bool +xfs_verify_agno_agino( struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t agino) @@ -118,19 +90,6 @@ xfs_verify_agino( return agino >= first && agino <= last; } -/* - * Verify that an AG inode number pointer neither points outside the AG - * nor points at static metadata, or is NULLAGINO. - */ -bool -xfs_verify_agino_or_null( - struct xfs_mount *mp, - xfs_agnumber_t agno, - xfs_agino_t agino) -{ - return agino == NULLAGINO || xfs_verify_agino(mp, agno, agino); -} - /* * Verify that an FS inode number pointer neither points outside the * filesystem nor points at static AG metadata. @@ -147,7 +106,7 @@ xfs_verify_ino( return false; if (XFS_AGINO_TO_INO(mp, agno, agino) != ino) return false; - return xfs_verify_agino(mp, agno, agino); + return xfs_verify_agno_agino(mp, agno, agino); } /* Is this an internal inode number? */ @@ -217,12 +176,8 @@ xfs_icount_range( /* root, rtbitmap, rtsum all live in the first chunk */ *min = XFS_INODES_PER_CHUNK; - for_each_perag(mp, agno, pag) { - xfs_agino_t first, last; - - xfs_agino_range(mp, agno, &first, &last); - nr_inos += last - first + 1; - } + for_each_perag(mp, agno, pag) + nr_inos += pag->agino_max - pag->agino_min + 1; *max = nr_inos; } diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h index ccf61afb959d..a6b7d98cf68f 100644 --- a/fs/xfs/libxfs/xfs_types.h +++ b/fs/xfs/libxfs/xfs_types.h @@ -183,12 +183,6 @@ bool xfs_verify_fsbno(struct xfs_mount *mp, xfs_fsblock_t fsbno); bool xfs_verify_fsbext(struct xfs_mount *mp, xfs_fsblock_t fsbno, xfs_fsblock_t len); -void xfs_agino_range(struct xfs_mount *mp, xfs_agnumber_t agno, - xfs_agino_t *first, xfs_agino_t *last); -bool xfs_verify_agino(struct xfs_mount *mp, xfs_agnumber_t agno, - xfs_agino_t agino); -bool xfs_verify_agino_or_null(struct xfs_mount *mp, xfs_agnumber_t agno, - xfs_agino_t agino); bool xfs_verify_ino(struct xfs_mount *mp, xfs_ino_t ino); bool xfs_internal_inum(struct xfs_mount *mp, xfs_ino_t ino); bool xfs_verify_dir_ino(struct xfs_mount *mp, xfs_ino_t ino); diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c index 181bba5f9b8f..b7b838bd4ba4 100644 --- a/fs/xfs/scrub/agheader.c +++ b/fs/xfs/scrub/agheader.c @@ -901,17 +901,17 @@ xchk_agi( /* Check inode pointers */ agino = be32_to_cpu(agi->agi_newino); - if (!xfs_verify_agino_or_null(mp, agno, agino)) + if (!xfs_verify_agino_or_null(pag, agino)) xchk_block_set_corrupt(sc, sc->sa.agi_bp); agino = be32_to_cpu(agi->agi_dirino); - if (!xfs_verify_agino_or_null(mp, agno, agino)) + if (!xfs_verify_agino_or_null(pag, agino)) xchk_block_set_corrupt(sc, sc->sa.agi_bp); /* Check unlinked inode buckets */ for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) { agino = be32_to_cpu(agi->agi_unlinked[i]); - if (!xfs_verify_agino_or_null(mp, agno, agino)) + if (!xfs_verify_agino_or_null(pag, agino)) xchk_block_set_corrupt(sc, sc->sa.agi_bp); } diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c index b80a54be8634..e1026e07bf94 100644 --- a/fs/xfs/scrub/ialloc.c +++ b/fs/xfs/scrub/ialloc.c @@ -421,10 +421,10 @@ xchk_iallocbt_rec( const union xfs_btree_rec *rec) { struct xfs_mount *mp = bs->cur->bc_mp; + struct xfs_perag *pag = bs->cur->bc_ag.pag; struct xchk_iallocbt *iabt = bs->private; struct xfs_inobt_rec_incore irec; uint64_t holes; - xfs_agnumber_t agno = bs->cur->bc_ag.pag->pag_agno; xfs_agino_t agino; xfs_extlen_t len; int holecount; @@ -446,8 +446,8 @@ xchk_iallocbt_rec( agino = irec.ir_startino; /* Record has to be properly aligned within the AG. */ - if (!xfs_verify_agino(mp, agno, agino) || - !xfs_verify_agino(mp, agno, agino + XFS_INODES_PER_CHUNK - 1)) { + if (!xfs_verify_agino(pag, agino) || + !xfs_verify_agino(pag, agino + XFS_INODES_PER_CHUNK - 1)) { xchk_btree_set_corrupt(bs->sc, bs->cur, 0); goto out; } diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index c983b76e070f..d51d82243fd3 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -220,16 +220,13 @@ xrep_calc_ag_resblks( usedlen = aglen - freelen; xfs_buf_relse(bp); } - xfs_perag_put(pag); /* If the icount is impossible, make some worst-case assumptions. */ if (icount == NULLAGINO || - !xfs_verify_agino(mp, sm->sm_agno, icount)) { - xfs_agino_t first, last; - - xfs_agino_range(mp, sm->sm_agno, &first, &last); - icount = last - first + 1; + !xfs_verify_agino(pag, icount)) { + icount = pag->agino_max - pag->agino_min + 1; } + xfs_perag_put(pag); /* If the block counts are impossible, make worst-case assumptions. */ if (aglen == NULLAGBLOCK || diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 29dfc997420f..482e1ee2d669 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2008,7 +2008,7 @@ xfs_iunlink_update_bucket( xfs_agino_t old_value; int offset; - ASSERT(xfs_verify_agino_or_null(tp->t_mountp, pag->pag_agno, new_agino)); + ASSERT(xfs_verify_agino_or_null(pag, new_agino)); old_value = be32_to_cpu(agi->agi_unlinked[bucket_index]); trace_xfs_iunlink_update_bucket(tp->t_mountp, pag->pag_agno, bucket_index, @@ -2045,7 +2045,7 @@ xfs_iunlink_update_dinode( struct xfs_mount *mp = tp->t_mountp; int offset; - ASSERT(xfs_verify_agino_or_null(mp, pag->pag_agno, next_agino)); + ASSERT(xfs_verify_agino_or_null(pag, next_agino)); trace_xfs_iunlink_update_dinode(mp, pag->pag_agno, agino, be32_to_cpu(dip->di_next_unlinked), next_agino); @@ -2075,7 +2075,7 @@ xfs_iunlink_update_inode( xfs_agino_t old_value; int error; - ASSERT(xfs_verify_agino_or_null(mp, pag->pag_agno, next_agino)); + ASSERT(xfs_verify_agino_or_null(pag, next_agino)); error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &ibp); if (error) @@ -2084,7 +2084,7 @@ xfs_iunlink_update_inode( /* Make sure the old pointer isn't garbage. */ old_value = be32_to_cpu(dip->di_next_unlinked); - if (!xfs_verify_agino_or_null(mp, pag->pag_agno, old_value)) { + if (!xfs_verify_agino_or_null(pag, old_value)) { xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, dip, sizeof(*dip), __this_address); error = -EFSCORRUPTED; @@ -2155,7 +2155,7 @@ xfs_iunlink( */ next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]); if (next_agino == agino || - !xfs_verify_agino_or_null(mp, pag->pag_agno, next_agino)) { + !xfs_verify_agino_or_null(pag, next_agino)) { xfs_buf_mark_corrupt(agibp); error = -EFSCORRUPTED; goto out; @@ -2291,7 +2291,7 @@ xfs_iunlink_map_prev( * Make sure this pointer is valid and isn't an obvious * infinite loop. */ - if (!xfs_verify_agino(mp, pag->pag_agno, unlinked_agino) || + if (!xfs_verify_agino(pag, unlinked_agino) || next_agino == unlinked_agino) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, @@ -2338,7 +2338,7 @@ xfs_iunlink_remove( * go on. Make sure the head pointer isn't garbage. */ head_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]); - if (!xfs_verify_agino(mp, pag->pag_agno, head_agino)) { + if (!xfs_verify_agino(pag, head_agino)) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agi, sizeof(*agi)); return -EFSCORRUPTED; -- cgit v1.2.3 From 3829c9a10fc7da40194ec9860df8c557c2b86ed8 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 7 Jul 2022 19:13:17 +1000 Subject: xfs: replace xfs_ag_block_count() with perag accesses Many of the places that call xfs_ag_block_count() have a perag available. These places can just read pag->block_count directly instead of calculating the AG block count from first principles. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_ialloc_btree.c | 10 +++++----- fs/xfs/scrub/agheader_repair.c | 6 ++---- fs/xfs/scrub/repair.c | 8 ++++---- 3 files changed, 11 insertions(+), 13 deletions(-) diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index aa4367a0a0de..2e0ff99d9f0b 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -683,10 +683,10 @@ xfs_inobt_rec_check_count( static xfs_extlen_t xfs_inobt_max_size( - struct xfs_mount *mp, - xfs_agnumber_t agno) + struct xfs_perag *pag) { - xfs_agblock_t agblocks = xfs_ag_block_count(mp, agno); + struct xfs_mount *mp = pag->pag_mount; + xfs_agblock_t agblocks = pag->block_count; /* Bail out if we're uninitialized, which can happen in mkfs. */ if (M_IGEO(mp)->inobt_mxr[0] == 0) @@ -698,7 +698,7 @@ xfs_inobt_max_size( * expansion. We therefore can pretend the space isn't there. */ if (mp->m_sb.sb_logstart && - XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart) == agno) + XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart) == pag->pag_agno) agblocks -= mp->m_sb.sb_logblocks; return xfs_btree_calc_size(M_IGEO(mp)->inobt_mnr, @@ -800,7 +800,7 @@ xfs_finobt_calc_reserves( if (error) return error; - *ask += xfs_inobt_max_size(mp, pag->pag_agno); + *ask += xfs_inobt_max_size(pag); *used += tree_len; return 0; } diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c index ba012a5da0bf..1b0b4e243f77 100644 --- a/fs/xfs/scrub/agheader_repair.c +++ b/fs/xfs/scrub/agheader_repair.c @@ -198,8 +198,7 @@ xrep_agf_init_header( agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC); agf->agf_versionnum = cpu_to_be32(XFS_AGF_VERSION); agf->agf_seqno = cpu_to_be32(sc->sa.pag->pag_agno); - agf->agf_length = cpu_to_be32(xfs_ag_block_count(mp, - sc->sa.pag->pag_agno)); + agf->agf_length = cpu_to_be32(sc->sa.pag->block_count); agf->agf_flfirst = old_agf->agf_flfirst; agf->agf_fllast = old_agf->agf_fllast; agf->agf_flcount = old_agf->agf_flcount; @@ -777,8 +776,7 @@ xrep_agi_init_header( agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC); agi->agi_versionnum = cpu_to_be32(XFS_AGI_VERSION); agi->agi_seqno = cpu_to_be32(sc->sa.pag->pag_agno); - agi->agi_length = cpu_to_be32(xfs_ag_block_count(mp, - sc->sa.pag->pag_agno)); + agi->agi_length = cpu_to_be32(sc->sa.pag->block_count); agi->agi_newino = cpu_to_be32(NULLAGINO); agi->agi_dirino = cpu_to_be32(NULLAGINO); if (xfs_has_crc(mp)) diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index d51d82243fd3..a02ec8fbc8ac 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -209,7 +209,7 @@ xrep_calc_ag_resblks( /* Now grab the block counters from the AGF. */ error = xfs_alloc_read_agf(pag, NULL, 0, &bp); if (error) { - aglen = xfs_ag_block_count(mp, sm->sm_agno); + aglen = pag->block_count; freelen = aglen; usedlen = aglen; } else { @@ -226,16 +226,16 @@ xrep_calc_ag_resblks( !xfs_verify_agino(pag, icount)) { icount = pag->agino_max - pag->agino_min + 1; } - xfs_perag_put(pag); /* If the block counts are impossible, make worst-case assumptions. */ if (aglen == NULLAGBLOCK || - aglen != xfs_ag_block_count(mp, sm->sm_agno) || + aglen != pag->block_count || freelen >= aglen) { - aglen = xfs_ag_block_count(mp, sm->sm_agno); + aglen = pag->block_count; freelen = aglen; usedlen = aglen; } + xfs_perag_put(pag); trace_xrep_calc_ag_resblks(mp, sm->sm_agno, icount, aglen, freelen, usedlen); -- cgit v1.2.3 From 36029dee382a20cf515494376ce9f0d5949944eb Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 7 Jul 2022 19:13:21 +1000 Subject: xfs: make is_log_ag() a first class helper We check if an ag contains the log in many places, so make this a first class XFS helper by lifting it to fs/xfs/libxfs/xfs_ag.h and renaming it xfs_ag_contains_log(). The convert all the places that check if the AG contains the log to use this helper. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_ag.c | 12 +++--------- fs/xfs/libxfs/xfs_ag.h | 7 +++++++ fs/xfs/libxfs/xfs_ialloc.c | 3 +-- fs/xfs/libxfs/xfs_ialloc_btree.c | 3 +-- fs/xfs/libxfs/xfs_refcount_btree.c | 3 +-- fs/xfs/libxfs/xfs_rmap_btree.c | 3 +-- fs/xfs/scrub/health.c | 2 ++ fs/xfs/scrub/refcount.c | 2 ++ 8 files changed, 18 insertions(+), 17 deletions(-) diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index d1a9163d4a48..71f5dae7ad6c 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -390,12 +390,6 @@ xfs_get_aghdr_buf( return 0; } -static inline bool is_log_ag(struct xfs_mount *mp, struct aghdr_init_data *id) -{ - return mp->m_sb.sb_logstart > 0 && - id->agno == XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart); -} - /* * Generic btree root block init function */ @@ -421,7 +415,7 @@ xfs_freesp_init_recs( arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1); arec->ar_startblock = cpu_to_be32(mp->m_ag_prealloc_blocks); - if (is_log_ag(mp, id)) { + if (xfs_ag_contains_log(mp, id->agno)) { struct xfs_alloc_rec *nrec; xfs_agblock_t start = XFS_FSB_TO_AGBNO(mp, mp->m_sb.sb_logstart); @@ -548,7 +542,7 @@ xfs_rmaproot_init( } /* account for the log space */ - if (is_log_ag(mp, id)) { + if (xfs_ag_contains_log(mp, id->agno)) { rrec = XFS_RMAP_REC_ADDR(block, be16_to_cpu(block->bb_numrecs) + 1); rrec->rm_startblock = cpu_to_be32( @@ -619,7 +613,7 @@ xfs_agfblock_init( agf->agf_refcount_blocks = cpu_to_be32(1); } - if (is_log_ag(mp, id)) { + if (xfs_ag_contains_log(mp, id->agno)) { int64_t logblocks = mp->m_sb.sb_logblocks; be32_add_cpu(&agf->agf_freeblks, -logblocks); diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h index bb9e91bd38e2..75f7c10c110a 100644 --- a/fs/xfs/libxfs/xfs_ag.h +++ b/fs/xfs/libxfs/xfs_ag.h @@ -165,6 +165,13 @@ xfs_verify_agino_or_null(struct xfs_perag *pag, xfs_agino_t agino) return xfs_verify_agino(pag, agino); } +static inline bool +xfs_ag_contains_log(struct xfs_mount *mp, xfs_agnumber_t agno) +{ + return mp->m_sb.sb_logstart > 0 && + agno == XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart); +} + /* * Perag iteration APIs */ diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 39ad3b7af502..6cdfd64bc56b 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -2897,8 +2897,7 @@ xfs_ialloc_calc_rootino( * allocation group, or very odd geometries created by old mkfs * versions on very small filesystems. */ - if (mp->m_sb.sb_logstart && - XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart) == 0) + if (xfs_ag_contains_log(mp, 0)) first_bno += mp->m_sb.sb_logblocks; /* diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index 2e0ff99d9f0b..8c83e265770c 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -697,8 +697,7 @@ xfs_inobt_max_size( * never be available for the kinds of things that would require btree * expansion. We therefore can pretend the space isn't there. */ - if (mp->m_sb.sb_logstart && - XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart) == pag->pag_agno) + if (xfs_ag_contains_log(mp, pag->pag_agno)) agblocks -= mp->m_sb.sb_logblocks; return xfs_btree_calc_size(M_IGEO(mp)->inobt_mnr, diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c index 1063234df34a..316c1ec0c3c2 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.c +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -507,8 +507,7 @@ xfs_refcountbt_calc_reserves( * never be available for the kinds of things that would require btree * expansion. We therefore can pretend the space isn't there. */ - if (mp->m_sb.sb_logstart && - XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart) == pag->pag_agno) + if (xfs_ag_contains_log(mp, pag->pag_agno)) agblocks -= mp->m_sb.sb_logblocks; *ask += xfs_refcountbt_max_size(mp, agblocks); diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index 1ae14d0c831c..7f83f62e51e0 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -666,8 +666,7 @@ xfs_rmapbt_calc_reserves( * never be available for the kinds of things that would require btree * expansion. We therefore can pretend the space isn't there. */ - if (mp->m_sb.sb_logstart && - XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart) == pag->pag_agno) + if (xfs_ag_contains_log(mp, pag->pag_agno)) agblocks -= mp->m_sb.sb_logblocks; /* Reserve 1% of the AG or enough for 1 block per record. */ diff --git a/fs/xfs/scrub/health.c b/fs/xfs/scrub/health.c index 2e61df3bca83..aa65ec88a0c0 100644 --- a/fs/xfs/scrub/health.c +++ b/fs/xfs/scrub/health.c @@ -8,6 +8,8 @@ #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_btree.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" #include "xfs_ag.h" #include "xfs_health.h" #include "scrub/scrub.h" diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c index 3f82a1a1f390..c68b767dc08f 100644 --- a/fs/xfs/scrub/refcount.c +++ b/fs/xfs/scrub/refcount.c @@ -13,6 +13,8 @@ #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/btree.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" #include "xfs_ag.h" /* -- cgit v1.2.3 From 85c73bf726e41be276bcad3325d9a8aef10be289 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 7 Jul 2022 22:05:18 +1000 Subject: xfs: rework xfs_buf_incore() API Make it consistent with the other buffer APIs to return a error and the buffer is placed in a parameter. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_attr_remote.c | 15 ++++++++++----- fs/xfs/scrub/repair.c | 15 +++++++++------ fs/xfs/xfs_buf.c | 19 ++----------------- fs/xfs/xfs_buf.h | 20 ++++++++++++++++---- fs/xfs/xfs_qm.c | 9 ++++----- 5 files changed, 41 insertions(+), 37 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c index 7298c148f848..d440393b40eb 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.c +++ b/fs/xfs/libxfs/xfs_attr_remote.c @@ -543,6 +543,7 @@ xfs_attr_rmtval_stale( { struct xfs_mount *mp = ip->i_mount; struct xfs_buf *bp; + int error; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); @@ -550,14 +551,18 @@ xfs_attr_rmtval_stale( XFS_IS_CORRUPT(mp, map->br_startblock == HOLESTARTBLOCK)) return -EFSCORRUPTED; - bp = xfs_buf_incore(mp->m_ddev_targp, + error = xfs_buf_incore(mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, map->br_startblock), - XFS_FSB_TO_BB(mp, map->br_blockcount), incore_flags); - if (bp) { - xfs_buf_stale(bp); - xfs_buf_relse(bp); + XFS_FSB_TO_BB(mp, map->br_blockcount), + incore_flags, &bp); + if (error) { + if (error == -ENOENT) + return 0; + return error; } + xfs_buf_stale(bp); + xfs_buf_relse(bp); return 0; } diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index 1e7b6b209ee8..5e7428782571 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -457,16 +457,19 @@ xrep_invalidate_blocks( * assume it's owned by someone else. */ for_each_xbitmap_block(fsbno, bmr, n, bitmap) { + int error; + /* Skip AG headers and post-EOFS blocks */ if (!xfs_verify_fsbno(sc->mp, fsbno)) continue; - bp = xfs_buf_incore(sc->mp->m_ddev_targp, + error = xfs_buf_incore(sc->mp->m_ddev_targp, XFS_FSB_TO_DADDR(sc->mp, fsbno), - XFS_FSB_TO_BB(sc->mp, 1), XBF_TRYLOCK); - if (bp) { - xfs_trans_bjoin(sc->tp, bp); - xfs_trans_binval(sc->tp, bp); - } + XFS_FSB_TO_BB(sc->mp, 1), XBF_TRYLOCK, &bp); + if (error) + continue; + + xfs_trans_bjoin(sc->tp, bp); + xfs_trans_binval(sc->tp, bp); } return 0; diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index bf4e60871068..143e1c70df5d 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -616,23 +616,6 @@ found: return 0; } -struct xfs_buf * -xfs_buf_incore( - struct xfs_buftarg *target, - xfs_daddr_t blkno, - size_t numblks, - xfs_buf_flags_t flags) -{ - struct xfs_buf *bp; - int error; - DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); - - error = xfs_buf_find(target, &map, 1, flags, NULL, &bp); - if (error) - return NULL; - return bp; -} - /* * Assembles a buffer covering the specified range. The code is optimised for * cache hits, as metadata intensive workloads will see 3 orders of magnitude @@ -656,6 +639,8 @@ xfs_buf_get_map( goto found; if (error != -ENOENT) return error; + if (flags & XBF_INCORE) + return -ENOENT; error = _xfs_buf_alloc(target, map, nmaps, flags, &new_bp); if (error) diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 1ee3056ff9cf..58e9034d51bd 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -42,9 +42,11 @@ struct xfs_buf; #define _XBF_DELWRI_Q (1u << 22)/* buffer on a delwri queue */ /* flags used only as arguments to access routines */ +#define XBF_INCORE (1u << 29)/* lookup only, return if found in cache */ #define XBF_TRYLOCK (1u << 30)/* lock requested, but do not wait */ #define XBF_UNMAPPED (1u << 31)/* do not map the buffer */ + typedef unsigned int xfs_buf_flags_t; #define XFS_BUF_FLAGS \ @@ -63,6 +65,7 @@ typedef unsigned int xfs_buf_flags_t; { _XBF_KMEM, "KMEM" }, \ { _XBF_DELWRI_Q, "DELWRI_Q" }, \ /* The following interface flags should never be set */ \ + { XBF_INCORE, "INCORE" }, \ { XBF_TRYLOCK, "TRYLOCK" }, \ { XBF_UNMAPPED, "UNMAPPED" } @@ -196,10 +199,6 @@ struct xfs_buf { }; /* Finding and Reading Buffers */ -struct xfs_buf *xfs_buf_incore(struct xfs_buftarg *target, - xfs_daddr_t blkno, size_t numblks, - xfs_buf_flags_t flags); - int xfs_buf_get_map(struct xfs_buftarg *target, struct xfs_buf_map *map, int nmaps, xfs_buf_flags_t flags, struct xfs_buf **bpp); int xfs_buf_read_map(struct xfs_buftarg *target, struct xfs_buf_map *map, @@ -209,6 +208,19 @@ void xfs_buf_readahead_map(struct xfs_buftarg *target, struct xfs_buf_map *map, int nmaps, const struct xfs_buf_ops *ops); +static inline int +xfs_buf_incore( + struct xfs_buftarg *target, + xfs_daddr_t blkno, + size_t numblks, + xfs_buf_flags_t flags, + struct xfs_buf **bpp) +{ + DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); + + return xfs_buf_get_map(target, &map, 1, XBF_INCORE | flags, bpp); +} + static inline int xfs_buf_get( struct xfs_buftarg *target, diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index abf08bbf34a9..3517a6be8dad 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -1229,12 +1229,11 @@ xfs_qm_flush_one( */ if (!xfs_dqflock_nowait(dqp)) { /* buf is pinned in-core by delwri list */ - bp = xfs_buf_incore(mp->m_ddev_targp, dqp->q_blkno, - mp->m_quotainfo->qi_dqchunklen, 0); - if (!bp) { - error = -EINVAL; + error = xfs_buf_incore(mp->m_ddev_targp, dqp->q_blkno, + mp->m_quotainfo->qi_dqchunklen, 0, &bp); + if (error) goto out_unlock; - } + xfs_buf_unlock(bp); xfs_buf_delwri_pushbuf(bp, buffer_list); -- cgit v1.2.3 From 70b589a37e1aba892c1e5d41957b0042f9eb031b Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Sat, 9 Jul 2022 10:56:02 -0700 Subject: xfs: add selinux labels to whiteout inodes We got a report that "renameat2() with flags=RENAME_WHITEOUT doesn't apply an SELinux label on xfs" as it does on other filesystems (for example, ext4 and tmpfs.) While I'm not quite sure how labels may interact w/ whiteout files, leaving them as unlabeled seems inconsistent at best. Now that xfs_init_security is not static, rename it to xfs_inode_init_security per dchinner's suggestion. Signed-off-by: Eric Sandeen Reviewed-by: Dave Chinner Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_inode.c | 14 +++++++++++++- fs/xfs/xfs_iops.c | 11 +++++------ fs/xfs/xfs_iops.h | 3 +++ 3 files changed, 21 insertions(+), 7 deletions(-) diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 482e1ee2d669..296e253bcfcd 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -3032,10 +3032,12 @@ out_trans_abort: static int xfs_rename_alloc_whiteout( struct user_namespace *mnt_userns, + struct xfs_name *src_name, struct xfs_inode *dp, struct xfs_inode **wip) { struct xfs_inode *tmpfile; + struct qstr name; int error; error = xfs_create_tmpfile(mnt_userns, dp, S_IFCHR | WHITEOUT_MODE, @@ -3043,6 +3045,15 @@ xfs_rename_alloc_whiteout( if (error) return error; + name.name = src_name->name; + name.len = src_name->len; + error = xfs_inode_init_security(VFS_I(tmpfile), VFS_I(dp), &name); + if (error) { + xfs_finish_inode_setup(tmpfile); + xfs_irele(tmpfile); + return error; + } + /* * Prepare the tmpfile inode as if it were created through the VFS. * Complete the inode setup and flag it as linkable. nlink is already @@ -3093,7 +3104,8 @@ xfs_rename( * appropriately. */ if (flags & RENAME_WHITEOUT) { - error = xfs_rename_alloc_whiteout(mnt_userns, target_dp, &wip); + error = xfs_rename_alloc_whiteout(mnt_userns, src_name, + target_dp, &wip); if (error) return error; diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 29f5b8b8aca6..6720b60f88cf 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -75,9 +75,8 @@ xfs_initxattrs( * these attrs can be journalled at inode creation time (along with the * inode, of course, such that log replay can't cause these to be lost). */ - -STATIC int -xfs_init_security( +int +xfs_inode_init_security( struct inode *inode, struct inode *dir, const struct qstr *qstr) @@ -122,7 +121,7 @@ xfs_cleanup_inode( /* Oh, the horror. * If we can't add the ACL or we fail in - * xfs_init_security we must back out. + * xfs_inode_init_security we must back out. * ENOSPC can hit here, among other things. */ xfs_dentry_to_name(&teardown, dentry); @@ -208,7 +207,7 @@ xfs_generic_create( inode = VFS_I(ip); - error = xfs_init_security(inode, dir, &dentry->d_name); + error = xfs_inode_init_security(inode, dir, &dentry->d_name); if (unlikely(error)) goto out_cleanup_inode; @@ -424,7 +423,7 @@ xfs_vn_symlink( inode = VFS_I(cip); - error = xfs_init_security(inode, dir, &dentry->d_name); + error = xfs_inode_init_security(inode, dir, &dentry->d_name); if (unlikely(error)) goto out_cleanup_inode; diff --git a/fs/xfs/xfs_iops.h b/fs/xfs/xfs_iops.h index 278949056048..cb5fc68c9ea0 100644 --- a/fs/xfs/xfs_iops.h +++ b/fs/xfs/xfs_iops.h @@ -17,4 +17,7 @@ extern void xfs_setattr_time(struct xfs_inode *ip, struct iattr *iattr); int xfs_vn_setattr_size(struct user_namespace *mnt_userns, struct dentry *dentry, struct iattr *vap); +int xfs_inode_init_security(struct inode *inode, struct inode *dir, + const struct qstr *qstr); + #endif /* __XFS_IOPS_H__ */ -- cgit v1.2.3 From 0f38063d7a38015a47ca1488406bf21e0effe80e Mon Sep 17 00:00:00 2001 From: Andrey Strachuk Date: Sat, 9 Jul 2022 10:56:02 -0700 Subject: xfs: removed useless condition in function xfs_attr_node_get At line 1561, variable "state" is being compared with NULL every loop iteration. ------------------------------------------------------------------- 1561 for (i = 0; state != NULL && i < state->path.active; i++) { 1562 xfs_trans_brelse(args->trans, state->path.blk[i].bp); 1563 state->path.blk[i].bp = NULL; 1564 } ------------------------------------------------------------------- However, it cannot be NULL. ---------------------------------------- 1546 state = xfs_da_state_alloc(args); ---------------------------------------- xfs_da_state_alloc calls kmem_cache_zalloc. kmem_cache_zalloc is called with __GFP_NOFAIL flag and, therefore, it cannot return NULL. -------------------------------------------------------------------------- struct xfs_da_state * xfs_da_state_alloc( struct xfs_da_args *args) { struct xfs_da_state *state; state = kmem_cache_zalloc(xfs_da_state_cache, GFP_NOFS | __GFP_NOFAIL); state->args = args; state->mp = args->dp->i_mount; return state; } -------------------------------------------------------------------------- Found by Linux Verification Center (linuxtesting.org) with SVACE. Signed-off-by: Andrey Strachuk Fixes: 4d0cdd2bb8f0 ("xfs: clean up xfs_attr_node_hasname") Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_attr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 224649a76cbb..6b8857e53add 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -1558,7 +1558,7 @@ xfs_attr_node_get( * If not in a transaction, we have to release all the buffers. */ out_release: - for (i = 0; state != NULL && i < state->path.active; i++) { + for (i = 0; i < state->path.active; i++) { xfs_trans_brelse(args->trans, state->path.blk[i].bp); state->path.blk[i].bp = NULL; } -- cgit v1.2.3 From 732436ef916b4f338d672ea56accfdb11e8d0732 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sat, 9 Jul 2022 10:56:05 -0700 Subject: xfs: convert XFS_IFORK_PTR to a static inline helper We're about to make this logic do a bit more, so convert the macro to a static inline function for better typechecking and fewer shouty macros. No functional changes here. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/libxfs/xfs_attr_leaf.c | 2 +- fs/xfs/libxfs/xfs_bmap.c | 68 +++++++++++++++++++------------------- fs/xfs/libxfs/xfs_bmap_btree.c | 8 ++--- fs/xfs/libxfs/xfs_btree.c | 4 +-- fs/xfs/libxfs/xfs_dir2_block.c | 2 +- fs/xfs/libxfs/xfs_dir2_sf.c | 2 +- fs/xfs/libxfs/xfs_inode_fork.c | 16 ++++----- fs/xfs/libxfs/xfs_inode_fork.h | 6 ---- fs/xfs/libxfs/xfs_symlink_remote.c | 2 +- fs/xfs/scrub/bmap.c | 14 ++++---- fs/xfs/scrub/dabtree.c | 2 +- fs/xfs/scrub/dir.c | 2 +- fs/xfs/scrub/quota.c | 2 +- fs/xfs/scrub/symlink.c | 2 +- fs/xfs/xfs_bmap_util.c | 4 +-- fs/xfs/xfs_dir2_readdir.c | 2 +- fs/xfs/xfs_icache.c | 2 +- fs/xfs/xfs_inode.c | 6 ++-- fs/xfs/xfs_inode.h | 18 ++++++++++ fs/xfs/xfs_ioctl.c | 2 +- fs/xfs/xfs_iomap.c | 4 +-- fs/xfs/xfs_qm.c | 2 +- fs/xfs/xfs_reflink.c | 6 ++-- 23 files changed, 95 insertions(+), 83 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 8f47396f8dd2..d3670877143f 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -1056,7 +1056,7 @@ xfs_attr_shortform_verify( int64_t size; ASSERT(ip->i_afp->if_format == XFS_DINODE_FMT_LOCAL); - ifp = XFS_IFORK_PTR(ip, XFS_ATTR_FORK); + ifp = xfs_ifork_ptr(ip, XFS_ATTR_FORK); sfp = (struct xfs_attr_shortform *)ifp->if_u1.if_data; size = ifp->if_bytes; diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 88828fcf0453..3569a67562ce 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -128,7 +128,7 @@ xfs_bmbt_lookup_first( */ static inline bool xfs_bmap_needs_btree(struct xfs_inode *ip, int whichfork) { - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); return whichfork != XFS_COW_FORK && ifp->if_format == XFS_DINODE_FMT_EXTENTS && @@ -140,7 +140,7 @@ static inline bool xfs_bmap_needs_btree(struct xfs_inode *ip, int whichfork) */ static inline bool xfs_bmap_wants_extents(struct xfs_inode *ip, int whichfork) { - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); return whichfork != XFS_COW_FORK && ifp->if_format == XFS_DINODE_FMT_BTREE && @@ -319,7 +319,7 @@ xfs_bmap_check_leaf_extents( int whichfork) /* data or attr fork */ { struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_btree_block *block; /* current btree block */ xfs_fsblock_t bno; /* block # of "block" */ struct xfs_buf *bp; /* buffer for "block" */ @@ -538,7 +538,7 @@ xfs_bmap_btree_to_extents( int *logflagsp, /* inode logging flags */ int whichfork) /* data or attr fork */ { - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_mount *mp = ip->i_mount; struct xfs_btree_block *rblock = ifp->if_broot; struct xfs_btree_block *cblock;/* child btree block */ @@ -616,7 +616,7 @@ xfs_bmap_extents_to_btree( mp = ip->i_mount; ASSERT(whichfork != XFS_COW_FORK); - ifp = XFS_IFORK_PTR(ip, whichfork); + ifp = xfs_ifork_ptr(ip, whichfork); ASSERT(ifp->if_format == XFS_DINODE_FMT_EXTENTS); /* @@ -745,7 +745,7 @@ xfs_bmap_local_to_extents_empty( struct xfs_inode *ip, int whichfork) { - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); ASSERT(whichfork != XFS_COW_FORK); ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL); @@ -785,7 +785,7 @@ xfs_bmap_local_to_extents( * So sending the data fork of a regular inode is invalid. */ ASSERT(!(S_ISREG(VFS_I(ip)->i_mode) && whichfork == XFS_DATA_FORK)); - ifp = XFS_IFORK_PTR(ip, whichfork); + ifp = xfs_ifork_ptr(ip, whichfork); ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL); if (!ifp->if_bytes) { @@ -1116,7 +1116,7 @@ xfs_iread_bmbt_block( xfs_extnum_t num_recs; xfs_extnum_t j; int whichfork = cur->bc_ino.whichfork; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); block = xfs_btree_get_block(cur, level, &bp); @@ -1164,7 +1164,7 @@ xfs_iread_extents( int whichfork) { struct xfs_iread_state ir; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_mount *mp = ip->i_mount; struct xfs_btree_cur *cur; int error; @@ -1208,7 +1208,7 @@ xfs_bmap_first_unused( xfs_fileoff_t *first_unused, /* unused block */ int whichfork) /* data or attr fork */ { - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_bmbt_irec got; struct xfs_iext_cursor icur; xfs_fileoff_t lastaddr = 0; @@ -1255,7 +1255,7 @@ xfs_bmap_last_before( xfs_fileoff_t *last_block, /* last block */ int whichfork) /* data or attr fork */ { - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_bmbt_irec got; struct xfs_iext_cursor icur; int error; @@ -1289,7 +1289,7 @@ xfs_bmap_last_extent( struct xfs_bmbt_irec *rec, int *is_empty) { - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_iext_cursor icur; int error; @@ -1355,7 +1355,7 @@ xfs_bmap_last_offset( xfs_fileoff_t *last_block, int whichfork) { - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_bmbt_irec rec; int is_empty; int error; @@ -1389,7 +1389,7 @@ xfs_bmap_add_extent_delay_real( int whichfork) { struct xfs_mount *mp = bma->ip->i_mount; - struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(bma->ip, whichfork); struct xfs_bmbt_irec *new = &bma->got; int error; /* error return value */ int i; /* temp state */ @@ -1955,7 +1955,7 @@ xfs_bmap_add_extent_unwritten_real( *logflagsp = 0; cur = *curp; - ifp = XFS_IFORK_PTR(ip, whichfork); + ifp = xfs_ifork_ptr(ip, whichfork); ASSERT(!isnullstartblock(new->br_startblock)); @@ -2480,7 +2480,7 @@ xfs_bmap_add_extent_hole_delay( uint32_t state = xfs_bmap_fork_to_state(whichfork); xfs_filblks_t temp; /* temp for indirect calculations */ - ifp = XFS_IFORK_PTR(ip, whichfork); + ifp = xfs_ifork_ptr(ip, whichfork); ASSERT(isnullstartblock(new->br_startblock)); /* @@ -2616,7 +2616,7 @@ xfs_bmap_add_extent_hole_real( int *logflagsp, uint32_t flags) { - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_mount *mp = ip->i_mount; struct xfs_btree_cur *cur = *curp; int error; /* error return value */ @@ -3867,7 +3867,7 @@ xfs_bmapi_read( { struct xfs_mount *mp = ip->i_mount; int whichfork = xfs_bmapi_whichfork(flags); - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_bmbt_irec got; xfs_fileoff_t obno; xfs_fileoff_t end; @@ -3960,7 +3960,7 @@ xfs_bmapi_reserve_delalloc( int eof) { struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); xfs_extlen_t alen; xfs_extlen_t indlen; int error; @@ -4087,7 +4087,7 @@ xfs_bmapi_allocate( { struct xfs_mount *mp = bma->ip->i_mount; int whichfork = xfs_bmapi_whichfork(bma->flags); - struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(bma->ip, whichfork); int tmp_logflags = 0; int error; @@ -4186,7 +4186,7 @@ xfs_bmapi_convert_unwritten( uint32_t flags) { int whichfork = xfs_bmapi_whichfork(flags); - struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(bma->ip, whichfork); int tmp_logflags = 0; int error; @@ -4263,7 +4263,7 @@ xfs_bmapi_minleft( struct xfs_inode *ip, int fork) { - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, fork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, fork); if (tp && tp->t_firstblock != NULLFSBLOCK) return 0; @@ -4284,7 +4284,7 @@ xfs_bmapi_finish( int whichfork, int error) { - struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(bma->ip, whichfork); if ((bma->logflags & xfs_ilog_fext(whichfork)) && ifp->if_format != XFS_DINODE_FMT_EXTENTS) @@ -4323,7 +4323,7 @@ xfs_bmapi_write( }; struct xfs_mount *mp = ip->i_mount; int whichfork = xfs_bmapi_whichfork(flags); - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); xfs_fileoff_t end; /* end of mapped file region */ bool eof = false; /* after the end of extents */ int error; /* error return */ @@ -4504,7 +4504,7 @@ xfs_bmapi_convert_delalloc( struct iomap *iomap, unsigned int *seq) { - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_mount *mp = ip->i_mount; xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); struct xfs_bmalloca bma = { NULL }; @@ -4641,7 +4641,7 @@ xfs_bmapi_remap( int whichfork = xfs_bmapi_whichfork(flags); int logflags = 0, error; - ifp = XFS_IFORK_PTR(ip, whichfork); + ifp = xfs_ifork_ptr(ip, whichfork); ASSERT(len > 0); ASSERT(len <= (xfs_filblks_t)XFS_MAX_BMBT_EXTLEN); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); @@ -4798,7 +4798,7 @@ xfs_bmap_del_extent_delay( struct xfs_bmbt_irec *del) { struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_bmbt_irec new; int64_t da_old, da_new, da_diff = 0; xfs_fileoff_t del_endoff, got_endoff; @@ -4925,7 +4925,7 @@ xfs_bmap_del_extent_cow( struct xfs_bmbt_irec *del) { struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_COW_FORK); struct xfs_bmbt_irec new; xfs_fileoff_t del_endoff, got_endoff; uint32_t state = BMAP_COWFORK; @@ -5023,7 +5023,7 @@ xfs_bmap_del_extent_real( mp = ip->i_mount; XFS_STATS_INC(mp, xs_del_exlist); - ifp = XFS_IFORK_PTR(ip, whichfork); + ifp = xfs_ifork_ptr(ip, whichfork); ASSERT(del->br_blockcount > 0); xfs_iext_get_extent(ifp, icur, &got); ASSERT(got.br_startoff <= del->br_startoff); @@ -5289,7 +5289,7 @@ __xfs_bunmapi( whichfork = xfs_bmapi_whichfork(flags); ASSERT(whichfork != XFS_COW_FORK); - ifp = XFS_IFORK_PTR(ip, whichfork); + ifp = xfs_ifork_ptr(ip, whichfork); if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp))) return -EFSCORRUPTED; if (xfs_is_shutdown(mp)) @@ -5630,7 +5630,7 @@ xfs_bmse_merge( struct xfs_btree_cur *cur, int *logflags) /* output */ { - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_bmbt_irec new; xfs_filblks_t blockcount; int error, i; @@ -5751,7 +5751,7 @@ xfs_bmap_collapse_extents( { int whichfork = XFS_DATA_FORK; struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_btree_cur *cur = NULL; struct xfs_bmbt_irec got, prev; struct xfs_iext_cursor icur; @@ -5866,7 +5866,7 @@ xfs_bmap_insert_extents( { int whichfork = XFS_DATA_FORK; struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_btree_cur *cur = NULL; struct xfs_bmbt_irec got, next; struct xfs_iext_cursor icur; @@ -5966,7 +5966,7 @@ xfs_bmap_split_extent( xfs_fileoff_t split_fsb) { int whichfork = XFS_DATA_FORK; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_btree_cur *cur = NULL; struct xfs_bmbt_irec got; struct xfs_bmbt_irec new; /* split extent */ diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index 2b77d45c215f..986ffca7f74d 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -304,7 +304,7 @@ xfs_bmbt_get_minrecs( if (level == cur->bc_nlevels - 1) { struct xfs_ifork *ifp; - ifp = XFS_IFORK_PTR(cur->bc_ino.ip, + ifp = xfs_ifork_ptr(cur->bc_ino.ip, cur->bc_ino.whichfork); return xfs_bmbt_maxrecs(cur->bc_mp, @@ -322,7 +322,7 @@ xfs_bmbt_get_maxrecs( if (level == cur->bc_nlevels - 1) { struct xfs_ifork *ifp; - ifp = XFS_IFORK_PTR(cur->bc_ino.ip, + ifp = xfs_ifork_ptr(cur->bc_ino.ip, cur->bc_ino.whichfork); return xfs_bmbt_maxrecs(cur->bc_mp, @@ -550,7 +550,7 @@ xfs_bmbt_init_cursor( struct xfs_inode *ip, /* inode owning the btree */ int whichfork) /* data or attr fork */ { - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_btree_cur *cur; ASSERT(whichfork != XFS_COW_FORK); @@ -664,7 +664,7 @@ xfs_bmbt_change_owner( ASSERT(tp || buffer_list); ASSERT(!(tp && buffer_list)); - ASSERT(XFS_IFORK_PTR(ip, whichfork)->if_format == XFS_DINODE_FMT_BTREE); + ASSERT(xfs_ifork_ptr(ip, whichfork)->if_format == XFS_DINODE_FMT_BTREE); cur = xfs_bmbt_init_cursor(ip->i_mount, tp, ip, whichfork); cur->bc_ino.flags |= XFS_BTCUR_BMBT_INVALID_OWNER; diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 06ab364d2de3..4c16c8c31fcb 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -722,7 +722,7 @@ xfs_btree_ifork_ptr( if (cur->bc_flags & XFS_BTREE_STAGING) return cur->bc_ino.ifake->if_fork; - return XFS_IFORK_PTR(cur->bc_ino.ip, cur->bc_ino.whichfork); + return xfs_ifork_ptr(cur->bc_ino.ip, cur->bc_ino.whichfork); } /* @@ -3556,7 +3556,7 @@ xfs_btree_kill_iroot( { int whichfork = cur->bc_ino.whichfork; struct xfs_inode *ip = cur->bc_ino.ip; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_btree_block *block; struct xfs_btree_block *cblock; union xfs_btree_key *kp; diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c index df0869bba275..f5e45aa28c91 100644 --- a/fs/xfs/libxfs/xfs_dir2_block.c +++ b/fs/xfs/libxfs/xfs_dir2_block.c @@ -1071,7 +1071,7 @@ xfs_dir2_sf_to_block( struct xfs_trans *tp = args->trans; struct xfs_inode *dp = args->dp; struct xfs_mount *mp = dp->i_mount; - struct xfs_ifork *ifp = XFS_IFORK_PTR(dp, XFS_DATA_FORK); + struct xfs_ifork *ifp = xfs_ifork_ptr(dp, XFS_DATA_FORK); struct xfs_da_geometry *geo = args->geo; xfs_dir2_db_t blkno; /* dir-relative block # (0) */ xfs_dir2_data_hdr_t *hdr; /* block header */ diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c index 5a97a87eaa20..a41e2624e115 100644 --- a/fs/xfs/libxfs/xfs_dir2_sf.c +++ b/fs/xfs/libxfs/xfs_dir2_sf.c @@ -710,7 +710,7 @@ xfs_dir2_sf_verify( struct xfs_inode *ip) { struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); struct xfs_dir2_sf_hdr *sfp; struct xfs_dir2_sf_entry *sfep; struct xfs_dir2_sf_entry *next_sfep; diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 1a4cdf550f6d..40016b8b00ee 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -35,7 +35,7 @@ xfs_init_local_fork( const void *data, int64_t size) { - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); int mem_size = size; bool zero_terminate; @@ -102,7 +102,7 @@ xfs_iformat_extents( int whichfork) { struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); int state = xfs_bmap_fork_to_state(whichfork); xfs_extnum_t nex = xfs_dfork_nextents(dip, whichfork); int size = nex * sizeof(xfs_bmbt_rec_t); @@ -173,7 +173,7 @@ xfs_iformat_btree( int size; int level; - ifp = XFS_IFORK_PTR(ip, whichfork); + ifp = xfs_ifork_ptr(ip, whichfork); dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork); size = XFS_BMAP_BROOT_SPACE(mp, dfp); nrecs = be16_to_cpu(dfp->bb_numrecs); @@ -370,7 +370,7 @@ xfs_iroot_realloc( return; } - ifp = XFS_IFORK_PTR(ip, whichfork); + ifp = xfs_ifork_ptr(ip, whichfork); if (rec_diff > 0) { /* * If there wasn't any memory allocated before, just @@ -480,7 +480,7 @@ xfs_idata_realloc( int64_t byte_diff, int whichfork) { - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); int64_t new_size = ifp->if_bytes + byte_diff; ASSERT(new_size >= 0); @@ -539,7 +539,7 @@ xfs_iextents_copy( int whichfork) { int state = xfs_bmap_fork_to_state(whichfork); - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_iext_cursor icur; struct xfs_bmbt_irec rec; int64_t copied = 0; @@ -591,7 +591,7 @@ xfs_iflush_fork( if (!iip) return; - ifp = XFS_IFORK_PTR(ip, whichfork); + ifp = xfs_ifork_ptr(ip, whichfork); /* * This can happen if we gave up in iformat in an error path, * for the attribute fork. @@ -731,7 +731,7 @@ xfs_iext_count_may_overflow( int whichfork, int nr_to_add) { - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); uint64_t max_exts; uint64_t nr_exts; diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index 4f68c1f20beb..50c5fae4e35d 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h @@ -81,12 +81,6 @@ struct xfs_ifork { #define XFS_IFORK_Q(ip) ((ip)->i_forkoff != 0) #define XFS_IFORK_BOFF(ip) ((int)((ip)->i_forkoff << 3)) -#define XFS_IFORK_PTR(ip,w) \ - ((w) == XFS_DATA_FORK ? \ - &(ip)->i_df : \ - ((w) == XFS_ATTR_FORK ? \ - (ip)->i_afp : \ - (ip)->i_cowfp)) #define XFS_IFORK_DSIZE(ip) \ (XFS_IFORK_Q(ip) ? XFS_IFORK_BOFF(ip) : XFS_LITINO((ip)->i_mount)) #define XFS_IFORK_ASIZE(ip) \ diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c index 8b9bd178a487..bdc777b9ec4a 100644 --- a/fs/xfs/libxfs/xfs_symlink_remote.c +++ b/fs/xfs/libxfs/xfs_symlink_remote.c @@ -204,7 +204,7 @@ xfs_failaddr_t xfs_symlink_shortform_verify( struct xfs_inode *ip) { - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); char *sfp = (char *)ifp->if_u1.if_data; int size = ifp->if_bytes; char *endp = sfp + size; diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index 9353fd060525..f0b9cb6506fd 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -377,7 +377,7 @@ xchk_bmapbt_rec( struct xfs_inode *ip = bs->cur->bc_ino.ip; struct xfs_buf *bp = NULL; struct xfs_btree_block *block; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, info->whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, info->whichfork); uint64_t owner; int i; @@ -426,7 +426,7 @@ xchk_bmap_btree( struct xchk_bmap_info *info) { struct xfs_owner_info oinfo; - struct xfs_ifork *ifp = XFS_IFORK_PTR(sc->ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(sc->ip, whichfork); struct xfs_mount *mp = sc->mp; struct xfs_inode *ip = sc->ip; struct xfs_btree_cur *cur; @@ -478,7 +478,7 @@ xchk_bmap_check_rmap( return 0; /* Now look up the bmbt record. */ - ifp = XFS_IFORK_PTR(sc->ip, sbcri->whichfork); + ifp = xfs_ifork_ptr(sc->ip, sbcri->whichfork); if (!ifp) { xchk_fblock_set_corrupt(sc, sbcri->whichfork, rec->rm_offset); @@ -563,7 +563,7 @@ xchk_bmap_check_rmaps( struct xfs_scrub *sc, int whichfork) { - struct xfs_ifork *ifp = XFS_IFORK_PTR(sc->ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(sc->ip, whichfork); struct xfs_perag *pag; xfs_agnumber_t agno; bool zero_size; @@ -578,7 +578,7 @@ xchk_bmap_check_rmaps( if (XFS_IS_REALTIME_INODE(sc->ip) && whichfork == XFS_DATA_FORK) return 0; - ASSERT(XFS_IFORK_PTR(sc->ip, whichfork) != NULL); + ASSERT(xfs_ifork_ptr(sc->ip, whichfork) != NULL); /* * Only do this for complex maps that are in btree format, or for @@ -624,7 +624,7 @@ xchk_bmap( struct xchk_bmap_info info = { NULL }; struct xfs_mount *mp = sc->mp; struct xfs_inode *ip = sc->ip; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); xfs_fileoff_t endoff; struct xfs_iext_cursor icur; int error = 0; @@ -689,7 +689,7 @@ xchk_bmap( /* Scrub extent records. */ info.lastoff = 0; - ifp = XFS_IFORK_PTR(ip, whichfork); + ifp = xfs_ifork_ptr(ip, whichfork); for_each_xfs_iext(ifp, &icur, &irec) { if (xchk_should_terminate(sc, &error) || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) diff --git a/fs/xfs/scrub/dabtree.c b/fs/xfs/scrub/dabtree.c index b962cfbbd92b..84fe3d33d699 100644 --- a/fs/xfs/scrub/dabtree.c +++ b/fs/xfs/scrub/dabtree.c @@ -482,7 +482,7 @@ xchk_da_btree( int error; /* Skip short format data structures; no btree to scan. */ - if (!xfs_ifork_has_extents(XFS_IFORK_PTR(sc->ip, whichfork))) + if (!xfs_ifork_has_extents(xfs_ifork_ptr(sc->ip, whichfork))) return 0; /* Set up initial da state. */ diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c index 38897adde7b5..5abb5fdb71d9 100644 --- a/fs/xfs/scrub/dir.c +++ b/fs/xfs/scrub/dir.c @@ -667,7 +667,7 @@ xchk_directory_blocks( { struct xfs_bmbt_irec got; struct xfs_da_args args; - struct xfs_ifork *ifp = XFS_IFORK_PTR(sc->ip, XFS_DATA_FORK); + struct xfs_ifork *ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK); struct xfs_mount *mp = sc->mp; xfs_fileoff_t leaf_lblk; xfs_fileoff_t free_lblk; diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c index 3c7506c7553c..21b4c9006859 100644 --- a/fs/xfs/scrub/quota.c +++ b/fs/xfs/scrub/quota.c @@ -185,7 +185,7 @@ xchk_quota_data_fork( /* Check for data fork problems that apply only to quota files. */ max_dqid_off = ((xfs_dqid_t)-1) / qi->qi_dqperchunk; - ifp = XFS_IFORK_PTR(sc->ip, XFS_DATA_FORK); + ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK); for_each_xfs_iext(ifp, &icur, &irec) { if (xchk_should_terminate(sc, &error)) break; diff --git a/fs/xfs/scrub/symlink.c b/fs/xfs/scrub/symlink.c index 599ee277bba2..0215f014e164 100644 --- a/fs/xfs/scrub/symlink.c +++ b/fs/xfs/scrub/symlink.c @@ -41,7 +41,7 @@ xchk_symlink( if (!S_ISLNK(VFS_I(ip)->i_mode)) return -ENOENT; - ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); len = ip->i_disk_size; /* Plausible size? */ diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 85e1a26c92e8..1ffd5a780182 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -256,7 +256,7 @@ xfs_bmap_count_blocks( xfs_filblks_t *count) { struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_btree_cur *cur; xfs_extlen_t btblocks = 0; int error; @@ -439,7 +439,7 @@ xfs_getbmap( whichfork = XFS_COW_FORK; else whichfork = XFS_DATA_FORK; - ifp = XFS_IFORK_PTR(ip, whichfork); + ifp = xfs_ifork_ptr(ip, whichfork); xfs_ilock(ip, XFS_IOLOCK_SHARED); switch (whichfork) { diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c index a7174a5b3203..e295fc8062d8 100644 --- a/fs/xfs/xfs_dir2_readdir.c +++ b/fs/xfs/xfs_dir2_readdir.c @@ -248,7 +248,7 @@ xfs_dir2_leaf_readbuf( struct xfs_inode *dp = args->dp; struct xfs_buf *bp = NULL; struct xfs_da_geometry *geo = args->geo; - struct xfs_ifork *ifp = XFS_IFORK_PTR(dp, XFS_DATA_FORK); + struct xfs_ifork *ifp = xfs_ifork_ptr(dp, XFS_DATA_FORK); struct xfs_bmbt_irec map; struct blk_plug plug; xfs_dir2_off_t new_off; diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 2609825d53ee..0d74d8eaff91 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1774,7 +1774,7 @@ xfs_check_delalloc( struct xfs_inode *ip, int whichfork) { - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_bmbt_irec got; struct xfs_iext_cursor icur; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 296e253bcfcd..3aeabaf98990 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1293,8 +1293,8 @@ xfs_itruncate_clear_reflink_flags( if (!xfs_is_reflink_inode(ip)) return; - dfork = XFS_IFORK_PTR(ip, XFS_DATA_FORK); - cfork = XFS_IFORK_PTR(ip, XFS_COW_FORK); + dfork = xfs_ifork_ptr(ip, XFS_DATA_FORK); + cfork = xfs_ifork_ptr(ip, XFS_COW_FORK); if (dfork->if_bytes == 0 && cfork->if_bytes == 0) ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK; if (cfork->if_bytes == 0) @@ -1643,7 +1643,7 @@ xfs_inode_needs_inactive( struct xfs_inode *ip) { struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *cow_ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); + struct xfs_ifork *cow_ifp = xfs_ifork_ptr(ip, XFS_COW_FORK); /* * If the inode is already free, then there can be nothing diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 7be6f8e705ab..c22b01859051 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -77,6 +77,24 @@ typedef struct xfs_inode { struct list_head i_ioend_list; } xfs_inode_t; +static inline struct xfs_ifork * +xfs_ifork_ptr( + struct xfs_inode *ip, + int whichfork) +{ + switch (whichfork) { + case XFS_DATA_FORK: + return &ip->i_df; + case XFS_ATTR_FORK: + return ip->i_afp; + case XFS_COW_FORK: + return ip->i_cowfp; + default: + ASSERT(0); + return NULL; + } +} + /* Convert from vfs inode to xfs inode */ static inline struct xfs_inode *XFS_I(struct inode *inode) { diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index f21581cd8a70..1f783e979629 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -991,7 +991,7 @@ xfs_fill_fsxattr( struct fileattr *fa) { struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); fileattr_fill_xflags(fa, xfs_ip2xflags(ip)); diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 5a393259a3a3..ccd97ccd74bf 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -159,7 +159,7 @@ xfs_iomap_eof_align_last_fsb( struct xfs_inode *ip, xfs_fileoff_t end_fsb) { - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); xfs_extlen_t extsz = xfs_get_extsz_hint(ip); xfs_extlen_t align = xfs_eof_alignment(ip); struct xfs_bmbt_irec irec; @@ -370,7 +370,7 @@ xfs_iomap_prealloc_size( struct xfs_iext_cursor ncur = *icur; struct xfs_bmbt_irec prev, got; struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); int64_t freesp; xfs_fsblock_t qblocks; diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index abf08bbf34a9..60eb2d4df144 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -1154,7 +1154,7 @@ xfs_qm_dqusage_adjust( ASSERT(ip->i_delayed_blks == 0); if (XFS_IS_REALTIME_INODE(ip)) { - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); error = xfs_iread_extents(tp, ip, XFS_DATA_FORK); if (error) diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 81994f4706de..724806c7ce3e 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -453,7 +453,7 @@ xfs_reflink_cancel_cow_blocks( xfs_fileoff_t end_fsb, bool cancel_real) { - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_COW_FORK); struct xfs_bmbt_irec got, del; struct xfs_iext_cursor icur; int error = 0; @@ -594,7 +594,7 @@ xfs_reflink_end_cow_extent( struct xfs_bmbt_irec got, del, data; struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_COW_FORK); unsigned int resblks; int nmaps; int error; @@ -1425,7 +1425,7 @@ xfs_reflink_inode_has_shared_extents( bool found; int error; - ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); error = xfs_iread_extents(tp, ip, XFS_DATA_FORK); if (error) return error; -- cgit v1.2.3 From 2ed5b09b3e8fc274ae8fecd6ab7c5106a364bed1 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sat, 9 Jul 2022 10:56:06 -0700 Subject: xfs: make inode attribute forks a permanent part of struct xfs_inode Syzkaller reported a UAF bug a while back: ================================================================== BUG: KASAN: use-after-free in xfs_ilock_attr_map_shared+0xe3/0xf6 fs/xfs/xfs_inode.c:127 Read of size 4 at addr ffff88802cec919c by task syz-executor262/2958 CPU: 2 PID: 2958 Comm: syz-executor262 Not tainted 5.15.0-0.30.3-20220406_1406 #3 Hardware name: Red Hat KVM, BIOS 1.13.0-2.module+el8.3.0+7860+a7792d29 04/01/2014 Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0x82/0xa9 lib/dump_stack.c:106 print_address_description.constprop.9+0x21/0x2d5 mm/kasan/report.c:256 __kasan_report mm/kasan/report.c:442 [inline] kasan_report.cold.14+0x7f/0x11b mm/kasan/report.c:459 xfs_ilock_attr_map_shared+0xe3/0xf6 fs/xfs/xfs_inode.c:127 xfs_attr_get+0x378/0x4c2 fs/xfs/libxfs/xfs_attr.c:159 xfs_xattr_get+0xe3/0x150 fs/xfs/xfs_xattr.c:36 __vfs_getxattr+0xdf/0x13d fs/xattr.c:399 cap_inode_need_killpriv+0x41/0x5d security/commoncap.c:300 security_inode_need_killpriv+0x4c/0x97 security/security.c:1408 dentry_needs_remove_privs.part.28+0x21/0x63 fs/inode.c:1912 dentry_needs_remove_privs+0x80/0x9e fs/inode.c:1908 do_truncate+0xc3/0x1e0 fs/open.c:56 handle_truncate fs/namei.c:3084 [inline] do_open fs/namei.c:3432 [inline] path_openat+0x30ab/0x396d fs/namei.c:3561 do_filp_open+0x1c4/0x290 fs/namei.c:3588 do_sys_openat2+0x60d/0x98c fs/open.c:1212 do_sys_open+0xcf/0x13c fs/open.c:1228 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x3a/0x7e arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0x0 RIP: 0033:0x7f7ef4bb753d Code: 00 c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 1b 79 2c 00 f7 d8 64 89 01 48 RSP: 002b:00007f7ef52c2ed8 EFLAGS: 00000246 ORIG_RAX: 0000000000000055 RAX: ffffffffffffffda RBX: 0000000000404148 RCX: 00007f7ef4bb753d RDX: 00007f7ef4bb753d RSI: 0000000000000000 RDI: 0000000020004fc0 RBP: 0000000000404140 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 0030656c69662f2e R13: 00007ffd794db37f R14: 00007ffd794db470 R15: 00007f7ef52c2fc0 Allocated by task 2953: kasan_save_stack+0x19/0x38 mm/kasan/common.c:38 kasan_set_track mm/kasan/common.c:46 [inline] set_alloc_info mm/kasan/common.c:434 [inline] __kasan_slab_alloc+0x68/0x7c mm/kasan/common.c:467 kasan_slab_alloc include/linux/kasan.h:254 [inline] slab_post_alloc_hook mm/slab.h:519 [inline] slab_alloc_node mm/slub.c:3213 [inline] slab_alloc mm/slub.c:3221 [inline] kmem_cache_alloc+0x11b/0x3eb mm/slub.c:3226 kmem_cache_zalloc include/linux/slab.h:711 [inline] xfs_ifork_alloc+0x25/0xa2 fs/xfs/libxfs/xfs_inode_fork.c:287 xfs_bmap_add_attrfork+0x3f2/0x9b1 fs/xfs/libxfs/xfs_bmap.c:1098 xfs_attr_set+0xe38/0x12a7 fs/xfs/libxfs/xfs_attr.c:746 xfs_xattr_set+0xeb/0x1a9 fs/xfs/xfs_xattr.c:59 __vfs_setxattr+0x11b/0x177 fs/xattr.c:180 __vfs_setxattr_noperm+0x128/0x5e0 fs/xattr.c:214 __vfs_setxattr_locked+0x1d4/0x258 fs/xattr.c:275 vfs_setxattr+0x154/0x33d fs/xattr.c:301 setxattr+0x216/0x29f fs/xattr.c:575 __do_sys_fsetxattr fs/xattr.c:632 [inline] __se_sys_fsetxattr fs/xattr.c:621 [inline] __x64_sys_fsetxattr+0x243/0x2fe fs/xattr.c:621 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x3a/0x7e arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0x0 Freed by task 2949: kasan_save_stack+0x19/0x38 mm/kasan/common.c:38 kasan_set_track+0x1c/0x21 mm/kasan/common.c:46 kasan_set_free_info+0x20/0x30 mm/kasan/generic.c:360 ____kasan_slab_free mm/kasan/common.c:366 [inline] ____kasan_slab_free mm/kasan/common.c:328 [inline] __kasan_slab_free+0xe2/0x10e mm/kasan/common.c:374 kasan_slab_free include/linux/kasan.h:230 [inline] slab_free_hook mm/slub.c:1700 [inline] slab_free_freelist_hook mm/slub.c:1726 [inline] slab_free mm/slub.c:3492 [inline] kmem_cache_free+0xdc/0x3ce mm/slub.c:3508 xfs_attr_fork_remove+0x8d/0x132 fs/xfs/libxfs/xfs_attr_leaf.c:773 xfs_attr_sf_removename+0x5dd/0x6cb fs/xfs/libxfs/xfs_attr_leaf.c:822 xfs_attr_remove_iter+0x68c/0x805 fs/xfs/libxfs/xfs_attr.c:1413 xfs_attr_remove_args+0xb1/0x10d fs/xfs/libxfs/xfs_attr.c:684 xfs_attr_set+0xf1e/0x12a7 fs/xfs/libxfs/xfs_attr.c:802 xfs_xattr_set+0xeb/0x1a9 fs/xfs/xfs_xattr.c:59 __vfs_removexattr+0x106/0x16a fs/xattr.c:468 cap_inode_killpriv+0x24/0x47 security/commoncap.c:324 security_inode_killpriv+0x54/0xa1 security/security.c:1414 setattr_prepare+0x1a6/0x897 fs/attr.c:146 xfs_vn_change_ok+0x111/0x15e fs/xfs/xfs_iops.c:682 xfs_vn_setattr_size+0x5f/0x15a fs/xfs/xfs_iops.c:1065 xfs_vn_setattr+0x125/0x2ad fs/xfs/xfs_iops.c:1093 notify_change+0xae5/0x10a1 fs/attr.c:410 do_truncate+0x134/0x1e0 fs/open.c:64 handle_truncate fs/namei.c:3084 [inline] do_open fs/namei.c:3432 [inline] path_openat+0x30ab/0x396d fs/namei.c:3561 do_filp_open+0x1c4/0x290 fs/namei.c:3588 do_sys_openat2+0x60d/0x98c fs/open.c:1212 do_sys_open+0xcf/0x13c fs/open.c:1228 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x3a/0x7e arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0x0 The buggy address belongs to the object at ffff88802cec9188 which belongs to the cache xfs_ifork of size 40 The buggy address is located 20 bytes inside of 40-byte region [ffff88802cec9188, ffff88802cec91b0) The buggy address belongs to the page: page:00000000c3af36a1 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x2cec9 flags: 0xfffffc0000200(slab|node=0|zone=1|lastcpupid=0x1fffff) raw: 000fffffc0000200 ffffea00009d2580 0000000600000006 ffff88801a9ffc80 raw: 0000000000000000 0000000080490049 00000001ffffffff 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff88802cec9080: fb fb fb fc fc fa fb fb fb fb fc fc fb fb fb fb ffff88802cec9100: fb fc fc fb fb fb fb fb fc fc fb fb fb fb fb fc >ffff88802cec9180: fc fa fb fb fb fb fc fc fa fb fb fb fb fc fc fb ^ ffff88802cec9200: fb fb fb fb fc fc fb fb fb fb fb fc fc fb fb fb ffff88802cec9280: fb fb fc fc fa fb fb fb fb fc fc fa fb fb fb fb ================================================================== The root cause of this bug is the unlocked access to xfs_inode.i_afp from the getxattr code paths while trying to determine which ILOCK mode to use to stabilize the xattr data. Unfortunately, the VFS does not acquire i_rwsem when vfs_getxattr (or listxattr) call into the filesystem, which means that getxattr can race with a removexattr that's tearing down the attr fork and crash: xfs_attr_set: xfs_attr_get: xfs_attr_fork_remove: xfs_ilock_attr_map_shared: xfs_idestroy_fork(ip->i_afp); kmem_cache_free(xfs_ifork_cache, ip->i_afp); if (ip->i_afp && ip->i_afp = NULL; xfs_need_iread_extents(ip->i_afp)) ip->i_forkoff = 0; Regrettably, the VFS is much more lax about i_rwsem and getxattr than is immediately obvious -- not only does it not guarantee that we hold i_rwsem, it actually doesn't guarantee that we *don't* hold it either. The getxattr system call won't acquire the lock before calling XFS, but the file capabilities code calls getxattr with and without i_rwsem held to determine if the "security.capabilities" xattr is set on the file. Fixing the VFS locking requires a treewide investigation into every code path that could touch an xattr and what i_rwsem state it expects or sets up. That could take years or even prove impossible; fortunately, we can fix this UAF problem inside XFS. An earlier version of this patch used smp_wmb in xfs_attr_fork_remove to ensure that i_forkoff is always zeroed before i_afp is set to null and changed the read paths to use smp_rmb before accessing i_forkoff and i_afp, which avoided these UAF problems. However, the patch author was too busy dealing with other problems in the meantime, and by the time he came back to this issue, the situation had changed a bit. On a modern system with selinux, each inode will always have at least one xattr for the selinux label, so it doesn't make much sense to keep incurring the extra pointer dereference. Furthermore, Allison's upcoming parent pointer patchset will also cause nearly every inode in the filesystem to have extended attributes. Therefore, make the inode attribute fork structure part of struct xfs_inode, at a cost of 40 more bytes. This patch adds a clunky if_present field where necessary to maintain the existing logic of xattr fork null pointer testing in the existing codebase. The next patch switches the logic over to XFS_IFORK_Q and it all goes away. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/libxfs/xfs_attr.c | 16 +++++++------- fs/xfs/libxfs/xfs_attr.h | 10 ++++----- fs/xfs/libxfs/xfs_attr_leaf.c | 25 ++++++++++----------- fs/xfs/libxfs/xfs_bmap.c | 4 ++-- fs/xfs/libxfs/xfs_inode_buf.c | 8 +++---- fs/xfs/libxfs/xfs_inode_fork.c | 44 +++++++++++++++++++++++-------------- fs/xfs/libxfs/xfs_inode_fork.h | 6 +++-- fs/xfs/xfs_attr_inactive.c | 9 ++++---- fs/xfs/xfs_attr_list.c | 10 ++++----- fs/xfs/xfs_bmap_util.c | 8 +++---- fs/xfs/xfs_icache.c | 10 +++++---- fs/xfs/xfs_inode.c | 13 ++++++----- fs/xfs/xfs_inode.h | 6 +++-- fs/xfs/xfs_inode_item.c | 50 +++++++++++++++++++++--------------------- fs/xfs/xfs_iomap.c | 4 ++-- fs/xfs/xfs_itable.c | 2 +- 16 files changed, 121 insertions(+), 104 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 6b8857e53add..b661dc7600c8 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -69,10 +69,10 @@ xfs_inode_hasattr( { if (!XFS_IFORK_Q(ip)) return 0; - if (!ip->i_afp) + if (!ip->i_af.if_present) return 0; - if (ip->i_afp->if_format == XFS_DINODE_FMT_EXTENTS && - ip->i_afp->if_nextents == 0) + if (ip->i_af.if_format == XFS_DINODE_FMT_EXTENTS && + ip->i_af.if_nextents == 0) return 0; return 1; } @@ -85,7 +85,7 @@ bool xfs_attr_is_leaf( struct xfs_inode *ip) { - struct xfs_ifork *ifp = ip->i_afp; + struct xfs_ifork *ifp = &ip->i_af; struct xfs_iext_cursor icur; struct xfs_bmbt_irec imap; @@ -231,7 +231,7 @@ xfs_attr_get_ilocked( if (!xfs_inode_hasattr(args->dp)) return -ENOATTR; - if (args->dp->i_afp->if_format == XFS_DINODE_FMT_LOCAL) + if (args->dp->i_af.if_format == XFS_DINODE_FMT_LOCAL) return xfs_attr_shortform_getvalue(args); if (xfs_attr_is_leaf(args->dp)) return xfs_attr_leaf_get(args); @@ -354,7 +354,7 @@ xfs_attr_try_sf_addname( /* * Build initial attribute list (if required). */ - if (dp->i_afp->if_format == XFS_DINODE_FMT_EXTENTS) + if (dp->i_af.if_format == XFS_DINODE_FMT_EXTENTS) xfs_attr_shortform_create(args); error = xfs_attr_shortform_addname(args); @@ -864,7 +864,7 @@ xfs_attr_lookup( if (!xfs_inode_hasattr(dp)) return -ENOATTR; - if (dp->i_afp->if_format == XFS_DINODE_FMT_LOCAL) + if (dp->i_af.if_format == XFS_DINODE_FMT_LOCAL) return xfs_attr_sf_findname(args, NULL, NULL); if (xfs_attr_is_leaf(dp)) { @@ -1101,7 +1101,7 @@ static inline int xfs_attr_sf_totsize(struct xfs_inode *dp) { struct xfs_attr_shortform *sf; - sf = (struct xfs_attr_shortform *)dp->i_afp->if_u1.if_data; + sf = (struct xfs_attr_shortform *)dp->i_af.if_u1.if_data; return be16_to_cpu(sf->hdr.totsize); } diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h index dfb47fa63c6d..0f100db504ee 100644 --- a/fs/xfs/libxfs/xfs_attr.h +++ b/fs/xfs/libxfs/xfs_attr.h @@ -560,9 +560,9 @@ static inline bool xfs_attr_is_shortform( struct xfs_inode *ip) { - return ip->i_afp->if_format == XFS_DINODE_FMT_LOCAL || - (ip->i_afp->if_format == XFS_DINODE_FMT_EXTENTS && - ip->i_afp->if_nextents == 0); + return ip->i_af.if_format == XFS_DINODE_FMT_LOCAL || + (ip->i_af.if_format == XFS_DINODE_FMT_EXTENTS && + ip->i_af.if_nextents == 0); } static inline enum xfs_delattr_state @@ -573,10 +573,10 @@ xfs_attr_init_add_state(struct xfs_da_args *args) * next state, the attribute fork may be null. This can occur only occur * on a pure remove, but we grab the next state before we check if a * replace operation is being performed. If we are called from any other - * context, i_afp is guaranteed to exist. Hence if the attr fork is + * context, i_af is guaranteed to exist. Hence if the attr fork is * null, we were called from a pure remove operation and so we are done. */ - if (!args->dp->i_afp) + if (!args->dp->i_af.if_present) return XFS_DAS_DONE; args->op_flags |= XFS_DA_OP_ADDNAME; diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index d3670877143f..435c6cc485bf 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -682,7 +682,7 @@ xfs_attr_shortform_create( struct xfs_da_args *args) { struct xfs_inode *dp = args->dp; - struct xfs_ifork *ifp = dp->i_afp; + struct xfs_ifork *ifp = &dp->i_af; struct xfs_attr_sf_hdr *hdr; trace_xfs_attr_sf_create(args); @@ -719,7 +719,7 @@ xfs_attr_sf_findname( int end; int i; - sf = (struct xfs_attr_shortform *)args->dp->i_afp->if_u1.if_data; + sf = (struct xfs_attr_shortform *)args->dp->i_af.if_u1.if_data; sfe = &sf->list[0]; end = sf->hdr.count; for (i = 0; i < end; sfe = xfs_attr_sf_nextentry(sfe), @@ -764,7 +764,7 @@ xfs_attr_shortform_add( mp = dp->i_mount; dp->i_forkoff = forkoff; - ifp = dp->i_afp; + ifp = &dp->i_af; ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL); sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data; if (xfs_attr_sf_findname(args, &sfe, NULL) == -EEXIST) @@ -797,11 +797,10 @@ xfs_attr_fork_remove( struct xfs_inode *ip, struct xfs_trans *tp) { - ASSERT(ip->i_afp->if_nextents == 0); + ASSERT(ip->i_af.if_nextents == 0); - xfs_idestroy_fork(ip->i_afp); - kmem_cache_free(xfs_ifork_cache, ip->i_afp); - ip->i_afp = NULL; + xfs_idestroy_fork(&ip->i_af); + xfs_ifork_zap_attr(ip); ip->i_forkoff = 0; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); } @@ -825,7 +824,7 @@ xfs_attr_sf_removename( dp = args->dp; mp = dp->i_mount; - sf = (struct xfs_attr_shortform *)dp->i_afp->if_u1.if_data; + sf = (struct xfs_attr_shortform *)dp->i_af.if_u1.if_data; error = xfs_attr_sf_findname(args, &sfe, &base); @@ -889,7 +888,7 @@ xfs_attr_shortform_lookup(xfs_da_args_t *args) trace_xfs_attr_sf_lookup(args); - ifp = args->dp->i_afp; + ifp = &args->dp->i_af; ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL); sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data; sfe = &sf->list[0]; @@ -917,8 +916,8 @@ xfs_attr_shortform_getvalue( struct xfs_attr_sf_entry *sfe; int i; - ASSERT(args->dp->i_afp->if_format == XFS_DINODE_FMT_LOCAL); - sf = (struct xfs_attr_shortform *)args->dp->i_afp->if_u1.if_data; + ASSERT(args->dp->i_af.if_format == XFS_DINODE_FMT_LOCAL); + sf = (struct xfs_attr_shortform *)args->dp->i_af.if_u1.if_data; sfe = &sf->list[0]; for (i = 0; i < sf->hdr.count; sfe = xfs_attr_sf_nextentry(sfe), i++) { @@ -948,7 +947,7 @@ xfs_attr_shortform_to_leaf( trace_xfs_attr_sf_to_leaf(args); dp = args->dp; - ifp = dp->i_afp; + ifp = &dp->i_af; sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data; size = be16_to_cpu(sf->hdr.totsize); tmpbuffer = kmem_alloc(size, 0); @@ -1055,7 +1054,7 @@ xfs_attr_shortform_verify( int i; int64_t size; - ASSERT(ip->i_afp->if_format == XFS_DINODE_FMT_LOCAL); + ASSERT(ip->i_af.if_format == XFS_DINODE_FMT_LOCAL); ifp = xfs_ifork_ptr(ip, XFS_ATTR_FORK); sfp = (struct xfs_attr_shortform *)ifp->if_u1.if_data; size = ifp->if_bytes; diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 3569a67562ce..32dc74a04cc8 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -1041,9 +1041,9 @@ xfs_bmap_add_attrfork( error = xfs_bmap_set_attrforkoff(ip, size, &version); if (error) goto trans_cancel; - ASSERT(ip->i_afp == NULL); + ASSERT(!ip->i_af.if_present); - ip->i_afp = xfs_ifork_alloc(XFS_DINODE_FMT_EXTENTS, 0); + xfs_ifork_init_attr(ip, XFS_DINODE_FMT_EXTENTS, 0); logflags = 0; switch (ip->i_df.if_format) { case XFS_DINODE_FMT_LOCAL: diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 3a12bd3c7c97..dd11df5d3bf8 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -177,7 +177,7 @@ xfs_inode_from_disk( xfs_failaddr_t fa; ASSERT(ip->i_cowfp == NULL); - ASSERT(ip->i_afp == NULL); + ASSERT(!ip->i_af.if_present); fa = xfs_dinode_verify(ip->i_mount, ip->i_ino, from); if (fa) { @@ -285,7 +285,7 @@ xfs_inode_to_disk_iext_counters( { if (xfs_inode_has_large_extent_counts(ip)) { to->di_big_nextents = cpu_to_be64(xfs_ifork_nextents(&ip->i_df)); - to->di_big_anextents = cpu_to_be32(xfs_ifork_nextents(ip->i_afp)); + to->di_big_anextents = cpu_to_be32(xfs_ifork_nextents(&ip->i_af)); /* * We might be upgrading the inode to use larger extent counters * than was previously used. Hence zero the unused field. @@ -293,7 +293,7 @@ xfs_inode_to_disk_iext_counters( to->di_nrext64_pad = cpu_to_be16(0); } else { to->di_nextents = cpu_to_be32(xfs_ifork_nextents(&ip->i_df)); - to->di_anextents = cpu_to_be16(xfs_ifork_nextents(ip->i_afp)); + to->di_anextents = cpu_to_be16(xfs_ifork_nextents(&ip->i_af)); } } @@ -325,7 +325,7 @@ xfs_inode_to_disk( to->di_nblocks = cpu_to_be64(ip->i_nblocks); to->di_extsize = cpu_to_be32(ip->i_extsize); to->di_forkoff = ip->i_forkoff; - to->di_aformat = xfs_ifork_format(ip->i_afp); + to->di_aformat = xfs_ifork_format(&ip->i_af); to->di_flags = cpu_to_be16(ip->i_diflags); if (xfs_has_v3inodes(ip->i_mount)) { diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 40016b8b00ee..9d5141c21eee 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -276,17 +276,30 @@ xfs_dfork_attr_shortform_size( return be16_to_cpu(atp->hdr.totsize); } -struct xfs_ifork * -xfs_ifork_alloc( +void +xfs_ifork_init_attr( + struct xfs_inode *ip, enum xfs_dinode_fmt format, xfs_extnum_t nextents) { - struct xfs_ifork *ifp; + ASSERT(!ip->i_af.if_present); - ifp = kmem_cache_zalloc(xfs_ifork_cache, GFP_NOFS | __GFP_NOFAIL); - ifp->if_format = format; - ifp->if_nextents = nextents; - return ifp; + ip->i_af.if_present = 1; + ip->i_af.if_format = format; + ip->i_af.if_nextents = nextents; +} + +void +xfs_ifork_zap_attr( + struct xfs_inode *ip) +{ + ASSERT(ip->i_af.if_present); + ASSERT(ip->i_af.if_broot == NULL); + ASSERT(ip->i_af.if_u1.if_data == NULL); + ASSERT(ip->i_af.if_height == 0); + + memset(&ip->i_af, 0, sizeof(struct xfs_ifork)); + ip->i_af.if_format = XFS_DINODE_FMT_EXTENTS; } int @@ -301,9 +314,9 @@ xfs_iformat_attr_fork( * Initialize the extent count early, as the per-format routines may * depend on it. */ - ip->i_afp = xfs_ifork_alloc(dip->di_aformat, naextents); + xfs_ifork_init_attr(ip, dip->di_aformat, naextents); - switch (ip->i_afp->if_format) { + switch (ip->i_af.if_format) { case XFS_DINODE_FMT_LOCAL: error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, xfs_dfork_attr_shortform_size(dip)); @@ -323,10 +336,8 @@ xfs_iformat_attr_fork( break; } - if (error) { - kmem_cache_free(xfs_ifork_cache, ip->i_afp); - ip->i_afp = NULL; - } + if (error) + xfs_ifork_zap_attr(ip); return error; } @@ -656,7 +667,7 @@ xfs_iext_state_to_fork( if (state & BMAP_COWFORK) return ip->i_cowfp; else if (state & BMAP_ATTRFORK) - return ip->i_afp; + return &ip->i_af; return &ip->i_df; } @@ -672,6 +683,7 @@ xfs_ifork_init_cow( ip->i_cowfp = kmem_cache_zalloc(xfs_ifork_cache, GFP_NOFS | __GFP_NOFAIL); + ip->i_cowfp->if_present = 1; ip->i_cowfp->if_format = XFS_DINODE_FMT_EXTENTS; } @@ -707,10 +719,10 @@ int xfs_ifork_verify_local_attr( struct xfs_inode *ip) { - struct xfs_ifork *ifp = ip->i_afp; + struct xfs_ifork *ifp = &ip->i_af; xfs_failaddr_t fa; - if (!ifp) + if (!ifp->if_present) fa = __this_address; else fa = xfs_attr_shortform_verify(ip); diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index 50c5fae4e35d..63015e9cee14 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h @@ -24,6 +24,7 @@ struct xfs_ifork { xfs_extnum_t if_nextents; /* # of extents in this fork */ short if_broot_bytes; /* bytes allocated for root */ int8_t if_format; /* format of this fork */ + int8_t if_present; /* 1 if present */ }; /* @@ -173,8 +174,9 @@ xfs_dfork_nextents( return 0; } -struct xfs_ifork *xfs_ifork_alloc(enum xfs_dinode_fmt format, - xfs_extnum_t nextents); +void xfs_ifork_zap_attr(struct xfs_inode *ip); +void xfs_ifork_init_attr(struct xfs_inode *ip, enum xfs_dinode_fmt format, + xfs_extnum_t nextents); struct xfs_ifork *xfs_iext_state_to_fork(struct xfs_inode *ip, int state); int xfs_iformat_data_fork(struct xfs_inode *, struct xfs_dinode *); diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c index 27265771f247..dbe715fe92ce 100644 --- a/fs/xfs/xfs_attr_inactive.c +++ b/fs/xfs/xfs_attr_inactive.c @@ -367,7 +367,7 @@ xfs_attr_inactive( * removal below. */ if (xfs_inode_hasattr(dp) && - dp->i_afp->if_format != XFS_DINODE_FMT_LOCAL) { + dp->i_af.if_format != XFS_DINODE_FMT_LOCAL) { error = xfs_attr3_root_inactive(&trans, dp); if (error) goto out_cancel; @@ -388,10 +388,9 @@ out_cancel: xfs_trans_cancel(trans); out_destroy_fork: /* kill the in-core attr fork before we drop the inode lock */ - if (dp->i_afp) { - xfs_idestroy_fork(dp->i_afp); - kmem_cache_free(xfs_ifork_cache, dp->i_afp); - dp->i_afp = NULL; + if (dp->i_af.if_present) { + xfs_idestroy_fork(&dp->i_af); + xfs_ifork_zap_attr(dp); } if (lock_mode) xfs_iunlock(dp, lock_mode); diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c index 90a14e85e76d..6a832ee07916 100644 --- a/fs/xfs/xfs_attr_list.c +++ b/fs/xfs/xfs_attr_list.c @@ -61,8 +61,8 @@ xfs_attr_shortform_list( int sbsize, nsbuf, count, i; int error = 0; - ASSERT(dp->i_afp != NULL); - sf = (struct xfs_attr_shortform *)dp->i_afp->if_u1.if_data; + ASSERT(dp->i_af.if_present); + sf = (struct xfs_attr_shortform *)dp->i_af.if_u1.if_data; ASSERT(sf != NULL); if (!sf->hdr.count) return 0; @@ -80,7 +80,7 @@ xfs_attr_shortform_list( */ if (context->bufsize == 0 || (XFS_ISRESET_CURSOR(cursor) && - (dp->i_afp->if_bytes + sf->hdr.count * 16) < context->bufsize)) { + (dp->i_af.if_bytes + sf->hdr.count * 16) < context->bufsize)) { for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) { if (XFS_IS_CORRUPT(context->dp->i_mount, !xfs_attr_namecheck(sfe->nameval, @@ -121,7 +121,7 @@ xfs_attr_shortform_list( for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) { if (unlikely( ((char *)sfe < (char *)sf) || - ((char *)sfe >= ((char *)sf + dp->i_afp->if_bytes)))) { + ((char *)sfe >= ((char *)sf + dp->i_af.if_bytes)))) { XFS_CORRUPTION_ERROR("xfs_attr_shortform_list", XFS_ERRLEVEL_LOW, context->dp->i_mount, sfe, @@ -513,7 +513,7 @@ xfs_attr_list_ilocked( */ if (!xfs_inode_hasattr(dp)) return 0; - if (dp->i_afp->if_format == XFS_DINODE_FMT_LOCAL) + if (dp->i_af.if_format == XFS_DINODE_FMT_LOCAL) return xfs_attr_shortform_list(context); if (xfs_attr_is_leaf(dp)) return xfs_attr_leaf_list(context); diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 1ffd5a780182..f317586d2f63 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1506,15 +1506,15 @@ xfs_swap_extent_forks( /* * Count the number of extended attribute blocks */ - if (XFS_IFORK_Q(ip) && ip->i_afp->if_nextents > 0 && - ip->i_afp->if_format != XFS_DINODE_FMT_LOCAL) { + if (XFS_IFORK_Q(ip) && ip->i_af.if_nextents > 0 && + ip->i_af.if_format != XFS_DINODE_FMT_LOCAL) { error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, &junk, &aforkblks); if (error) return error; } - if (XFS_IFORK_Q(tip) && tip->i_afp->if_nextents > 0 && - tip->i_afp->if_format != XFS_DINODE_FMT_LOCAL) { + if (XFS_IFORK_Q(tip) && tip->i_af.if_nextents > 0 && + tip->i_af.if_format != XFS_DINODE_FMT_LOCAL) { error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK, &junk, &taforkblks); if (error) diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 0d74d8eaff91..e08dedfb7f9d 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -98,9 +98,11 @@ xfs_inode_alloc( ip->i_ino = ino; ip->i_mount = mp; memset(&ip->i_imap, 0, sizeof(struct xfs_imap)); - ip->i_afp = NULL; ip->i_cowfp = NULL; + memset(&ip->i_af, 0, sizeof(ip->i_af)); + ip->i_af.if_format = XFS_DINODE_FMT_EXTENTS; memset(&ip->i_df, 0, sizeof(ip->i_df)); + ip->i_df.if_present = 1; ip->i_flags = 0; ip->i_delayed_blks = 0; ip->i_diflags2 = mp->m_ino_geo.new_diflags2; @@ -130,9 +132,9 @@ xfs_inode_free_callback( break; } - if (ip->i_afp) { - xfs_idestroy_fork(ip->i_afp); - kmem_cache_free(xfs_ifork_cache, ip->i_afp); + if (ip->i_af.if_present) { + xfs_idestroy_fork(&ip->i_af); + xfs_ifork_zap_attr(ip); } if (ip->i_cowfp) { xfs_idestroy_fork(ip->i_cowfp); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 3aeabaf98990..f5e553168b42 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -125,7 +125,7 @@ xfs_ilock_attr_map_shared( { uint lock_mode = XFS_ILOCK_SHARED; - if (ip->i_afp && xfs_need_iread_extents(ip->i_afp)) + if (ip->i_af.if_present && xfs_need_iread_extents(&ip->i_af)) lock_mode = XFS_ILOCK_EXCL; xfs_ilock(ip, lock_mode); return lock_mode; @@ -893,7 +893,7 @@ xfs_init_new_inode( */ if (init_xattrs && xfs_has_attr(mp)) { ip->i_forkoff = xfs_default_attroffset(ip) >> 3; - ip->i_afp = xfs_ifork_alloc(XFS_DINODE_FMT_EXTENTS, 0); + xfs_ifork_init_attr(ip, XFS_DINODE_FMT_EXTENTS, 0); } /* @@ -1768,7 +1768,7 @@ xfs_inactive( goto out; } - ASSERT(!ip->i_afp); + ASSERT(!ip->i_af.if_present); ASSERT(ip->i_forkoff == 0); /* @@ -3466,13 +3466,13 @@ xfs_iflush( goto flush_out; } } - if (XFS_TEST_ERROR(ip->i_df.if_nextents + xfs_ifork_nextents(ip->i_afp) > + if (XFS_TEST_ERROR(ip->i_df.if_nextents + xfs_ifork_nextents(&ip->i_af) > ip->i_nblocks, mp, XFS_ERRTAG_IFLUSH_5)) { xfs_alert_tag(mp, XFS_PTAG_IFLUSH, "%s: detected corrupt incore inode %llu, " "total extents = %llu nblocks = %lld, ptr "PTR_FMT, __func__, ip->i_ino, - ip->i_df.if_nextents + xfs_ifork_nextents(ip->i_afp), + ip->i_df.if_nextents + xfs_ifork_nextents(&ip->i_af), ip->i_nblocks, ip); goto flush_out; } @@ -3502,7 +3502,8 @@ xfs_iflush( if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL && xfs_ifork_verify_local_data(ip)) goto flush_out; - if (ip->i_afp && ip->i_afp->if_format == XFS_DINODE_FMT_LOCAL && + if (ip->i_af.if_present && + ip->i_af.if_format == XFS_DINODE_FMT_LOCAL && xfs_ifork_verify_local_attr(ip)) goto flush_out; diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index c22b01859051..045bad62ac25 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -33,9 +33,9 @@ typedef struct xfs_inode { struct xfs_imap i_imap; /* location for xfs_imap() */ /* Extent information. */ - struct xfs_ifork *i_afp; /* attribute fork pointer */ struct xfs_ifork *i_cowfp; /* copy on write extents */ struct xfs_ifork i_df; /* data fork */ + struct xfs_ifork i_af; /* attribute fork */ /* Transaction and locking information. */ struct xfs_inode_log_item *i_itemp; /* logging information */ @@ -86,7 +86,9 @@ xfs_ifork_ptr( case XFS_DATA_FORK: return &ip->i_df; case XFS_ATTR_FORK: - return ip->i_afp; + if (!ip->i_af.if_present) + return NULL; + return &ip->i_af; case XFS_COW_FORK: return ip->i_cowfp; default: diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 721def0639fd..77519e6f0b34 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -92,11 +92,11 @@ xfs_inode_item_attr_fork_size( { struct xfs_inode *ip = iip->ili_inode; - switch (ip->i_afp->if_format) { + switch (ip->i_af.if_format) { case XFS_DINODE_FMT_EXTENTS: if ((iip->ili_fields & XFS_ILOG_AEXT) && - ip->i_afp->if_nextents > 0 && - ip->i_afp->if_bytes > 0) { + ip->i_af.if_nextents > 0 && + ip->i_af.if_bytes > 0) { /* worst case, doesn't subtract unused space */ *nbytes += XFS_IFORK_ASIZE(ip); *nvecs += 1; @@ -104,15 +104,15 @@ xfs_inode_item_attr_fork_size( break; case XFS_DINODE_FMT_BTREE: if ((iip->ili_fields & XFS_ILOG_ABROOT) && - ip->i_afp->if_broot_bytes > 0) { - *nbytes += ip->i_afp->if_broot_bytes; + ip->i_af.if_broot_bytes > 0) { + *nbytes += ip->i_af.if_broot_bytes; *nvecs += 1; } break; case XFS_DINODE_FMT_LOCAL: if ((iip->ili_fields & XFS_ILOG_ADATA) && - ip->i_afp->if_bytes > 0) { - *nbytes += xlog_calc_iovec_len(ip->i_afp->if_bytes); + ip->i_af.if_bytes > 0) { + *nbytes += xlog_calc_iovec_len(ip->i_af.if_bytes); *nvecs += 1; } break; @@ -237,18 +237,18 @@ xfs_inode_item_format_attr_fork( struct xfs_inode *ip = iip->ili_inode; size_t data_bytes; - switch (ip->i_afp->if_format) { + switch (ip->i_af.if_format) { case XFS_DINODE_FMT_EXTENTS: iip->ili_fields &= ~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT); if ((iip->ili_fields & XFS_ILOG_AEXT) && - ip->i_afp->if_nextents > 0 && - ip->i_afp->if_bytes > 0) { + ip->i_af.if_nextents > 0 && + ip->i_af.if_bytes > 0) { struct xfs_bmbt_rec *p; - ASSERT(xfs_iext_count(ip->i_afp) == - ip->i_afp->if_nextents); + ASSERT(xfs_iext_count(&ip->i_af) == + ip->i_af.if_nextents); p = xlog_prepare_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_EXT); data_bytes = xfs_iextents_copy(ip, p, XFS_ATTR_FORK); @@ -265,13 +265,13 @@ xfs_inode_item_format_attr_fork( ~(XFS_ILOG_ADATA | XFS_ILOG_AEXT); if ((iip->ili_fields & XFS_ILOG_ABROOT) && - ip->i_afp->if_broot_bytes > 0) { - ASSERT(ip->i_afp->if_broot != NULL); + ip->i_af.if_broot_bytes > 0) { + ASSERT(ip->i_af.if_broot != NULL); xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_BROOT, - ip->i_afp->if_broot, - ip->i_afp->if_broot_bytes); - ilf->ilf_asize = ip->i_afp->if_broot_bytes; + ip->i_af.if_broot, + ip->i_af.if_broot_bytes); + ilf->ilf_asize = ip->i_af.if_broot_bytes; ilf->ilf_size++; } else { iip->ili_fields &= ~XFS_ILOG_ABROOT; @@ -282,12 +282,12 @@ xfs_inode_item_format_attr_fork( ~(XFS_ILOG_AEXT | XFS_ILOG_ABROOT); if ((iip->ili_fields & XFS_ILOG_ADATA) && - ip->i_afp->if_bytes > 0) { - ASSERT(ip->i_afp->if_u1.if_data != NULL); + ip->i_af.if_bytes > 0) { + ASSERT(ip->i_af.if_u1.if_data != NULL); xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_LOCAL, - ip->i_afp->if_u1.if_data, - ip->i_afp->if_bytes); - ilf->ilf_asize = (unsigned)ip->i_afp->if_bytes; + ip->i_af.if_u1.if_data, + ip->i_af.if_bytes); + ilf->ilf_asize = (unsigned)ip->i_af.if_bytes; ilf->ilf_size++; } else { iip->ili_fields &= ~XFS_ILOG_ADATA; @@ -355,11 +355,11 @@ xfs_inode_to_log_dinode_iext_counters( { if (xfs_inode_has_large_extent_counts(ip)) { to->di_big_nextents = xfs_ifork_nextents(&ip->i_df); - to->di_big_anextents = xfs_ifork_nextents(ip->i_afp); + to->di_big_anextents = xfs_ifork_nextents(&ip->i_af); to->di_nrext64_pad = 0; } else { to->di_nextents = xfs_ifork_nextents(&ip->i_df); - to->di_anextents = xfs_ifork_nextents(ip->i_afp); + to->di_anextents = xfs_ifork_nextents(&ip->i_af); } } @@ -390,7 +390,7 @@ xfs_inode_to_log_dinode( to->di_nblocks = ip->i_nblocks; to->di_extsize = ip->i_extsize; to->di_forkoff = ip->i_forkoff; - to->di_aformat = xfs_ifork_format(ip->i_afp); + to->di_aformat = xfs_ifork_format(&ip->i_af); to->di_flags = ip->i_diflags; xfs_copy_dm_fields_to_log_dinode(ip, to); diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index ccd97ccd74bf..eb54fd259d15 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -1307,12 +1307,12 @@ xfs_xattr_iomap_begin( lockmode = xfs_ilock_attr_map_shared(ip); /* if there are no attribute fork or extents, return ENOENT */ - if (!XFS_IFORK_Q(ip) || !ip->i_afp->if_nextents) { + if (!XFS_IFORK_Q(ip) || !ip->i_af.if_nextents) { error = -ENOENT; goto out_unlock; } - ASSERT(ip->i_afp->if_format != XFS_DINODE_FMT_LOCAL); + ASSERT(ip->i_af.if_format != XFS_DINODE_FMT_LOCAL); error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap, &nimaps, XFS_BMAPI_ATTRFORK); out_unlock: diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index f74c9fff72bb..7d1ff282953f 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -111,7 +111,7 @@ xfs_bulkstat_one_int( buf->bs_extents64 = nextents; xfs_bulkstat_health(ip, buf); - buf->bs_aextents = xfs_ifork_nextents(ip->i_afp); + buf->bs_aextents = xfs_ifork_nextents(&ip->i_af); buf->bs_forkoff = XFS_IFORK_BOFF(ip); buf->bs_version = XFS_BULKSTAT_VERSION_V5; -- cgit v1.2.3 From e45d7cb2356e6b59fe64da28324025cc6fcd3fbd Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sat, 9 Jul 2022 10:56:06 -0700 Subject: xfs: use XFS_IFORK_Q to determine the presence of an xattr fork Modify xfs_ifork_ptr to return a NULL pointer if the caller asks for the attribute fork but i_forkoff is zero. This eliminates the ambiguity between i_forkoff and i_af.if_present, which should make it easier to understand the lifetime of attr forks. While we're at it, remove the if_present checks around calls to xfs_idestroy_fork and xfs_ifork_zap_attr since they can both handle attr forks that have already been torn down. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/libxfs/xfs_attr.c | 2 -- fs/xfs/libxfs/xfs_attr.h | 2 +- fs/xfs/libxfs/xfs_bmap.c | 1 - fs/xfs/libxfs/xfs_inode_buf.c | 1 - fs/xfs/libxfs/xfs_inode_fork.c | 7 +------ fs/xfs/libxfs/xfs_inode_fork.h | 1 - fs/xfs/xfs_attr_inactive.c | 11 ++++------- fs/xfs/xfs_attr_list.c | 1 - fs/xfs/xfs_icache.c | 8 +++----- fs/xfs/xfs_inode.c | 5 ++--- fs/xfs/xfs_inode.h | 2 +- 11 files changed, 12 insertions(+), 29 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index b661dc7600c8..8721d012e983 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -69,8 +69,6 @@ xfs_inode_hasattr( { if (!XFS_IFORK_Q(ip)) return 0; - if (!ip->i_af.if_present) - return 0; if (ip->i_af.if_format == XFS_DINODE_FMT_EXTENTS && ip->i_af.if_nextents == 0) return 0; diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h index 0f100db504ee..36371c3b9069 100644 --- a/fs/xfs/libxfs/xfs_attr.h +++ b/fs/xfs/libxfs/xfs_attr.h @@ -576,7 +576,7 @@ xfs_attr_init_add_state(struct xfs_da_args *args) * context, i_af is guaranteed to exist. Hence if the attr fork is * null, we were called from a pure remove operation and so we are done. */ - if (!args->dp->i_af.if_present) + if (!XFS_IFORK_Q(args->dp)) return XFS_DAS_DONE; args->op_flags |= XFS_DA_OP_ADDNAME; diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 32dc74a04cc8..1ef72443025a 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -1041,7 +1041,6 @@ xfs_bmap_add_attrfork( error = xfs_bmap_set_attrforkoff(ip, size, &version); if (error) goto trans_cancel; - ASSERT(!ip->i_af.if_present); xfs_ifork_init_attr(ip, XFS_DINODE_FMT_EXTENTS, 0); logflags = 0; diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index dd11df5d3bf8..2f742a1b7c0a 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -177,7 +177,6 @@ xfs_inode_from_disk( xfs_failaddr_t fa; ASSERT(ip->i_cowfp == NULL); - ASSERT(!ip->i_af.if_present); fa = xfs_dinode_verify(ip->i_mount, ip->i_ino, from); if (fa) { diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 9d5141c21eee..b0370b837166 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -282,9 +282,6 @@ xfs_ifork_init_attr( enum xfs_dinode_fmt format, xfs_extnum_t nextents) { - ASSERT(!ip->i_af.if_present); - - ip->i_af.if_present = 1; ip->i_af.if_format = format; ip->i_af.if_nextents = nextents; } @@ -293,7 +290,6 @@ void xfs_ifork_zap_attr( struct xfs_inode *ip) { - ASSERT(ip->i_af.if_present); ASSERT(ip->i_af.if_broot == NULL); ASSERT(ip->i_af.if_u1.if_data == NULL); ASSERT(ip->i_af.if_height == 0); @@ -683,7 +679,6 @@ xfs_ifork_init_cow( ip->i_cowfp = kmem_cache_zalloc(xfs_ifork_cache, GFP_NOFS | __GFP_NOFAIL); - ip->i_cowfp->if_present = 1; ip->i_cowfp->if_format = XFS_DINODE_FMT_EXTENTS; } @@ -722,7 +717,7 @@ xfs_ifork_verify_local_attr( struct xfs_ifork *ifp = &ip->i_af; xfs_failaddr_t fa; - if (!ifp->if_present) + if (!XFS_IFORK_Q(ip)) fa = __this_address; else fa = xfs_attr_shortform_verify(ip); diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index 63015e9cee14..0b912bbe4f4b 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h @@ -24,7 +24,6 @@ struct xfs_ifork { xfs_extnum_t if_nextents; /* # of extents in this fork */ short if_broot_bytes; /* bytes allocated for root */ int8_t if_format; /* format of this fork */ - int8_t if_present; /* 1 if present */ }; /* diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c index dbe715fe92ce..ec20ad7a9001 100644 --- a/fs/xfs/xfs_attr_inactive.c +++ b/fs/xfs/xfs_attr_inactive.c @@ -362,12 +362,11 @@ xfs_attr_inactive( /* * Invalidate and truncate the attribute fork extents. Make sure the - * fork actually has attributes as otherwise the invalidation has no + * fork actually has xattr blocks as otherwise the invalidation has no * blocks to read and returns an error. In this case, just do the fork * removal below. */ - if (xfs_inode_hasattr(dp) && - dp->i_af.if_format != XFS_DINODE_FMT_LOCAL) { + if (dp->i_af.if_nextents > 0) { error = xfs_attr3_root_inactive(&trans, dp); if (error) goto out_cancel; @@ -388,10 +387,8 @@ out_cancel: xfs_trans_cancel(trans); out_destroy_fork: /* kill the in-core attr fork before we drop the inode lock */ - if (dp->i_af.if_present) { - xfs_idestroy_fork(&dp->i_af); - xfs_ifork_zap_attr(dp); - } + xfs_idestroy_fork(&dp->i_af); + xfs_ifork_zap_attr(dp); if (lock_mode) xfs_iunlock(dp, lock_mode); return error; diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c index 6a832ee07916..99bbbe1a0e44 100644 --- a/fs/xfs/xfs_attr_list.c +++ b/fs/xfs/xfs_attr_list.c @@ -61,7 +61,6 @@ xfs_attr_shortform_list( int sbsize, nsbuf, count, i; int error = 0; - ASSERT(dp->i_af.if_present); sf = (struct xfs_attr_shortform *)dp->i_af.if_u1.if_data; ASSERT(sf != NULL); if (!sf->hdr.count) diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index e08dedfb7f9d..026c63234f8d 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -102,7 +102,6 @@ xfs_inode_alloc( memset(&ip->i_af, 0, sizeof(ip->i_af)); ip->i_af.if_format = XFS_DINODE_FMT_EXTENTS; memset(&ip->i_df, 0, sizeof(ip->i_df)); - ip->i_df.if_present = 1; ip->i_flags = 0; ip->i_delayed_blks = 0; ip->i_diflags2 = mp->m_ino_geo.new_diflags2; @@ -132,10 +131,9 @@ xfs_inode_free_callback( break; } - if (ip->i_af.if_present) { - xfs_idestroy_fork(&ip->i_af); - xfs_ifork_zap_attr(ip); - } + xfs_idestroy_fork(&ip->i_af); + xfs_ifork_zap_attr(ip); + if (ip->i_cowfp) { xfs_idestroy_fork(ip->i_cowfp); kmem_cache_free(xfs_ifork_cache, ip->i_cowfp); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index f5e553168b42..699c44f57707 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -125,7 +125,7 @@ xfs_ilock_attr_map_shared( { uint lock_mode = XFS_ILOCK_SHARED; - if (ip->i_af.if_present && xfs_need_iread_extents(&ip->i_af)) + if (XFS_IFORK_Q(ip) && xfs_need_iread_extents(&ip->i_af)) lock_mode = XFS_ILOCK_EXCL; xfs_ilock(ip, lock_mode); return lock_mode; @@ -1768,7 +1768,6 @@ xfs_inactive( goto out; } - ASSERT(!ip->i_af.if_present); ASSERT(ip->i_forkoff == 0); /* @@ -3502,7 +3501,7 @@ xfs_iflush( if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL && xfs_ifork_verify_local_data(ip)) goto flush_out; - if (ip->i_af.if_present && + if (XFS_IFORK_Q(ip) && ip->i_af.if_format == XFS_DINODE_FMT_LOCAL && xfs_ifork_verify_local_attr(ip)) goto flush_out; diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 045bad62ac25..9bda01311c2f 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -86,7 +86,7 @@ xfs_ifork_ptr( case XFS_DATA_FORK: return &ip->i_df; case XFS_ATTR_FORK: - if (!ip->i_af.if_present) + if (!XFS_IFORK_Q(ip)) return NULL; return &ip->i_af; case XFS_COW_FORK: -- cgit v1.2.3 From 932b42c66cb5d0ca9800b128415b4ad6b1952b3e Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sat, 9 Jul 2022 10:56:06 -0700 Subject: xfs: replace XFS_IFORK_Q with a proper predicate function Replace this shouty macro with a real C function that has a more descriptive name. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/libxfs/xfs_attr.c | 4 ++-- fs/xfs/libxfs/xfs_attr.h | 2 +- fs/xfs/libxfs/xfs_bmap.c | 4 ++-- fs/xfs/libxfs/xfs_inode_fork.c | 2 +- fs/xfs/libxfs/xfs_inode_fork.h | 5 ++--- fs/xfs/scrub/btree.c | 2 +- fs/xfs/xfs_attr_inactive.c | 4 ++-- fs/xfs/xfs_bmap_util.c | 10 +++++----- fs/xfs/xfs_inode.c | 10 +++++----- fs/xfs/xfs_inode.h | 7 ++++++- fs/xfs/xfs_inode_item.c | 4 ++-- fs/xfs/xfs_iomap.c | 2 +- fs/xfs/xfs_iops.c | 2 +- 13 files changed, 31 insertions(+), 27 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 8721d012e983..e28d93d232de 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -67,7 +67,7 @@ int xfs_inode_hasattr( struct xfs_inode *ip) { - if (!XFS_IFORK_Q(ip)) + if (!xfs_inode_has_attr_fork(ip)) return 0; if (ip->i_af.if_format == XFS_DINODE_FMT_EXTENTS && ip->i_af.if_nextents == 0) @@ -999,7 +999,7 @@ xfs_attr_set( * If the inode doesn't have an attribute fork, add one. * (inode must not be locked when we call this routine) */ - if (XFS_IFORK_Q(dp) == 0) { + if (xfs_inode_has_attr_fork(dp) == 0) { int sf_size = sizeof(struct xfs_attr_sf_hdr) + xfs_attr_sf_entsize_byname(args->namelen, args->valuelen); diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h index 36371c3b9069..81be9b3e4004 100644 --- a/fs/xfs/libxfs/xfs_attr.h +++ b/fs/xfs/libxfs/xfs_attr.h @@ -576,7 +576,7 @@ xfs_attr_init_add_state(struct xfs_da_args *args) * context, i_af is guaranteed to exist. Hence if the attr fork is * null, we were called from a pure remove operation and so we are done. */ - if (!XFS_IFORK_Q(args->dp)) + if (!xfs_inode_has_attr_fork(args->dp)) return XFS_DAS_DONE; args->op_flags |= XFS_DA_OP_ADDNAME; diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 1ef72443025a..e5108df54f5a 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -1023,7 +1023,7 @@ xfs_bmap_add_attrfork( int logflags; /* logging flags */ int error; /* error return value */ - ASSERT(XFS_IFORK_Q(ip) == 0); + ASSERT(xfs_inode_has_attr_fork(ip) == 0); mp = ip->i_mount; ASSERT(!XFS_NOT_DQATTACHED(mp, ip)); @@ -1034,7 +1034,7 @@ xfs_bmap_add_attrfork( rsvd, &tp); if (error) return error; - if (XFS_IFORK_Q(ip)) + if (xfs_inode_has_attr_fork(ip)) goto trans_cancel; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index b0370b837166..b3ba97242e71 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -717,7 +717,7 @@ xfs_ifork_verify_local_attr( struct xfs_ifork *ifp = &ip->i_af; xfs_failaddr_t fa; - if (!XFS_IFORK_Q(ip)) + if (!xfs_inode_has_attr_fork(ip)) fa = __this_address; else fa = xfs_attr_shortform_verify(ip); diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index 0b912bbe4f4b..efee11956540 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h @@ -78,13 +78,12 @@ struct xfs_ifork { * Fork handling. */ -#define XFS_IFORK_Q(ip) ((ip)->i_forkoff != 0) #define XFS_IFORK_BOFF(ip) ((int)((ip)->i_forkoff << 3)) #define XFS_IFORK_DSIZE(ip) \ - (XFS_IFORK_Q(ip) ? XFS_IFORK_BOFF(ip) : XFS_LITINO((ip)->i_mount)) + (xfs_inode_has_attr_fork(ip) ? XFS_IFORK_BOFF(ip) : XFS_LITINO((ip)->i_mount)) #define XFS_IFORK_ASIZE(ip) \ - (XFS_IFORK_Q(ip) ? XFS_LITINO((ip)->i_mount) - XFS_IFORK_BOFF(ip) : 0) + (xfs_inode_has_attr_fork(ip) ? XFS_LITINO((ip)->i_mount) - XFS_IFORK_BOFF(ip) : 0) #define XFS_IFORK_SIZE(ip,w) \ ((w) == XFS_DATA_FORK ? \ XFS_IFORK_DSIZE(ip) : \ diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c index 39dd46f038fe..2f4519590dc1 100644 --- a/fs/xfs/scrub/btree.c +++ b/fs/xfs/scrub/btree.c @@ -462,7 +462,7 @@ xchk_btree_check_iroot_minrecs( */ if (bs->cur->bc_btnum == XFS_BTNUM_BMAP && bs->cur->bc_ino.whichfork == XFS_DATA_FORK && - XFS_IFORK_Q(bs->sc->ip)) + xfs_inode_has_attr_fork(bs->sc->ip)) return false; return true; diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c index ec20ad7a9001..0e83cab9cdde 100644 --- a/fs/xfs/xfs_attr_inactive.c +++ b/fs/xfs/xfs_attr_inactive.c @@ -338,7 +338,7 @@ xfs_attr_inactive( ASSERT(! XFS_NOT_DQATTACHED(mp, dp)); xfs_ilock(dp, lock_mode); - if (!XFS_IFORK_Q(dp)) + if (!xfs_inode_has_attr_fork(dp)) goto out_destroy_fork; xfs_iunlock(dp, lock_mode); @@ -351,7 +351,7 @@ xfs_attr_inactive( lock_mode = XFS_ILOCK_EXCL; xfs_ilock(dp, lock_mode); - if (!XFS_IFORK_Q(dp)) + if (!xfs_inode_has_attr_fork(dp)) goto out_cancel; /* diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index f317586d2f63..8111319cbb46 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -444,7 +444,7 @@ xfs_getbmap( xfs_ilock(ip, XFS_IOLOCK_SHARED); switch (whichfork) { case XFS_ATTR_FORK: - if (!XFS_IFORK_Q(ip)) + if (!xfs_inode_has_attr_fork(ip)) goto out_unlock_iolock; max_len = 1LL << 32; @@ -1320,7 +1320,7 @@ xfs_swap_extents_check_format( * extent format... */ if (tifp->if_format == XFS_DINODE_FMT_BTREE) { - if (XFS_IFORK_Q(ip) && + if (xfs_inode_has_attr_fork(ip) && XFS_BMAP_BMDR_SPACE(tifp->if_broot) > XFS_IFORK_BOFF(ip)) return -EINVAL; if (tifp->if_nextents <= XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)) @@ -1329,7 +1329,7 @@ xfs_swap_extents_check_format( /* Reciprocal target->temp btree format checks */ if (ifp->if_format == XFS_DINODE_FMT_BTREE) { - if (XFS_IFORK_Q(tip) && + if (xfs_inode_has_attr_fork(tip) && XFS_BMAP_BMDR_SPACE(ip->i_df.if_broot) > XFS_IFORK_BOFF(tip)) return -EINVAL; if (ifp->if_nextents <= XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK)) @@ -1506,14 +1506,14 @@ xfs_swap_extent_forks( /* * Count the number of extended attribute blocks */ - if (XFS_IFORK_Q(ip) && ip->i_af.if_nextents > 0 && + if (xfs_inode_has_attr_fork(ip) && ip->i_af.if_nextents > 0 && ip->i_af.if_format != XFS_DINODE_FMT_LOCAL) { error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, &junk, &aforkblks); if (error) return error; } - if (XFS_IFORK_Q(tip) && tip->i_af.if_nextents > 0 && + if (xfs_inode_has_attr_fork(tip) && tip->i_af.if_nextents > 0 && tip->i_af.if_format != XFS_DINODE_FMT_LOCAL) { error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK, &junk, &taforkblks); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 699c44f57707..33cf0b82a0c0 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -125,7 +125,7 @@ xfs_ilock_attr_map_shared( { uint lock_mode = XFS_ILOCK_SHARED; - if (XFS_IFORK_Q(ip) && xfs_need_iread_extents(&ip->i_af)) + if (xfs_inode_has_attr_fork(ip) && xfs_need_iread_extents(&ip->i_af)) lock_mode = XFS_ILOCK_EXCL; xfs_ilock(ip, lock_mode); return lock_mode; @@ -635,7 +635,7 @@ xfs_ip2xflags( flags |= FS_XFLAG_COWEXTSIZE; } - if (XFS_IFORK_Q(ip)) + if (xfs_inode_has_attr_fork(ip)) flags |= FS_XFLAG_HASATTR; return flags; } @@ -1762,7 +1762,7 @@ xfs_inactive( * now. The code calls a routine that recursively deconstructs the * attribute fork. If also blows away the in-core attribute fork. */ - if (XFS_IFORK_Q(ip)) { + if (xfs_inode_has_attr_fork(ip)) { error = xfs_attr_inactive(ip); if (error) goto out; @@ -3501,7 +3501,7 @@ xfs_iflush( if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL && xfs_ifork_verify_local_data(ip)) goto flush_out; - if (XFS_IFORK_Q(ip) && + if (xfs_inode_has_attr_fork(ip) && ip->i_af.if_format == XFS_DINODE_FMT_LOCAL && xfs_ifork_verify_local_attr(ip)) goto flush_out; @@ -3520,7 +3520,7 @@ xfs_iflush( } xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK); - if (XFS_IFORK_Q(ip)) + if (xfs_inode_has_attr_fork(ip)) xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK); /* diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 9bda01311c2f..2badbf9bb80d 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -77,6 +77,11 @@ typedef struct xfs_inode { struct list_head i_ioend_list; } xfs_inode_t; +static inline bool xfs_inode_has_attr_fork(struct xfs_inode *ip) +{ + return ip->i_forkoff > 0; +} + static inline struct xfs_ifork * xfs_ifork_ptr( struct xfs_inode *ip, @@ -86,7 +91,7 @@ xfs_ifork_ptr( case XFS_DATA_FORK: return &ip->i_df; case XFS_ATTR_FORK: - if (!XFS_IFORK_Q(ip)) + if (!xfs_inode_has_attr_fork(ip)) return NULL; return &ip->i_af; case XFS_COW_FORK: diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 77519e6f0b34..a78a0b9dd1d0 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -143,7 +143,7 @@ xfs_inode_item_size( xfs_log_dinode_size(ip->i_mount); xfs_inode_item_data_fork_size(iip, nvecs, nbytes); - if (XFS_IFORK_Q(ip)) + if (xfs_inode_has_attr_fork(ip)) xfs_inode_item_attr_fork_size(iip, nvecs, nbytes); } @@ -480,7 +480,7 @@ xfs_inode_item_format( xfs_inode_item_format_core(ip, lv, &vecp); xfs_inode_item_format_data_fork(iip, ilf, lv, &vecp); - if (XFS_IFORK_Q(ip)) { + if (xfs_inode_has_attr_fork(ip)) { xfs_inode_item_format_attr_fork(iip, ilf, lv, &vecp); } else { iip->ili_fields &= diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index eb54fd259d15..edbfd6abcc15 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -1307,7 +1307,7 @@ xfs_xattr_iomap_begin( lockmode = xfs_ilock_attr_map_shared(ip); /* if there are no attribute fork or extents, return ENOENT */ - if (!XFS_IFORK_Q(ip) || !ip->i_af.if_nextents) { + if (!xfs_inode_has_attr_fork(ip) || !ip->i_af.if_nextents) { error = -ENOENT; goto out_unlock; } diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 6720b60f88cf..14efa0fc1c19 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -1279,7 +1279,7 @@ xfs_setup_inode( * If there is no attribute fork no ACL can exist on this inode, * and it can't have any file capabilities attached to it either. */ - if (!XFS_IFORK_Q(ip)) { + if (!xfs_inode_has_attr_fork(ip)) { inode_has_no_xattr(inode); cache_no_acl(inode); } -- cgit v1.2.3 From c01147d929899f02a0a8b15e406d12784768ca72 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sat, 9 Jul 2022 10:56:07 -0700 Subject: xfs: replace inode fork size macros with functions Replace the shouty macros here with typechecked helper functions. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/libxfs/xfs_attr_leaf.c | 2 +- fs/xfs/libxfs/xfs_bmap.c | 6 +++--- fs/xfs/libxfs/xfs_bmap_btree.c | 2 +- fs/xfs/libxfs/xfs_dir2.c | 2 +- fs/xfs/libxfs/xfs_dir2_block.c | 4 ++-- fs/xfs/libxfs/xfs_dir2_sf.c | 6 +++--- fs/xfs/libxfs/xfs_inode_fork.c | 10 +++++----- fs/xfs/libxfs/xfs_inode_fork.h | 15 +-------------- fs/xfs/scrub/symlink.c | 4 ++-- fs/xfs/xfs_bmap_util.c | 4 ++-- fs/xfs/xfs_inode.h | 35 +++++++++++++++++++++++++++++++++++ fs/xfs/xfs_inode_item.c | 4 ++-- fs/xfs/xfs_itable.c | 2 +- fs/xfs/xfs_symlink.c | 2 +- fs/xfs/xfs_trace.h | 2 +- 15 files changed, 61 insertions(+), 39 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 435c6cc485bf..5bd554b88d99 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -590,7 +590,7 @@ xfs_attr_shortform_bytesfit( * to real extents, or the delalloc conversion will take care of the * literal area rebalancing. */ - if (bytes <= XFS_IFORK_ASIZE(dp)) + if (bytes <= xfs_inode_attr_fork_size(dp)) return dp->i_forkoff; /* diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index e5108df54f5a..e56723dc9cd5 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -880,7 +880,7 @@ xfs_bmap_add_attrfork_btree( mp = ip->i_mount; - if (XFS_BMAP_BMDR_SPACE(block) <= XFS_IFORK_DSIZE(ip)) + if (XFS_BMAP_BMDR_SPACE(block) <= xfs_inode_data_fork_size(ip)) *flags |= XFS_ILOG_DBROOT; else { cur = xfs_bmbt_init_cursor(mp, tp, ip, XFS_DATA_FORK); @@ -920,7 +920,7 @@ xfs_bmap_add_attrfork_extents( int error; /* error return value */ if (ip->i_df.if_nextents * sizeof(struct xfs_bmbt_rec) <= - XFS_IFORK_DSIZE(ip)) + xfs_inode_data_fork_size(ip)) return 0; cur = NULL; error = xfs_bmap_extents_to_btree(tp, ip, &cur, 0, flags, @@ -951,7 +951,7 @@ xfs_bmap_add_attrfork_local( { struct xfs_da_args dargs; /* args for dir/attr code */ - if (ip->i_df.if_bytes <= XFS_IFORK_DSIZE(ip)) + if (ip->i_df.if_bytes <= xfs_inode_data_fork_size(ip)) return 0; if (S_ISDIR(VFS_I(ip)->i_mode)) { diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index 986ffca7f74d..cfa052d40105 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -564,7 +564,7 @@ xfs_bmbt_init_cursor( if (xfs_has_crc(mp)) cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; - cur->bc_ino.forksize = XFS_IFORK_SIZE(ip, whichfork); + cur->bc_ino.forksize = xfs_inode_fork_size(ip, whichfork); cur->bc_ino.ip = ip; cur->bc_ino.allocated = 0; cur->bc_ino.flags = 0; diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index 3cd51fa3837b..76eedc2756b3 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -193,7 +193,7 @@ xfs_dir_isempty( ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); if (dp->i_disk_size == 0) /* might happen during shutdown. */ return 1; - if (dp->i_disk_size > XFS_IFORK_DSIZE(dp)) + if (dp->i_disk_size > xfs_inode_data_fork_size(dp)) return 0; sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; return !sfp->count; diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c index f5e45aa28c91..00f960a703b2 100644 --- a/fs/xfs/libxfs/xfs_dir2_block.c +++ b/fs/xfs/libxfs/xfs_dir2_block.c @@ -842,7 +842,7 @@ xfs_dir2_block_removename( * See if the size as a shortform is good enough. */ size = xfs_dir2_block_sfsize(dp, hdr, &sfh); - if (size > XFS_IFORK_DSIZE(dp)) + if (size > xfs_inode_data_fork_size(dp)) return 0; /* @@ -1055,7 +1055,7 @@ xfs_dir2_leaf_to_block( * Now see if the resulting block can be shrunken to shortform. */ size = xfs_dir2_block_sfsize(dp, hdr, &sfh); - if (size > XFS_IFORK_DSIZE(dp)) + if (size > xfs_inode_data_fork_size(dp)) return 0; return xfs_dir2_block_to_sf(args, dbp, size, &sfh); diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c index a41e2624e115..003812fd7d35 100644 --- a/fs/xfs/libxfs/xfs_dir2_sf.c +++ b/fs/xfs/libxfs/xfs_dir2_sf.c @@ -237,7 +237,7 @@ xfs_dir2_block_sfsize( (i8count ? /* inumber */ count * XFS_INO64_SIZE : count * XFS_INO32_SIZE); - if (size > XFS_IFORK_DSIZE(dp)) + if (size > xfs_inode_data_fork_size(dp)) return size; /* size value is a failure */ } /* @@ -406,7 +406,7 @@ xfs_dir2_sf_addname( * Won't fit as shortform any more (due to size), * or the pick routine says it won't (due to offset values). */ - if (new_isize > XFS_IFORK_DSIZE(dp) || + if (new_isize > xfs_inode_data_fork_size(dp) || (pick = xfs_dir2_sf_addname_pick(args, objchange, &sfep, &offset)) == 0) { /* @@ -1031,7 +1031,7 @@ xfs_dir2_sf_replace_needblock( newsize = dp->i_df.if_bytes + (sfp->count + 1) * XFS_INO64_DIFF; return inum > XFS_DIR2_MAX_SHORT_INUM && - sfp->i8count == 0 && newsize > XFS_IFORK_DSIZE(dp); + sfp->i8count == 0 && newsize > xfs_inode_data_fork_size(dp); } /* diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index b3ba97242e71..7c34184448e6 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -407,7 +407,7 @@ xfs_iroot_realloc( (int)new_size); ifp->if_broot_bytes = (int)new_size; ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <= - XFS_IFORK_SIZE(ip, whichfork)); + xfs_inode_fork_size(ip, whichfork)); memmove(np, op, cur_max * (uint)sizeof(xfs_fsblock_t)); return; } @@ -461,7 +461,7 @@ xfs_iroot_realloc( ifp->if_broot_bytes = (int)new_size; if (ifp->if_broot) ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <= - XFS_IFORK_SIZE(ip, whichfork)); + xfs_inode_fork_size(ip, whichfork)); return; } @@ -491,7 +491,7 @@ xfs_idata_realloc( int64_t new_size = ifp->if_bytes + byte_diff; ASSERT(new_size >= 0); - ASSERT(new_size <= XFS_IFORK_SIZE(ip, whichfork)); + ASSERT(new_size <= xfs_inode_fork_size(ip, whichfork)); if (byte_diff == 0) return; @@ -614,7 +614,7 @@ xfs_iflush_fork( if ((iip->ili_fields & dataflag[whichfork]) && (ifp->if_bytes > 0)) { ASSERT(ifp->if_u1.if_data != NULL); - ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork)); + ASSERT(ifp->if_bytes <= xfs_inode_fork_size(ip, whichfork)); memcpy(cp, ifp->if_u1.if_data, ifp->if_bytes); } break; @@ -633,7 +633,7 @@ xfs_iflush_fork( (ifp->if_broot_bytes > 0)) { ASSERT(ifp->if_broot != NULL); ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <= - XFS_IFORK_SIZE(ip, whichfork)); + xfs_inode_fork_size(ip, whichfork)); xfs_bmbt_to_bmdr(mp, ifp->if_broot, ifp->if_broot_bytes, (xfs_bmdr_block_t *)cp, XFS_DFORK_SIZE(dip, mp, whichfork)); diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index efee11956540..d3943d6ad0b9 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h @@ -77,21 +77,8 @@ struct xfs_ifork { /* * Fork handling. */ - -#define XFS_IFORK_BOFF(ip) ((int)((ip)->i_forkoff << 3)) - -#define XFS_IFORK_DSIZE(ip) \ - (xfs_inode_has_attr_fork(ip) ? XFS_IFORK_BOFF(ip) : XFS_LITINO((ip)->i_mount)) -#define XFS_IFORK_ASIZE(ip) \ - (xfs_inode_has_attr_fork(ip) ? XFS_LITINO((ip)->i_mount) - XFS_IFORK_BOFF(ip) : 0) -#define XFS_IFORK_SIZE(ip,w) \ - ((w) == XFS_DATA_FORK ? \ - XFS_IFORK_DSIZE(ip) : \ - ((w) == XFS_ATTR_FORK ? \ - XFS_IFORK_ASIZE(ip) : \ - 0)) #define XFS_IFORK_MAXEXT(ip, w) \ - (XFS_IFORK_SIZE(ip, w) / sizeof(xfs_bmbt_rec_t)) + (xfs_inode_fork_size(ip, w) / sizeof(xfs_bmbt_rec_t)) static inline bool xfs_ifork_has_extents(struct xfs_ifork *ifp) { diff --git a/fs/xfs/scrub/symlink.c b/fs/xfs/scrub/symlink.c index 0215f014e164..75311f8daeeb 100644 --- a/fs/xfs/scrub/symlink.c +++ b/fs/xfs/scrub/symlink.c @@ -52,8 +52,8 @@ xchk_symlink( /* Inline symlink? */ if (ifp->if_format == XFS_DINODE_FMT_LOCAL) { - if (len > XFS_IFORK_DSIZE(ip) || - len > strnlen(ifp->if_u1.if_data, XFS_IFORK_DSIZE(ip))) + if (len > xfs_inode_data_fork_size(ip) || + len > strnlen(ifp->if_u1.if_data, xfs_inode_data_fork_size(ip))) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); goto out; } diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 8111319cbb46..74f96e1aa5cd 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1321,7 +1321,7 @@ xfs_swap_extents_check_format( */ if (tifp->if_format == XFS_DINODE_FMT_BTREE) { if (xfs_inode_has_attr_fork(ip) && - XFS_BMAP_BMDR_SPACE(tifp->if_broot) > XFS_IFORK_BOFF(ip)) + XFS_BMAP_BMDR_SPACE(tifp->if_broot) > xfs_inode_fork_boff(ip)) return -EINVAL; if (tifp->if_nextents <= XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)) return -EINVAL; @@ -1330,7 +1330,7 @@ xfs_swap_extents_check_format( /* Reciprocal target->temp btree format checks */ if (ifp->if_format == XFS_DINODE_FMT_BTREE) { if (xfs_inode_has_attr_fork(tip) && - XFS_BMAP_BMDR_SPACE(ip->i_df.if_broot) > XFS_IFORK_BOFF(tip)) + XFS_BMAP_BMDR_SPACE(ip->i_df.if_broot) > xfs_inode_fork_boff(tip)) return -EINVAL; if (ifp->if_nextents <= XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK)) return -EINVAL; diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 2badbf9bb80d..7ff828504b3c 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -102,6 +102,41 @@ xfs_ifork_ptr( } } +static inline unsigned int xfs_inode_fork_boff(struct xfs_inode *ip) +{ + return ip->i_forkoff << 3; +} + +static inline unsigned int xfs_inode_data_fork_size(struct xfs_inode *ip) +{ + if (xfs_inode_has_attr_fork(ip)) + return xfs_inode_fork_boff(ip); + + return XFS_LITINO(ip->i_mount); +} + +static inline unsigned int xfs_inode_attr_fork_size(struct xfs_inode *ip) +{ + if (xfs_inode_has_attr_fork(ip)) + return XFS_LITINO(ip->i_mount) - xfs_inode_fork_boff(ip); + return 0; +} + +static inline unsigned int +xfs_inode_fork_size( + struct xfs_inode *ip, + int whichfork) +{ + switch (whichfork) { + case XFS_DATA_FORK: + return xfs_inode_data_fork_size(ip); + case XFS_ATTR_FORK: + return xfs_inode_attr_fork_size(ip); + default: + return 0; + } +} + /* Convert from vfs inode to xfs inode */ static inline struct xfs_inode *XFS_I(struct inode *inode) { diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index a78a0b9dd1d0..6e19ece916bf 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -57,7 +57,7 @@ xfs_inode_item_data_fork_size( ip->i_df.if_nextents > 0 && ip->i_df.if_bytes > 0) { /* worst case, doesn't subtract delalloc extents */ - *nbytes += XFS_IFORK_DSIZE(ip); + *nbytes += xfs_inode_data_fork_size(ip); *nvecs += 1; } break; @@ -98,7 +98,7 @@ xfs_inode_item_attr_fork_size( ip->i_af.if_nextents > 0 && ip->i_af.if_bytes > 0) { /* worst case, doesn't subtract unused space */ - *nbytes += XFS_IFORK_ASIZE(ip); + *nbytes += xfs_inode_attr_fork_size(ip); *nvecs += 1; } break; diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 7d1ff282953f..36312b00b164 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -112,7 +112,7 @@ xfs_bulkstat_one_int( xfs_bulkstat_health(ip, buf); buf->bs_aextents = xfs_ifork_nextents(&ip->i_af); - buf->bs_forkoff = XFS_IFORK_BOFF(ip); + buf->bs_forkoff = xfs_inode_fork_boff(ip); buf->bs_version = XFS_BULKSTAT_VERSION_V5; if (xfs_has_v3inodes(mp)) { diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 4145ba872547..8389f3ef88ef 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -256,7 +256,7 @@ xfs_symlink( /* * If the symlink will fit into the inode, write it inline. */ - if (pathlen <= XFS_IFORK_DSIZE(ip)) { + if (pathlen <= xfs_inode_data_fork_size(ip)) { xfs_init_local_fork(ip, XFS_DATA_FORK, target_path, pathlen); ip->i_disk_size = pathlen; diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 0fa1b7a2918c..8abc49af0d72 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -2171,7 +2171,7 @@ DECLARE_EVENT_CLASS(xfs_swap_extent_class, __entry->format = ip->i_df.if_format; __entry->nex = ip->i_df.if_nextents; __entry->broot_size = ip->i_df.if_broot_bytes; - __entry->fork_off = XFS_IFORK_BOFF(ip); + __entry->fork_off = xfs_inode_fork_boff(ip); ), TP_printk("dev %d:%d ino 0x%llx (%s), %s format, num_extents %llu, " "broot size %d, forkoff 0x%x", -- cgit v1.2.3 From 04a98a036cf8b810dda172a9dcfcbd783bf63655 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Thu, 14 Jul 2022 11:36:36 +1000 Subject: xfs: flush inode gc workqueue before clearing agi bucket In the procedure of recover AGI unlinked lists, if something bad happenes on one of the unlinked inode in the bucket list, we would call xlog_recover_clear_agi_bucket() to clear the whole unlinked bucket list, not the unlinked inodes after the bad one. If we have already added some inodes to the gc workqueue before the bad inode in the list, we could get below error when freeing those inodes, and finaly fail to complete the log recover procedure. XFS (ram0): Internal error xfs_iunlink_remove at line 2456 of file fs/xfs/xfs_inode.c. Caller xfs_ifree+0xb0/0x360 [xfs] The problem is xlog_recover_clear_agi_bucket() clear the bucket list, so the gc worker fail to check the agino in xfs_verify_agino(). Fix this by flush workqueue before clearing the bucket. Fixes: ab23a7768739 ("xfs: per-cpu deferred inode inactivation queues") Signed-off-by: Zhang Yi Reviewed-by: Dave Chinner Reviewed-by: Darrick J. Wong Signed-off-by: Dave Chinner --- fs/xfs/xfs_log_recover.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 3e8c62c6c2b1..bc687459951b 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2714,6 +2714,7 @@ xlog_recover_process_one_iunlink( * Call xlog_recover_clear_agi_bucket() to perform a transaction to * clear the inode pointer in the bucket. */ + xfs_inodegc_flush(pag->pag_mount); xlog_recover_clear_agi_bucket(pag, bucket); return NULLAGINO; } -- cgit v1.2.3 From a4454cd69c66bf3e3bbda352b049732f836fc6b2 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 14 Jul 2022 11:36:40 +1000 Subject: xfs: factor the xfs_iunlink functions Prep work that separates the locking that protects the unlinked list from the actual operations being performed. This also helps document the fact they are performing list insert and remove operations. No functional code change. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong --- fs/xfs/xfs_inode.c | 108 ++++++++++++++++++++++++++++++++--------------------- 1 file changed, 66 insertions(+), 42 deletions(-) diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 482e1ee2d669..69bca88fc8ed 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2115,39 +2115,20 @@ out: return error; } -/* - * This is called when the inode's link count has gone to 0 or we are creating - * a tmpfile via O_TMPFILE. The inode @ip must have nlink == 0. - * - * We place the on-disk inode on a list in the AGI. It will be pulled from this - * list when the inode is freed. - */ -STATIC int -xfs_iunlink( +static int +xfs_iunlink_insert_inode( struct xfs_trans *tp, + struct xfs_perag *pag, + struct xfs_buf *agibp, struct xfs_inode *ip) { struct xfs_mount *mp = tp->t_mountp; - struct xfs_perag *pag; - struct xfs_agi *agi; - struct xfs_buf *agibp; + struct xfs_agi *agi = agibp->b_addr; xfs_agino_t next_agino; xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino); short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; int error; - ASSERT(VFS_I(ip)->i_nlink == 0); - ASSERT(VFS_I(ip)->i_mode != 0); - trace_xfs_iunlink(ip); - - pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); - - /* Get the agi buffer first. It ensures lock ordering on the list. */ - error = xfs_read_agi(pag, tp, &agibp); - if (error) - goto out; - agi = agibp->b_addr; - /* * Get the index into the agi hash table for the list this inode will * go on. Make sure the pointer isn't garbage and that this inode @@ -2157,8 +2138,7 @@ xfs_iunlink( if (next_agino == agino || !xfs_verify_agino_or_null(pag, next_agino)) { xfs_buf_mark_corrupt(agibp); - error = -EFSCORRUPTED; - goto out; + return -EFSCORRUPTED; } if (next_agino != NULLAGINO) { @@ -2171,7 +2151,7 @@ xfs_iunlink( error = xfs_iunlink_update_inode(tp, ip, pag, next_agino, &old_agino); if (error) - goto out; + return error; ASSERT(old_agino == NULLAGINO); /* @@ -2180,11 +2160,42 @@ xfs_iunlink( */ error = xfs_iunlink_add_backref(pag, agino, next_agino); if (error) - goto out; + return error; } /* Point the head of the list to point to this inode. */ - error = xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index, agino); + return xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index, agino); +} + +/* + * This is called when the inode's link count has gone to 0 or we are creating + * a tmpfile via O_TMPFILE. The inode @ip must have nlink == 0. + * + * We place the on-disk inode on a list in the AGI. It will be pulled from this + * list when the inode is freed. + */ +STATIC int +xfs_iunlink( + struct xfs_trans *tp, + struct xfs_inode *ip) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_perag *pag; + struct xfs_buf *agibp; + int error; + + ASSERT(VFS_I(ip)->i_nlink == 0); + ASSERT(VFS_I(ip)->i_mode != 0); + trace_xfs_iunlink(ip); + + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); + + /* Get the agi buffer first. It ensures lock ordering on the list. */ + error = xfs_read_agi(pag, tp, &agibp); + if (error) + goto out; + + error = xfs_iunlink_insert_inode(tp, pag, agibp, ip); out: xfs_perag_put(pag); return error; @@ -2305,18 +2316,15 @@ xfs_iunlink_map_prev( return 0; } -/* - * Pull the on-disk inode from the AGI unlinked list. - */ -STATIC int -xfs_iunlink_remove( +static int +xfs_iunlink_remove_inode( struct xfs_trans *tp, struct xfs_perag *pag, + struct xfs_buf *agibp, struct xfs_inode *ip) { struct xfs_mount *mp = tp->t_mountp; - struct xfs_agi *agi; - struct xfs_buf *agibp; + struct xfs_agi *agi = agibp->b_addr; struct xfs_buf *last_ibp; struct xfs_dinode *last_dip = NULL; xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino); @@ -2327,12 +2335,6 @@ xfs_iunlink_remove( trace_xfs_iunlink_remove(ip); - /* Get the agi buffer first. It ensures lock ordering on the list. */ - error = xfs_read_agi(pag, tp, &agibp); - if (error) - return error; - agi = agibp->b_addr; - /* * Get the index into the agi hash table for the list this inode will * go on. Make sure the head pointer isn't garbage. @@ -2397,6 +2399,28 @@ xfs_iunlink_remove( next_agino); } +/* + * Pull the on-disk inode from the AGI unlinked list. + */ +STATIC int +xfs_iunlink_remove( + struct xfs_trans *tp, + struct xfs_perag *pag, + struct xfs_inode *ip) +{ + struct xfs_buf *agibp; + int error; + + trace_xfs_iunlink_remove(ip); + + /* Get the agi buffer first. It ensures lock ordering on the list. */ + error = xfs_read_agi(pag, tp, &agibp); + if (error) + return error; + + return xfs_iunlink_remove_inode(tp, pag, agibp, ip); +} + /* * Look up the inode number specified and if it is not already marked XFS_ISTALE * mark it stale. We should only find clean inodes in this lookup that aren't -- cgit v1.2.3 From 4fcc94d653270fcc7800dbaf3b11f78cb462b293 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 14 Jul 2022 11:38:54 +1000 Subject: xfs: track the iunlink list pointer in the xfs_inode Having direct access to the i_next_unlinked pointer in unlinked inodes greatly simplifies the processing of inodes on the unlinked list. We no longer need to look up the inode buffer just to find next inode in the list if the xfs_inode is in memory. These improvements will be realised over upcoming patches as other dependencies on the inode buffer for unlinked list processing are removed. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_inode_buf.c | 3 ++- fs/xfs/xfs_inode.c | 5 ++++- fs/xfs/xfs_inode.h | 3 +++ fs/xfs/xfs_log_recover.c | 17 +---------------- 4 files changed, 10 insertions(+), 18 deletions(-) diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 3a12bd3c7c97..806f209defed 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -229,7 +229,8 @@ xfs_inode_from_disk( ip->i_nblocks = be64_to_cpu(from->di_nblocks); ip->i_extsize = be32_to_cpu(from->di_extsize); ip->i_forkoff = from->di_forkoff; - ip->i_diflags = be16_to_cpu(from->di_flags); + ip->i_diflags = be16_to_cpu(from->di_flags); + ip->i_next_unlinked = be32_to_cpu(from->di_next_unlinked); if (from->di_dmevmask || from->di_dmstate) xfs_iflags_set(ip, XFS_IPRESERVE_DM_FIELDS); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 69bca88fc8ed..4055fb4aa968 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2084,7 +2084,8 @@ xfs_iunlink_update_inode( /* Make sure the old pointer isn't garbage. */ old_value = be32_to_cpu(dip->di_next_unlinked); - if (!xfs_verify_agino_or_null(pag, old_value)) { + if (old_value != ip->i_next_unlinked || + !xfs_verify_agino_or_null(pag, old_value)) { xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, dip, sizeof(*dip), __this_address); error = -EFSCORRUPTED; @@ -2153,6 +2154,7 @@ xfs_iunlink_insert_inode( if (error) return error; ASSERT(old_agino == NULLAGINO); + ip->i_next_unlinked = next_agino; /* * agino has been unlinked, add a backref from the next inode @@ -2354,6 +2356,7 @@ xfs_iunlink_remove_inode( error = xfs_iunlink_update_inode(tp, ip, pag, NULLAGINO, &next_agino); if (error) return error; + ip->i_next_unlinked = NULLAGINO; /* * If there was a backref pointing from the next inode back to this diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 7be6f8e705ab..8e2a33c6cbe2 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -68,6 +68,9 @@ typedef struct xfs_inode { uint64_t i_diflags2; /* XFS_DIFLAG2_... */ struct timespec64 i_crtime; /* time created */ + /* unlinked list pointers */ + xfs_agino_t i_next_unlinked; + /* VFS inode */ struct inode i_vnode; /* embedded VFS inode */ diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index bc687459951b..d76e54cbb400 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2673,8 +2673,6 @@ xlog_recover_process_one_iunlink( xfs_agino_t agino, int bucket) { - struct xfs_buf *ibp; - struct xfs_dinode *dip; struct xfs_inode *ip; xfs_ino_t ino; int error; @@ -2684,27 +2682,14 @@ xlog_recover_process_one_iunlink( if (error) goto fail; - /* - * Get the on disk inode to find the next inode in the bucket. - */ - error = xfs_imap_to_bp(pag->pag_mount, NULL, &ip->i_imap, &ibp); - if (error) - goto fail_iput; - dip = xfs_buf_offset(ibp, ip->i_imap.im_boffset); - xfs_iflags_clear(ip, XFS_IRECOVERY); ASSERT(VFS_I(ip)->i_nlink == 0); ASSERT(VFS_I(ip)->i_mode != 0); - /* setup for the next pass */ - agino = be32_to_cpu(dip->di_next_unlinked); - xfs_buf_relse(ibp); - + agino = ip->i_next_unlinked; xfs_irele(ip); return agino; - fail_iput: - xfs_irele(ip); fail: /* * We can't read in the inode this bucket points to, or this inode -- cgit v1.2.3 From 04755d2e5821b3afbaadd09fe5df58d04de36484 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 14 Jul 2022 11:42:39 +1000 Subject: xfs: refactor xlog_recover_process_iunlinks() For upcoming changes to the way inode unlinked list processing is done, the structure of recovery needs to change slightly. We also really need to untangle the messy error handling in list recovery so that actions like emptying the bucket on inode lookup failure are associated with the bucket list walk failing, not failing to look up the inode. Refactor the recovery code now to keep the re-organisation seperate to the algorithm changes. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong --- fs/xfs/xfs_log_recover.c | 137 ++++++++++++++++++++++++----------------------- 1 file changed, 71 insertions(+), 66 deletions(-) diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index d76e54cbb400..8d28487813b0 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2667,41 +2667,35 @@ out_error: return; } -STATIC xfs_agino_t -xlog_recover_process_one_iunlink( - struct xfs_perag *pag, - xfs_agino_t agino, - int bucket) +static int +xlog_recover_iunlink_bucket( + struct xfs_perag *pag, + struct xfs_agi *agi, + int bucket) { - struct xfs_inode *ip; - xfs_ino_t ino; - int error; + struct xfs_mount *mp = pag->pag_mount; + struct xfs_inode *ip; + xfs_agino_t agino; - ino = XFS_AGINO_TO_INO(pag->pag_mount, pag->pag_agno, agino); - error = xfs_iget(pag->pag_mount, NULL, ino, 0, 0, &ip); - if (error) - goto fail; + agino = be32_to_cpu(agi->agi_unlinked[bucket]); + while (agino != NULLAGINO) { + int error; - xfs_iflags_clear(ip, XFS_IRECOVERY); - ASSERT(VFS_I(ip)->i_nlink == 0); - ASSERT(VFS_I(ip)->i_mode != 0); + error = xfs_iget(mp, NULL, + XFS_AGINO_TO_INO(mp, pag->pag_agno, agino), + 0, 0, &ip); + if (error) + return error;; - agino = ip->i_next_unlinked; - xfs_irele(ip); - return agino; + ASSERT(VFS_I(ip)->i_nlink == 0); + ASSERT(VFS_I(ip)->i_mode != 0); + xfs_iflags_clear(ip, XFS_IRECOVERY); + agino = ip->i_next_unlinked; - fail: - /* - * We can't read in the inode this bucket points to, or this inode - * is messed up. Just ditch this bucket of inodes. We will lose - * some inodes and space, but at least we won't hang. - * - * Call xlog_recover_clear_agi_bucket() to perform a transaction to - * clear the inode pointer in the bucket. - */ - xfs_inodegc_flush(pag->pag_mount); - xlog_recover_clear_agi_bucket(pag, bucket); - return NULLAGINO; + xfs_irele(ip); + cond_resched(); + } + return 0; } /* @@ -2727,59 +2721,70 @@ xlog_recover_process_one_iunlink( * scheduled on this CPU to ensure other scheduled work can run without undue * latency. */ -STATIC void -xlog_recover_process_iunlinks( - struct xlog *log) +static void +xlog_recover_iunlink_ag( + struct xfs_perag *pag) { - struct xfs_mount *mp = log->l_mp; - struct xfs_perag *pag; - xfs_agnumber_t agno; struct xfs_agi *agi; struct xfs_buf *agibp; - xfs_agino_t agino; int bucket; int error; - for_each_perag(mp, agno, pag) { - error = xfs_read_agi(pag, NULL, &agibp); + error = xfs_read_agi(pag, NULL, &agibp); + if (error) { + /* + * AGI is b0rked. Don't process it. + * + * We should probably mark the filesystem as corrupt after we've + * recovered all the ag's we can.... + */ + return; + } + + /* + * Unlock the buffer so that it can be acquired in the normal course of + * the transaction to truncate and free each inode. Because we are not + * racing with anyone else here for the AGI buffer, we don't even need + * to hold it locked to read the initial unlinked bucket entries out of + * the buffer. We keep buffer reference though, so that it stays pinned + * in memory while we need the buffer. + */ + agi = agibp->b_addr; + xfs_buf_unlock(agibp); + + for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) { + error = xlog_recover_iunlink_bucket(pag, agi, bucket); if (error) { /* - * AGI is b0rked. Don't process it. - * - * We should probably mark the filesystem as corrupt - * after we've recovered all the ag's we can.... + * Bucket is unrecoverable, so only a repair scan can + * free the remaining unlinked inodes. Just empty the + * bucket and remaining inodes on it unreferenced and + * unfreeable. */ - continue; - } - /* - * Unlock the buffer so that it can be acquired in the normal - * course of the transaction to truncate and free each inode. - * Because we are not racing with anyone else here for the AGI - * buffer, we don't even need to hold it locked to read the - * initial unlinked bucket entries out of the buffer. We keep - * buffer reference though, so that it stays pinned in memory - * while we need the buffer. - */ - agi = agibp->b_addr; - xfs_buf_unlock(agibp); - - for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) { - agino = be32_to_cpu(agi->agi_unlinked[bucket]); - while (agino != NULLAGINO) { - agino = xlog_recover_process_one_iunlink(pag, - agino, bucket); - cond_resched(); - } + xfs_inodegc_flush(pag->pag_mount); + xlog_recover_clear_agi_bucket(pag, bucket); } - xfs_buf_rele(agibp); } + xfs_buf_rele(agibp); +} + +static void +xlog_recover_process_iunlinks( + struct xlog *log) +{ + struct xfs_perag *pag; + xfs_agnumber_t agno; + + for_each_perag(log->l_mp, agno, pag) + xlog_recover_iunlink_ag(pag); + /* * Flush the pending unlinked inodes to ensure that the inactivations * are fully completed on disk and the incore inodes can be reclaimed * before we signal that recovery is complete. */ - xfs_inodegc_flush(mp); + xfs_inodegc_flush(log->l_mp); } STATIC void -- cgit v1.2.3 From a83d5a8b1d946264e24299d6697bb03fe5198668 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 14 Jul 2022 11:43:09 +1000 Subject: xfs: introduce xfs_iunlink_lookup When an inode is on an unlinked list during normal operation, it is guaranteed to be pinned in memory as it is either referenced by the current unlink operation or it has a open file descriptor that references it and has it pinned in memory. Hence to look up an inode on the unlinked list, we can do a direct inode cache lookup and always expect the lookup to succeed. Add a function to do this lookup based on the agino that we use to link the chain of unlinked inodes together so we can begin the conversion the unlinked list manipulations to use in-memory inodes rather than inode cluster buffers and remove the backref cache. Use this lookup function to replace the on-disk inode buffer walk when removing inodes from the unlinked list with an in-core inode unlinked list walk. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong --- fs/xfs/xfs_inode.c | 161 ++++++++++++++++++++++------------------------------- fs/xfs/xfs_trace.h | 1 - 2 files changed, 66 insertions(+), 96 deletions(-) diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 4055fb4aa968..7153e3cc2627 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1992,6 +1992,35 @@ xfs_iunlink_destroy( ASSERT(freed_anything == false || xfs_is_shutdown(pag->pag_mount)); } +/* + * Find an inode on the unlinked list. This does not take references to the + * inode as we have existence guarantees by holding the AGI buffer lock and that + * only unlinked, referenced inodes can be on the unlinked inode list. If we + * don't find the inode in cache, then let the caller handle the situation. + */ +static struct xfs_inode * +xfs_iunlink_lookup( + struct xfs_perag *pag, + xfs_agino_t agino) +{ + struct xfs_inode *ip; + + rcu_read_lock(); + ip = radix_tree_lookup(&pag->pag_ici_root, agino); + + /* + * Inode not in memory or in RCU freeing limbo should not happen. + * Warn about this and let the caller handle the failure. + */ + if (WARN_ON_ONCE(!ip || !ip->i_ino)) { + rcu_read_unlock(); + return NULL; + } + ASSERT(!xfs_iflags_test(ip, XFS_IRECLAIMABLE | XFS_IRECLAIM)); + rcu_read_unlock(); + return ip; +} + /* * Point the AGI unlinked bucket at an inode and log the results. The caller * is responsible for validating the old value. @@ -2097,7 +2126,8 @@ xfs_iunlink_update_inode( * current pointer is the same as the new value, unless we're * terminating the list. */ - *old_next_agino = old_value; + if (old_next_agino) + *old_next_agino = old_value; if (old_value == next_agino) { if (next_agino != NULLAGINO) { xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, @@ -2203,38 +2233,6 @@ out: return error; } -/* Return the imap, dinode pointer, and buffer for an inode. */ -STATIC int -xfs_iunlink_map_ino( - struct xfs_trans *tp, - xfs_agnumber_t agno, - xfs_agino_t agino, - struct xfs_imap *imap, - struct xfs_dinode **dipp, - struct xfs_buf **bpp) -{ - struct xfs_mount *mp = tp->t_mountp; - int error; - - imap->im_blkno = 0; - error = xfs_imap(mp, tp, XFS_AGINO_TO_INO(mp, agno, agino), imap, 0); - if (error) { - xfs_warn(mp, "%s: xfs_imap returned error %d.", - __func__, error); - return error; - } - - error = xfs_imap_to_bp(mp, tp, imap, bpp); - if (error) { - xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.", - __func__, error); - return error; - } - - *dipp = xfs_buf_offset(*bpp, imap->im_boffset); - return 0; -} - /* * Walk the unlinked chain from @head_agino until we find the inode that * points to @target_agino. Return the inode number, map, dinode pointer, @@ -2245,77 +2243,49 @@ xfs_iunlink_map_ino( * * Do not call this function if @target_agino is the head of the list. */ -STATIC int -xfs_iunlink_map_prev( - struct xfs_trans *tp, +static int +xfs_iunlink_lookup_prev( struct xfs_perag *pag, xfs_agino_t head_agino, xfs_agino_t target_agino, - xfs_agino_t *agino, - struct xfs_imap *imap, - struct xfs_dinode **dipp, - struct xfs_buf **bpp) + struct xfs_inode **ipp) { - struct xfs_mount *mp = tp->t_mountp; + struct xfs_inode *ip; xfs_agino_t next_agino; - int error; - - ASSERT(head_agino != target_agino); - *bpp = NULL; - /* See if our backref cache can find it faster. */ - *agino = xfs_iunlink_lookup_backref(pag, target_agino); - if (*agino != NULLAGINO) { - error = xfs_iunlink_map_ino(tp, pag->pag_agno, *agino, imap, - dipp, bpp); - if (error) - return error; + *ipp = NULL; - if (be32_to_cpu((*dipp)->di_next_unlinked) == target_agino) + next_agino = xfs_iunlink_lookup_backref(pag, target_agino); + if (next_agino != NULLAGINO) { + ip = xfs_iunlink_lookup(pag, next_agino); + if (ip && ip->i_next_unlinked == target_agino) { + *ipp = ip; return 0; - - /* - * If we get here the cache contents were corrupt, so drop the - * buffer and fall back to walking the bucket list. - */ - xfs_trans_brelse(tp, *bpp); - *bpp = NULL; - WARN_ON_ONCE(1); + } } - trace_xfs_iunlink_map_prev_fallback(mp, pag->pag_agno); - /* Otherwise, walk the entire bucket until we find it. */ next_agino = head_agino; - while (next_agino != target_agino) { - xfs_agino_t unlinked_agino; - - if (*bpp) - xfs_trans_brelse(tp, *bpp); + while (next_agino != NULLAGINO) { + ip = xfs_iunlink_lookup(pag, next_agino); + if (!ip) + return -EFSCORRUPTED; - *agino = next_agino; - error = xfs_iunlink_map_ino(tp, pag->pag_agno, next_agino, imap, - dipp, bpp); - if (error) - return error; - - unlinked_agino = be32_to_cpu((*dipp)->di_next_unlinked); /* * Make sure this pointer is valid and isn't an obvious * infinite loop. */ - if (!xfs_verify_agino(pag, unlinked_agino) || - next_agino == unlinked_agino) { - XFS_CORRUPTION_ERROR(__func__, - XFS_ERRLEVEL_LOW, mp, - *dipp, sizeof(**dipp)); - error = -EFSCORRUPTED; - return error; + if (!xfs_verify_agino(pag, ip->i_next_unlinked) || + next_agino == ip->i_next_unlinked) + return -EFSCORRUPTED; + + if (ip->i_next_unlinked == target_agino) { + *ipp = ip; + return 0; } - next_agino = unlinked_agino; + next_agino = ip->i_next_unlinked; } - - return 0; + return -EFSCORRUPTED; } static int @@ -2327,8 +2297,6 @@ xfs_iunlink_remove_inode( { struct xfs_mount *mp = tp->t_mountp; struct xfs_agi *agi = agibp->b_addr; - struct xfs_buf *last_ibp; - struct xfs_dinode *last_dip = NULL; xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino); xfs_agino_t next_agino; xfs_agino_t head_agino; @@ -2356,7 +2324,6 @@ xfs_iunlink_remove_inode( error = xfs_iunlink_update_inode(tp, ip, pag, NULLAGINO, &next_agino); if (error) return error; - ip->i_next_unlinked = NULLAGINO; /* * If there was a backref pointing from the next inode back to this @@ -2372,18 +2339,21 @@ xfs_iunlink_remove_inode( } if (head_agino != agino) { - struct xfs_imap imap; - xfs_agino_t prev_agino; + struct xfs_inode *prev_ip; - /* We need to search the list for the inode being freed. */ - error = xfs_iunlink_map_prev(tp, pag, head_agino, agino, - &prev_agino, &imap, &last_dip, &last_ibp); + error = xfs_iunlink_lookup_prev(pag, head_agino, agino, + &prev_ip); if (error) return error; /* Point the previous inode on the list to the next inode. */ - xfs_iunlink_update_dinode(tp, pag, prev_agino, last_ibp, - last_dip, &imap, next_agino); + error = xfs_iunlink_update_inode(tp, prev_ip, pag, next_agino, + NULL); + if (error) + return error; + + prev_ip->i_next_unlinked = ip->i_next_unlinked; + ip->i_next_unlinked = NULLAGINO; /* * Now we deal with the backref for this inode. If this inode @@ -2398,6 +2368,7 @@ xfs_iunlink_remove_inode( } /* Point the head of the list to the next unlinked inode. */ + ip->i_next_unlinked = NULLAGINO; return xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index, next_agino); } diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 0fa1b7a2918c..926543f01335 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -3672,7 +3672,6 @@ DEFINE_EVENT(xfs_ag_inode_class, name, \ TP_ARGS(ip)) DEFINE_AGINODE_EVENT(xfs_iunlink); DEFINE_AGINODE_EVENT(xfs_iunlink_remove); -DEFINE_AG_EVENT(xfs_iunlink_map_prev_fallback); DECLARE_EVENT_CLASS(xfs_fs_corrupt_class, TP_PROTO(struct xfs_mount *mp, unsigned int flags), -- cgit v1.2.3 From 2fd26cc07e9f8050e29bf314cbf1bcb64dbe088c Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 14 Jul 2022 11:46:43 +1000 Subject: xfs: double link the unlinked inode list Now we have forwards traversal via the incore inode in place, we now need to add back pointers to the incore inode to entirely replace the back reference cache. We use the same lookup semantics and constraints as for the forwards pointer lookups during unlinks, and so we can look up any inode in the unlinked list directly and update the list pointers, forwards or backwards, at any time. The only wrinkle in converting the unlinked list manipulations to use in-core previous pointers is that log recovery doesn't have the incore inode state built up so it can't just read in an inode and release it to finish off the unlink. Hence we need to modify the traversal in recovery to read one inode ahead before we release the inode at the head of the list. This populates the next->prev relationship sufficient to be able to replay the unlinked list and hence greatly simplify the runtime code. This recovery algorithm also requires that we actually remove inodes from the unlinked list one at a time as background inode inactivation will result in unlinked list removal racing with the building of the in-memory unlinked list state. We could serialise this by holding the AGI buffer lock when constructing the in memory state, but all that does is lockstep background processing with list building. It is much simpler to flush the inodegc immediately after releasing the inode so that it is unlinked immediately and there is no races present at all. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_ag.c | 8 -- fs/xfs/libxfs/xfs_ag.h | 6 - fs/xfs/xfs_icache.c | 2 + fs/xfs/xfs_inode.c | 344 ++++++++--------------------------------------- fs/xfs/xfs_inode.h | 4 +- fs/xfs/xfs_log_recover.c | 36 ++++- 6 files changed, 90 insertions(+), 310 deletions(-) diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index 71f5dae7ad6c..bb0c700afe3c 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -194,7 +194,6 @@ xfs_free_perag( XFS_IS_CORRUPT(pag->pag_mount, atomic_read(&pag->pag_ref) != 0); cancel_delayed_work_sync(&pag->pag_blockgc_work); - xfs_iunlink_destroy(pag); xfs_buf_hash_destroy(pag); call_rcu(&pag->rcu_head, __xfs_free_perag); @@ -323,10 +322,6 @@ xfs_initialize_perag( if (error) goto out_remove_pag; - error = xfs_iunlink_init(pag); - if (error) - goto out_hash_destroy; - /* first new pag is fully initialized */ if (first_initialised == NULLAGNUMBER) first_initialised = index; @@ -349,8 +344,6 @@ xfs_initialize_perag( mp->m_ag_prealloc_blocks = xfs_prealloc_blocks(mp); return 0; -out_hash_destroy: - xfs_buf_hash_destroy(pag); out_remove_pag: radix_tree_delete(&mp->m_perag_tree, index); out_free_pag: @@ -362,7 +355,6 @@ out_unwind_new_pags: if (!pag) break; xfs_buf_hash_destroy(pag); - xfs_iunlink_destroy(pag); kmem_free(pag); } return error; diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h index 75f7c10c110a..517a138faa66 100644 --- a/fs/xfs/libxfs/xfs_ag.h +++ b/fs/xfs/libxfs/xfs_ag.h @@ -103,12 +103,6 @@ struct xfs_perag { /* background prealloc block trimming */ struct delayed_work pag_blockgc_work; - /* - * Unlinked inode information. This incore information reflects - * data stored in the AGI, so callers must hold the AGI buffer lock - * or have some other means to control concurrency. - */ - struct rhashtable pagi_unlinked_hash; #endif /* __KERNEL__ */ }; diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 2609825d53ee..7f4cc341aacc 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -111,6 +111,8 @@ xfs_inode_alloc( INIT_WORK(&ip->i_ioend_work, xfs_end_io); INIT_LIST_HEAD(&ip->i_ioend_list); spin_lock_init(&ip->i_ioend_lock); + ip->i_next_unlinked = NULLAGINO; + ip->i_prev_unlinked = NULLAGINO; return ip; } diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 7153e3cc2627..63af7680cf73 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1801,197 +1801,22 @@ out: * because we must walk that list to find the inode that points to the inode * being removed from the unlinked hash bucket list. * - * What if we modelled the unlinked list as a collection of records capturing - * "X.next_unlinked = Y" relations? If we indexed those records on Y, we'd - * have a fast way to look up unlinked list predecessors, which avoids the - * slow list walk. That's exactly what we do here (in-core) with a per-AG - * rhashtable. + * Hence we keep an in-memory double linked list to link each inode on an + * unlinked list. Because there are 64 unlinked lists per AGI, keeping pointer + * based lists would require having 64 list heads in the perag, one for each + * list. This is expensive in terms of memory (think millions of AGs) and cache + * misses on lookups. Instead, use the fact that inodes on the unlinked list + * must be referenced at the VFS level to keep them on the list and hence we + * have an existence guarantee for inodes on the unlinked list. * - * Because this is a backref cache, we ignore operational failures since the - * iunlink code can fall back to the slow bucket walk. The only errors that - * should bubble out are for obviously incorrect situations. - * - * All users of the backref cache MUST hold the AGI buffer lock to serialize - * access or have otherwise provided for concurrency control. + * Given we have an existence guarantee, we can use lockless inode cache lookups + * to resolve aginos to xfs inodes. This means we only need 8 bytes per inode + * for the double linked unlinked list, and we don't need any extra locking to + * keep the list safe as all manipulations are done under the AGI buffer lock. + * Keeping the list up to date does not require memory allocation, just finding + * the XFS inode and updating the next/prev unlinked list aginos. */ -/* Capture a "X.next_unlinked = Y" relationship. */ -struct xfs_iunlink { - struct rhash_head iu_rhash_head; - xfs_agino_t iu_agino; /* X */ - xfs_agino_t iu_next_unlinked; /* Y */ -}; - -/* Unlinked list predecessor lookup hashtable construction */ -static int -xfs_iunlink_obj_cmpfn( - struct rhashtable_compare_arg *arg, - const void *obj) -{ - const xfs_agino_t *key = arg->key; - const struct xfs_iunlink *iu = obj; - - if (iu->iu_next_unlinked != *key) - return 1; - return 0; -} - -static const struct rhashtable_params xfs_iunlink_hash_params = { - .min_size = XFS_AGI_UNLINKED_BUCKETS, - .key_len = sizeof(xfs_agino_t), - .key_offset = offsetof(struct xfs_iunlink, - iu_next_unlinked), - .head_offset = offsetof(struct xfs_iunlink, iu_rhash_head), - .automatic_shrinking = true, - .obj_cmpfn = xfs_iunlink_obj_cmpfn, -}; - -/* - * Return X, where X.next_unlinked == @agino. Returns NULLAGINO if no such - * relation is found. - */ -static xfs_agino_t -xfs_iunlink_lookup_backref( - struct xfs_perag *pag, - xfs_agino_t agino) -{ - struct xfs_iunlink *iu; - - iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &agino, - xfs_iunlink_hash_params); - return iu ? iu->iu_agino : NULLAGINO; -} - -/* - * Take ownership of an iunlink cache entry and insert it into the hash table. - * If successful, the entry will be owned by the cache; if not, it is freed. - * Either way, the caller does not own @iu after this call. - */ -static int -xfs_iunlink_insert_backref( - struct xfs_perag *pag, - struct xfs_iunlink *iu) -{ - int error; - - error = rhashtable_insert_fast(&pag->pagi_unlinked_hash, - &iu->iu_rhash_head, xfs_iunlink_hash_params); - /* - * Fail loudly if there already was an entry because that's a sign of - * corruption of in-memory data. Also fail loudly if we see an error - * code we didn't anticipate from the rhashtable code. Currently we - * only anticipate ENOMEM. - */ - if (error) { - WARN(error != -ENOMEM, "iunlink cache insert error %d", error); - kmem_free(iu); - } - /* - * Absorb any runtime errors that aren't a result of corruption because - * this is a cache and we can always fall back to bucket list scanning. - */ - if (error != 0 && error != -EEXIST) - error = 0; - return error; -} - -/* Remember that @prev_agino.next_unlinked = @this_agino. */ -static int -xfs_iunlink_add_backref( - struct xfs_perag *pag, - xfs_agino_t prev_agino, - xfs_agino_t this_agino) -{ - struct xfs_iunlink *iu; - - if (XFS_TEST_ERROR(false, pag->pag_mount, XFS_ERRTAG_IUNLINK_FALLBACK)) - return 0; - - iu = kmem_zalloc(sizeof(*iu), KM_NOFS); - iu->iu_agino = prev_agino; - iu->iu_next_unlinked = this_agino; - - return xfs_iunlink_insert_backref(pag, iu); -} - -/* - * Replace X.next_unlinked = @agino with X.next_unlinked = @next_unlinked. - * If @next_unlinked is NULLAGINO, we drop the backref and exit. If there - * wasn't any such entry then we don't bother. - */ -static int -xfs_iunlink_change_backref( - struct xfs_perag *pag, - xfs_agino_t agino, - xfs_agino_t next_unlinked) -{ - struct xfs_iunlink *iu; - int error; - - /* Look up the old entry; if there wasn't one then exit. */ - iu = rhashtable_lookup_fast(&pag->pagi_unlinked_hash, &agino, - xfs_iunlink_hash_params); - if (!iu) - return 0; - - /* - * Remove the entry. This shouldn't ever return an error, but if we - * couldn't remove the old entry we don't want to add it again to the - * hash table, and if the entry disappeared on us then someone's - * violated the locking rules and we need to fail loudly. Either way - * we cannot remove the inode because internal state is or would have - * been corrupt. - */ - error = rhashtable_remove_fast(&pag->pagi_unlinked_hash, - &iu->iu_rhash_head, xfs_iunlink_hash_params); - if (error) - return error; - - /* If there is no new next entry just free our item and return. */ - if (next_unlinked == NULLAGINO) { - kmem_free(iu); - return 0; - } - - /* Update the entry and re-add it to the hash table. */ - iu->iu_next_unlinked = next_unlinked; - return xfs_iunlink_insert_backref(pag, iu); -} - -/* Set up the in-core predecessor structures. */ -int -xfs_iunlink_init( - struct xfs_perag *pag) -{ - return rhashtable_init(&pag->pagi_unlinked_hash, - &xfs_iunlink_hash_params); -} - -/* Free the in-core predecessor structures. */ -static void -xfs_iunlink_free_item( - void *ptr, - void *arg) -{ - struct xfs_iunlink *iu = ptr; - bool *freed_anything = arg; - - *freed_anything = true; - kmem_free(iu); -} - -void -xfs_iunlink_destroy( - struct xfs_perag *pag) -{ - bool freed_anything = false; - - rhashtable_free_and_destroy(&pag->pagi_unlinked_hash, - xfs_iunlink_free_item, &freed_anything); - - ASSERT(freed_anything == false || xfs_is_shutdown(pag->pag_mount)); -} - /* * Find an inode on the unlinked list. This does not take references to the * inode as we have existence guarantees by holding the AGI buffer lock and that @@ -2021,6 +1846,26 @@ xfs_iunlink_lookup( return ip; } +/* Update the prev pointer of the next agino. */ +static int +xfs_iunlink_update_backref( + struct xfs_perag *pag, + xfs_agino_t prev_agino, + xfs_agino_t next_agino) +{ + struct xfs_inode *ip; + + /* No update necessary if we are at the end of the list. */ + if (next_agino == NULLAGINO) + return 0; + + ip = xfs_iunlink_lookup(pag, next_agino); + if (!ip) + return -EFSCORRUPTED; + ip->i_prev_unlinked = prev_agino; + return 0; +} + /* * Point the AGI unlinked bucket at an inode and log the results. The caller * is responsible for validating the old value. @@ -2172,6 +2017,14 @@ xfs_iunlink_insert_inode( return -EFSCORRUPTED; } + /* + * Update the prev pointer in the next inode to point back to this + * inode. + */ + error = xfs_iunlink_update_backref(pag, agino, next_agino); + if (error) + return error; + if (next_agino != NULLAGINO) { xfs_agino_t old_agino; @@ -2185,14 +2038,6 @@ xfs_iunlink_insert_inode( return error; ASSERT(old_agino == NULLAGINO); ip->i_next_unlinked = next_agino; - - /* - * agino has been unlinked, add a backref from the next inode - * back to agino. - */ - error = xfs_iunlink_add_backref(pag, agino, next_agino); - if (error) - return error; } /* Point the head of the list to point to this inode. */ @@ -2233,61 +2078,6 @@ out: return error; } -/* - * Walk the unlinked chain from @head_agino until we find the inode that - * points to @target_agino. Return the inode number, map, dinode pointer, - * and inode cluster buffer of that inode as @agino, @imap, @dipp, and @bpp. - * - * @tp, @pag, @head_agino, and @target_agino are input parameters. - * @agino, @imap, @dipp, and @bpp are all output parameters. - * - * Do not call this function if @target_agino is the head of the list. - */ -static int -xfs_iunlink_lookup_prev( - struct xfs_perag *pag, - xfs_agino_t head_agino, - xfs_agino_t target_agino, - struct xfs_inode **ipp) -{ - struct xfs_inode *ip; - xfs_agino_t next_agino; - - *ipp = NULL; - - next_agino = xfs_iunlink_lookup_backref(pag, target_agino); - if (next_agino != NULLAGINO) { - ip = xfs_iunlink_lookup(pag, next_agino); - if (ip && ip->i_next_unlinked == target_agino) { - *ipp = ip; - return 0; - } - } - - /* Otherwise, walk the entire bucket until we find it. */ - next_agino = head_agino; - while (next_agino != NULLAGINO) { - ip = xfs_iunlink_lookup(pag, next_agino); - if (!ip) - return -EFSCORRUPTED; - - /* - * Make sure this pointer is valid and isn't an obvious - * infinite loop. - */ - if (!xfs_verify_agino(pag, ip->i_next_unlinked) || - next_agino == ip->i_next_unlinked) - return -EFSCORRUPTED; - - if (ip->i_next_unlinked == target_agino) { - *ipp = ip; - return 0; - } - next_agino = ip->i_next_unlinked; - } - return -EFSCORRUPTED; -} - static int xfs_iunlink_remove_inode( struct xfs_trans *tp, @@ -2326,51 +2116,33 @@ xfs_iunlink_remove_inode( return error; /* - * If there was a backref pointing from the next inode back to this - * one, remove it because we've removed this inode from the list. - * - * Later, if this inode was in the middle of the list we'll update - * this inode's backref to point from the next inode. + * Update the prev pointer in the next inode to point back to previous + * inode in the chain. */ - if (next_agino != NULLAGINO) { - error = xfs_iunlink_change_backref(pag, next_agino, NULLAGINO); - if (error) - return error; - } + error = xfs_iunlink_update_backref(pag, ip->i_prev_unlinked, + ip->i_next_unlinked); + if (error) + return error; if (head_agino != agino) { struct xfs_inode *prev_ip; - error = xfs_iunlink_lookup_prev(pag, head_agino, agino, - &prev_ip); - if (error) - return error; - - /* Point the previous inode on the list to the next inode. */ - error = xfs_iunlink_update_inode(tp, prev_ip, pag, next_agino, - NULL); - if (error) - return error; + prev_ip = xfs_iunlink_lookup(pag, ip->i_prev_unlinked); + if (!prev_ip) + return -EFSCORRUPTED; + error = xfs_iunlink_update_inode(tp, prev_ip, pag, + ip->i_next_unlinked, NULL); prev_ip->i_next_unlinked = ip->i_next_unlinked; - ip->i_next_unlinked = NULLAGINO; - - /* - * Now we deal with the backref for this inode. If this inode - * pointed at a real inode, change the backref that pointed to - * us to point to our old next. If this inode was the end of - * the list, delete the backref that pointed to us. Note that - * change_backref takes care of deleting the backref if - * next_agino is NULLAGINO. - */ - return xfs_iunlink_change_backref(agibp->b_pag, agino, - next_agino); + } else { + /* Point the head of the list to the next unlinked inode. */ + error = xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index, + ip->i_next_unlinked); } - /* Point the head of the list to the next unlinked inode. */ ip->i_next_unlinked = NULLAGINO; - return xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index, - next_agino); + ip->i_prev_unlinked = NULLAGINO; + return error; } /* diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 8e2a33c6cbe2..8d8cce61e5ba 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -70,6 +70,7 @@ typedef struct xfs_inode { /* unlinked list pointers */ xfs_agino_t i_next_unlinked; + xfs_agino_t i_prev_unlinked; /* VFS inode */ struct inode i_vnode; /* embedded VFS inode */ @@ -508,9 +509,6 @@ extern struct kmem_cache *xfs_inode_cache; bool xfs_inode_needs_inactive(struct xfs_inode *ip); -int xfs_iunlink_init(struct xfs_perag *pag); -void xfs_iunlink_destroy(struct xfs_perag *pag); - void xfs_end_io(struct work_struct *work); int xfs_ilock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2); diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 8d28487813b0..9e0e7ff76e02 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2674,28 +2674,50 @@ xlog_recover_iunlink_bucket( int bucket) { struct xfs_mount *mp = pag->pag_mount; + struct xfs_inode *prev_ip = NULL; struct xfs_inode *ip; - xfs_agino_t agino; + xfs_agino_t prev_agino, agino; + int error = 0; agino = be32_to_cpu(agi->agi_unlinked[bucket]); while (agino != NULLAGINO) { - int error; - error = xfs_iget(mp, NULL, XFS_AGINO_TO_INO(mp, pag->pag_agno, agino), 0, 0, &ip); if (error) - return error;; + break; ASSERT(VFS_I(ip)->i_nlink == 0); ASSERT(VFS_I(ip)->i_mode != 0); xfs_iflags_clear(ip, XFS_IRECOVERY); agino = ip->i_next_unlinked; - xfs_irele(ip); - cond_resched(); + if (prev_ip) { + ip->i_prev_unlinked = prev_agino; + xfs_irele(prev_ip); + + /* + * Ensure the inode is removed from the unlinked list + * before we continue so that it won't race with + * building the in-memory list here. This could be + * serialised with the agibp lock, but that just + * serialises via lockstepping and it's much simpler + * just to flush the inodegc queue and wait for it to + * complete. + */ + xfs_inodegc_flush(mp); + } + + prev_agino = agino; + prev_ip = ip; } - return 0; + + if (prev_ip) { + ip->i_prev_unlinked = prev_agino; + xfs_irele(prev_ip); + } + xfs_inodegc_flush(mp); + return error; } /* -- cgit v1.2.3 From 5301f87013145a874cda4ae008b6fcc2b810a721 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 14 Jul 2022 11:46:46 +1000 Subject: xfs: clean up xfs_iunlink_update_inode() We no longer need to have this function return the previous next agino value from the on-disk inode as we have it in the in-core inode now. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong --- fs/xfs/xfs_inode.c | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 63af7680cf73..550574b63b9a 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1935,13 +1935,12 @@ xfs_iunlink_update_dinode( } /* Set an in-core inode's unlinked pointer and return the old value. */ -STATIC int +static int xfs_iunlink_update_inode( struct xfs_trans *tp, struct xfs_inode *ip, struct xfs_perag *pag, - xfs_agino_t next_agino, - xfs_agino_t *old_next_agino) + xfs_agino_t next_agino) { struct xfs_mount *mp = tp->t_mountp; struct xfs_dinode *dip; @@ -1971,8 +1970,6 @@ xfs_iunlink_update_inode( * current pointer is the same as the new value, unless we're * terminating the list. */ - if (old_next_agino) - *old_next_agino = old_value; if (old_value == next_agino) { if (next_agino != NULLAGINO) { xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, @@ -2026,17 +2023,13 @@ xfs_iunlink_insert_inode( return error; if (next_agino != NULLAGINO) { - xfs_agino_t old_agino; - /* * There is already another inode in the bucket, so point this * inode to the current head of the list. */ - error = xfs_iunlink_update_inode(tp, ip, pag, next_agino, - &old_agino); + error = xfs_iunlink_update_inode(tp, ip, pag, next_agino); if (error) return error; - ASSERT(old_agino == NULLAGINO); ip->i_next_unlinked = next_agino; } @@ -2088,7 +2081,6 @@ xfs_iunlink_remove_inode( struct xfs_mount *mp = tp->t_mountp; struct xfs_agi *agi = agibp->b_addr; xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino); - xfs_agino_t next_agino; xfs_agino_t head_agino; short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; int error; @@ -2111,7 +2103,7 @@ xfs_iunlink_remove_inode( * the old pointer value so that we can update whatever was previous * to us in the list to point to whatever was next in the list. */ - error = xfs_iunlink_update_inode(tp, ip, pag, NULLAGINO, &next_agino); + error = xfs_iunlink_update_inode(tp, ip, pag, NULLAGINO); if (error) return error; @@ -2132,7 +2124,7 @@ xfs_iunlink_remove_inode( return -EFSCORRUPTED; error = xfs_iunlink_update_inode(tp, prev_ip, pag, - ip->i_next_unlinked, NULL); + ip->i_next_unlinked); prev_ip->i_next_unlinked = ip->i_next_unlinked; } else { /* Point the head of the list to the next unlinked inode. */ -- cgit v1.2.3 From 062efdb0803adac3fad039d681789c5e01818bef Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 14 Jul 2022 11:46:59 +1000 Subject: xfs: combine iunlink inode update functions Combine the logging of the inode unlink list update into the calling function that looks up the buffer we end up logging. These do not need to be separate functions as they are both short, simple operations and there's only a single call path through them. This new function will end up being the core of the iunlink log item processing... Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong --- fs/xfs/xfs_inode.c | 52 ++++++++++++++++------------------------------------ 1 file changed, 16 insertions(+), 36 deletions(-) diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 550574b63b9a..515878399dda 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1905,38 +1905,9 @@ xfs_iunlink_update_bucket( return 0; } -/* Set an on-disk inode's next_unlinked pointer. */ -STATIC void -xfs_iunlink_update_dinode( - struct xfs_trans *tp, - struct xfs_perag *pag, - xfs_agino_t agino, - struct xfs_buf *ibp, - struct xfs_dinode *dip, - struct xfs_imap *imap, - xfs_agino_t next_agino) -{ - struct xfs_mount *mp = tp->t_mountp; - int offset; - - ASSERT(xfs_verify_agino_or_null(pag, next_agino)); - - trace_xfs_iunlink_update_dinode(mp, pag->pag_agno, agino, - be32_to_cpu(dip->di_next_unlinked), next_agino); - - dip->di_next_unlinked = cpu_to_be32(next_agino); - offset = imap->im_boffset + - offsetof(struct xfs_dinode, di_next_unlinked); - - /* need to recalc the inode CRC if appropriate */ - xfs_dinode_calc_crc(mp, dip); - xfs_trans_inode_buf(tp, ibp); - xfs_trans_log_buf(tp, ibp, offset, offset + sizeof(xfs_agino_t) - 1); -} - /* Set an in-core inode's unlinked pointer and return the old value. */ static int -xfs_iunlink_update_inode( +xfs_iunlink_log_inode( struct xfs_trans *tp, struct xfs_inode *ip, struct xfs_perag *pag, @@ -1946,6 +1917,7 @@ xfs_iunlink_update_inode( struct xfs_dinode *dip; struct xfs_buf *ibp; xfs_agino_t old_value; + int offset; int error; ASSERT(xfs_verify_agino_or_null(pag, next_agino)); @@ -1979,9 +1951,17 @@ xfs_iunlink_update_inode( goto out; } - /* Ok, update the new pointer. */ - xfs_iunlink_update_dinode(tp, pag, XFS_INO_TO_AGINO(mp, ip->i_ino), - ibp, dip, &ip->i_imap, next_agino); + trace_xfs_iunlink_update_dinode(mp, pag->pag_agno, + XFS_INO_TO_AGINO(mp, ip->i_ino), + be32_to_cpu(dip->di_next_unlinked), next_agino); + + dip->di_next_unlinked = cpu_to_be32(next_agino); + offset = ip->i_imap.im_boffset + + offsetof(struct xfs_dinode, di_next_unlinked); + + xfs_dinode_calc_crc(mp, dip); + xfs_trans_inode_buf(tp, ibp); + xfs_trans_log_buf(tp, ibp, offset, offset + sizeof(xfs_agino_t) - 1); return 0; out: xfs_trans_brelse(tp, ibp); @@ -2027,7 +2007,7 @@ xfs_iunlink_insert_inode( * There is already another inode in the bucket, so point this * inode to the current head of the list. */ - error = xfs_iunlink_update_inode(tp, ip, pag, next_agino); + error = xfs_iunlink_log_inode(tp, ip, pag, next_agino); if (error) return error; ip->i_next_unlinked = next_agino; @@ -2103,7 +2083,7 @@ xfs_iunlink_remove_inode( * the old pointer value so that we can update whatever was previous * to us in the list to point to whatever was next in the list. */ - error = xfs_iunlink_update_inode(tp, ip, pag, NULLAGINO); + error = xfs_iunlink_log_inode(tp, ip, pag, NULLAGINO); if (error) return error; @@ -2123,7 +2103,7 @@ xfs_iunlink_remove_inode( if (!prev_ip) return -EFSCORRUPTED; - error = xfs_iunlink_update_inode(tp, prev_ip, pag, + error = xfs_iunlink_log_inode(tp, prev_ip, pag, ip->i_next_unlinked); prev_ip->i_next_unlinked = ip->i_next_unlinked; } else { -- cgit v1.2.3 From fad743d7cd8bd92d03c09e71f29eace860f50415 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 14 Jul 2022 11:47:26 +1000 Subject: xfs: add log item precommit operation For inodes that are dirty, we have an attached cluster buffer that we want to use to track the dirty inode through the AIL. Unfortunately, locking the cluster buffer and adding it to the transaction when the inode is first logged in a transaction leads to buffer lock ordering inversions. The specific problem is ordering against the AGI buffer. When modifying unlinked lists, the buffer lock order is AGI -> inode cluster buffer as the AGI buffer lock serialises all access to the unlinked lists. Unfortunately, functionality like xfs_droplink() logs the inode before calling xfs_iunlink(), as do various directory manipulation functions. The inode can be logged way down in the stack as far as the bmapi routines and hence, without a major rewrite of lots of APIs there's no way we can avoid the inode being logged by something until after the AGI has been logged. As we are going to be using ordered buffers for inode AIL tracking, there isn't a need to actually lock that buffer against modification as all the modifications are captured by logging the inode item itself. Hence we don't actually need to join the cluster buffer into the transaction until just before it is committed. This means we do not perturb any of the existing buffer lock orders in transactions, and the inode cluster buffer is always locked last in a transaction that doesn't otherwise touch inode cluster buffers. We do this by introducing a precommit log item method. This commit just introduces the mechanism; the inode item implementation is in followup commits. The precommit items need to be sorted into consistent order as we may be locking multiple items here. Hence if we have two dirty inodes in cluster buffers A and B, and some other transaction has two separate dirty inodes in the same cluster buffers, locking them in different orders opens us up to ABBA deadlocks. Hence we sort the items on the transaction based on the presence of a sort log item method. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_icache.c | 1 + fs/xfs/xfs_trans.c | 91 +++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/xfs/xfs_trans.h | 6 ++-- 3 files changed, 96 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 7f4cc341aacc..aef4097ffd3e 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -914,6 +914,7 @@ reclaim: ip->i_checked = 0; spin_unlock(&ip->i_flags_lock); + ASSERT(!ip->i_itemp || ip->i_itemp->ili_item.li_buf == NULL); xfs_iunlock(ip, XFS_ILOCK_EXCL); XFS_STATS_INC(ip->i_mount, xs_ig_reclaims); diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 82cf0189c0db..c49a61a9757d 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -844,6 +844,90 @@ xfs_trans_committed_bulk( spin_unlock(&ailp->ail_lock); } +/* + * Sort transaction items prior to running precommit operations. This will + * attempt to order the items such that they will always be locked in the same + * order. Items that have no sort function are moved to the end of the list + * and so are locked last. + * + * This may need refinement as different types of objects add sort functions. + * + * Function is more complex than it needs to be because we are comparing 64 bit + * values and the function only returns 32 bit values. + */ +static int +xfs_trans_precommit_sort( + void *unused_arg, + const struct list_head *a, + const struct list_head *b) +{ + struct xfs_log_item *lia = container_of(a, + struct xfs_log_item, li_trans); + struct xfs_log_item *lib = container_of(b, + struct xfs_log_item, li_trans); + int64_t diff; + + /* + * If both items are non-sortable, leave them alone. If only one is + * sortable, move the non-sortable item towards the end of the list. + */ + if (!lia->li_ops->iop_sort && !lib->li_ops->iop_sort) + return 0; + if (!lia->li_ops->iop_sort) + return 1; + if (!lib->li_ops->iop_sort) + return -1; + + diff = lia->li_ops->iop_sort(lia) - lib->li_ops->iop_sort(lib); + if (diff < 0) + return -1; + if (diff > 0) + return 1; + return 0; +} + +/* + * Run transaction precommit functions. + * + * If there is an error in any of the callouts, then stop immediately and + * trigger a shutdown to abort the transaction. There is no recovery possible + * from errors at this point as the transaction is dirty.... + */ +static int +xfs_trans_run_precommits( + struct xfs_trans *tp) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_log_item *lip, *n; + int error = 0; + + /* + * Sort the item list to avoid ABBA deadlocks with other transactions + * running precommit operations that lock multiple shared items such as + * inode cluster buffers. + */ + list_sort(NULL, &tp->t_items, xfs_trans_precommit_sort); + + /* + * Precommit operations can remove the log item from the transaction + * if the log item exists purely to delay modifications until they + * can be ordered against other operations. Hence we have to use + * list_for_each_entry_safe() here. + */ + list_for_each_entry_safe(lip, n, &tp->t_items, li_trans) { + if (!test_bit(XFS_LI_DIRTY, &lip->li_flags)) + continue; + if (lip->li_ops->iop_precommit) { + error = lip->li_ops->iop_precommit(tp, lip); + if (error) + break; + } + } + if (error) + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + return error; +} + /* * Commit the given transaction to the log. * @@ -869,6 +953,13 @@ __xfs_trans_commit( trace_xfs_trans_commit(tp, _RET_IP_); + error = xfs_trans_run_precommits(tp); + if (error) { + if (tp->t_flags & XFS_TRANS_PERM_LOG_RES) + xfs_defer_cancel(tp); + goto out_unreserve; + } + /* * Finish deferred items on final commit. Only permanent transactions * should ever have deferred ops. diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 9561f193e7e1..64062e3b788b 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -71,10 +71,12 @@ struct xfs_item_ops { void (*iop_format)(struct xfs_log_item *, struct xfs_log_vec *); void (*iop_pin)(struct xfs_log_item *); void (*iop_unpin)(struct xfs_log_item *, int remove); - uint (*iop_push)(struct xfs_log_item *, struct list_head *); + uint64_t (*iop_sort)(struct xfs_log_item *lip); + int (*iop_precommit)(struct xfs_trans *tp, struct xfs_log_item *lip); void (*iop_committing)(struct xfs_log_item *lip, xfs_csn_t seq); - void (*iop_release)(struct xfs_log_item *); xfs_lsn_t (*iop_committed)(struct xfs_log_item *, xfs_lsn_t); + uint (*iop_push)(struct xfs_log_item *, struct list_head *); + void (*iop_release)(struct xfs_log_item *); int (*iop_recover)(struct xfs_log_item *lip, struct list_head *capture_list); bool (*iop_match)(struct xfs_log_item *item, uint64_t id); -- cgit v1.2.3 From 784eb7d8dd4163b82a19b914f76b2834a58a3e4c Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 14 Jul 2022 11:47:42 +1000 Subject: xfs: add in-memory iunlink log item Now that we have a clean operation to update the di_next_unlinked field of inode cluster buffers, we can easily defer this operation to transaction commit time so we can order the inode cluster buffer locking consistently. To do this, we introduce a new in-memory log item to track the unlinked list item modification that we are going to make. This follows the same observations as the in-memory double linked list used to track unlinked inodes in that the inodes on the list are pinned in memory and cannot go away, and hence we can simply reference them for the duration of the transaction without needing to take active references or pin them or look them up. This allows us to pass the xfs_inode to the transaction commit code along with the modification to be made, and then order the logged modifications via the ->iop_sort and ->iop_precommit operations for the new log item type. As this is an in-memory log item, it doesn't have formatting, CIL or AIL operational hooks - it exists purely to run the inode unlink modifications and is then removed from the transaction item list and freed once the precommit operation has run. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/Makefile | 1 + fs/xfs/xfs_inode.c | 64 +---------------- fs/xfs/xfs_iunlink_item.c | 180 ++++++++++++++++++++++++++++++++++++++++++++++ fs/xfs/xfs_iunlink_item.h | 27 +++++++ fs/xfs/xfs_super.c | 10 +++ 5 files changed, 219 insertions(+), 63 deletions(-) create mode 100644 fs/xfs/xfs_iunlink_item.c create mode 100644 fs/xfs/xfs_iunlink_item.h diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index b056cfc6398e..1131dd01e4fe 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -106,6 +106,7 @@ xfs-y += xfs_log.o \ xfs_icreate_item.o \ xfs_inode_item.o \ xfs_inode_item_recover.o \ + xfs_iunlink_item.o \ xfs_refcount_item.o \ xfs_rmap_item.o \ xfs_log_recover.o \ diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 515878399dda..ace537a17d0d 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -20,6 +20,7 @@ #include "xfs_trans.h" #include "xfs_buf_item.h" #include "xfs_inode_item.h" +#include "xfs_iunlink_item.h" #include "xfs_ialloc.h" #include "xfs_bmap.h" #include "xfs_bmap_util.h" @@ -1905,69 +1906,6 @@ xfs_iunlink_update_bucket( return 0; } -/* Set an in-core inode's unlinked pointer and return the old value. */ -static int -xfs_iunlink_log_inode( - struct xfs_trans *tp, - struct xfs_inode *ip, - struct xfs_perag *pag, - xfs_agino_t next_agino) -{ - struct xfs_mount *mp = tp->t_mountp; - struct xfs_dinode *dip; - struct xfs_buf *ibp; - xfs_agino_t old_value; - int offset; - int error; - - ASSERT(xfs_verify_agino_or_null(pag, next_agino)); - - error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &ibp); - if (error) - return error; - dip = xfs_buf_offset(ibp, ip->i_imap.im_boffset); - - /* Make sure the old pointer isn't garbage. */ - old_value = be32_to_cpu(dip->di_next_unlinked); - if (old_value != ip->i_next_unlinked || - !xfs_verify_agino_or_null(pag, old_value)) { - xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, dip, - sizeof(*dip), __this_address); - error = -EFSCORRUPTED; - goto out; - } - - /* - * Since we're updating a linked list, we should never find that the - * current pointer is the same as the new value, unless we're - * terminating the list. - */ - if (old_value == next_agino) { - if (next_agino != NULLAGINO) { - xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, - dip, sizeof(*dip), __this_address); - error = -EFSCORRUPTED; - } - goto out; - } - - trace_xfs_iunlink_update_dinode(mp, pag->pag_agno, - XFS_INO_TO_AGINO(mp, ip->i_ino), - be32_to_cpu(dip->di_next_unlinked), next_agino); - - dip->di_next_unlinked = cpu_to_be32(next_agino); - offset = ip->i_imap.im_boffset + - offsetof(struct xfs_dinode, di_next_unlinked); - - xfs_dinode_calc_crc(mp, dip); - xfs_trans_inode_buf(tp, ibp); - xfs_trans_log_buf(tp, ibp, offset, offset + sizeof(xfs_agino_t) - 1); - return 0; -out: - xfs_trans_brelse(tp, ibp); - return error; -} - static int xfs_iunlink_insert_inode( struct xfs_trans *tp, diff --git a/fs/xfs/xfs_iunlink_item.c b/fs/xfs/xfs_iunlink_item.c new file mode 100644 index 000000000000..43005ce8bd48 --- /dev/null +++ b/fs/xfs/xfs_iunlink_item.c @@ -0,0 +1,180 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2020-2022, Red Hat, Inc. + * All Rights Reserved. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_trans_priv.h" +#include "xfs_ag.h" +#include "xfs_iunlink_item.h" +#include "xfs_trace.h" +#include "xfs_error.h" + +struct kmem_cache *xfs_iunlink_cache; + +static inline struct xfs_iunlink_item *IUL_ITEM(struct xfs_log_item *lip) +{ + return container_of(lip, struct xfs_iunlink_item, item); +} + +static void +xfs_iunlink_item_release( + struct xfs_log_item *lip) +{ + struct xfs_iunlink_item *iup = IUL_ITEM(lip); + + xfs_perag_put(iup->pag); + kmem_cache_free(xfs_iunlink_cache, IUL_ITEM(lip)); +} + + +static uint64_t +xfs_iunlink_item_sort( + struct xfs_log_item *lip) +{ + return IUL_ITEM(lip)->ip->i_ino; +} + +/* + * Look up the inode cluster buffer and log the on-disk unlinked inode change + * we need to make. + */ +static int +xfs_iunlink_log_dinode( + struct xfs_trans *tp, + struct xfs_iunlink_item *iup) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_inode *ip = iup->ip; + struct xfs_dinode *dip; + struct xfs_buf *ibp; + int offset; + int error; + + error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &ibp); + if (error) + return error; + /* + * Don't log the unlinked field on stale buffers as this may be the + * transaction that frees the inode cluster and relogging the buffer + * here will incorrectly remove the stale state. + */ + if (ibp->b_flags & XBF_STALE) + goto out; + + dip = xfs_buf_offset(ibp, ip->i_imap.im_boffset); + + /* Make sure the old pointer isn't garbage. */ + if (be32_to_cpu(dip->di_next_unlinked) != iup->old_agino) { + xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, dip, + sizeof(*dip), __this_address); + error = -EFSCORRUPTED; + goto out; + } + + trace_xfs_iunlink_update_dinode(mp, iup->pag->pag_agno, + XFS_INO_TO_AGINO(mp, ip->i_ino), + be32_to_cpu(dip->di_next_unlinked), iup->next_agino); + + dip->di_next_unlinked = cpu_to_be32(iup->next_agino); + offset = ip->i_imap.im_boffset + + offsetof(struct xfs_dinode, di_next_unlinked); + + xfs_dinode_calc_crc(mp, dip); + xfs_trans_inode_buf(tp, ibp); + xfs_trans_log_buf(tp, ibp, offset, offset + sizeof(xfs_agino_t) - 1); + return 0; +out: + xfs_trans_brelse(tp, ibp); + return error; +} + +/* + * On precommit, we grab the inode cluster buffer for the inode number we were + * passed, then update the next unlinked field for that inode in the buffer and + * log the buffer. This ensures that the inode cluster buffer was logged in the + * correct order w.r.t. other inode cluster buffers. We can then remove the + * iunlink item from the transaction and release it as it is has now served it's + * purpose. + */ +static int +xfs_iunlink_item_precommit( + struct xfs_trans *tp, + struct xfs_log_item *lip) +{ + struct xfs_iunlink_item *iup = IUL_ITEM(lip); + int error; + + error = xfs_iunlink_log_dinode(tp, iup); + list_del(&lip->li_trans); + xfs_iunlink_item_release(lip); + return error; +} + +static const struct xfs_item_ops xfs_iunlink_item_ops = { + .iop_release = xfs_iunlink_item_release, + .iop_sort = xfs_iunlink_item_sort, + .iop_precommit = xfs_iunlink_item_precommit, +}; + + +/* + * Initialize the inode log item for a newly allocated (in-core) inode. + * + * Inode extents can only reside within an AG. Hence specify the starting + * block for the inode chunk by offset within an AG as well as the + * length of the allocated extent. + * + * This joins the item to the transaction and marks it dirty so + * that we don't need a separate call to do this, nor does the + * caller need to know anything about the iunlink item. + */ +int +xfs_iunlink_log_inode( + struct xfs_trans *tp, + struct xfs_inode *ip, + struct xfs_perag *pag, + xfs_agino_t next_agino) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_iunlink_item *iup; + + ASSERT(xfs_verify_agino_or_null(pag, next_agino)); + ASSERT(xfs_verify_agino_or_null(pag, ip->i_next_unlinked)); + + /* + * Since we're updating a linked list, we should never find that the + * current pointer is the same as the new value, unless we're + * terminating the list. + */ + if (ip->i_next_unlinked == next_agino) { + if (next_agino != NULLAGINO) + return -EFSCORRUPTED; + return 0; + } + + iup = kmem_cache_zalloc(xfs_iunlink_cache, GFP_KERNEL | __GFP_NOFAIL); + xfs_log_item_init(mp, &iup->item, XFS_LI_IUNLINK, + &xfs_iunlink_item_ops); + + iup->ip = ip; + iup->next_agino = next_agino; + iup->old_agino = ip->i_next_unlinked; + + atomic_inc(&pag->pag_ref); + iup->pag = pag; + + xfs_trans_add_item(tp, &iup->item); + tp->t_flags |= XFS_TRANS_DIRTY; + set_bit(XFS_LI_DIRTY, &iup->item.li_flags); + return 0; +} + diff --git a/fs/xfs/xfs_iunlink_item.h b/fs/xfs/xfs_iunlink_item.h new file mode 100644 index 000000000000..c793cdcaccde --- /dev/null +++ b/fs/xfs/xfs_iunlink_item.h @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2020-2022, Red Hat, Inc. + * All Rights Reserved. + */ +#ifndef XFS_IUNLINK_ITEM_H +#define XFS_IUNLINK_ITEM_H 1 + +struct xfs_trans; +struct xfs_inode; +struct xfs_perag; + +/* in memory log item structure */ +struct xfs_iunlink_item { + struct xfs_log_item item; + struct xfs_inode *ip; + struct xfs_perag *pag; + xfs_agino_t next_agino; + xfs_agino_t old_agino; +}; + +extern struct kmem_cache *xfs_iunlink_cache; + +int xfs_iunlink_log_inode(struct xfs_trans *tp, struct xfs_inode *ip, + struct xfs_perag *pag, xfs_agino_t next_agino); + +#endif /* XFS_IUNLINK_ITEM_H */ diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index aa977c7ea370..315cca5f9322 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -40,6 +40,7 @@ #include "xfs_defer.h" #include "xfs_attr_item.h" #include "xfs_xattr.h" +#include "xfs_iunlink_item.h" #include #include @@ -2096,8 +2097,16 @@ xfs_init_caches(void) if (!xfs_attri_cache) goto out_destroy_attrd_cache; + xfs_iunlink_cache = kmem_cache_create("xfs_iul_item", + sizeof(struct xfs_iunlink_item), + 0, 0, NULL); + if (!xfs_iunlink_cache) + goto out_destroy_attri_cache; + return 0; + out_destroy_attri_cache: + kmem_cache_destroy(xfs_attri_cache); out_destroy_attrd_cache: kmem_cache_destroy(xfs_attrd_cache); out_destroy_bui_cache: @@ -2148,6 +2157,7 @@ xfs_destroy_caches(void) * destroy caches. */ rcu_barrier(); + kmem_cache_destroy(xfs_iunlink_cache); kmem_cache_destroy(xfs_attri_cache); kmem_cache_destroy(xfs_attrd_cache); kmem_cache_destroy(xfs_bui_cache); -- cgit v1.2.3 From de67dc575434dca8d60b1e181ed5dd296392ffce Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 14 Jul 2022 12:02:46 +1000 Subject: xfs: break up xfs_buf_find() into individual pieces xfs_buf_find() is made up of three main parts: lookup, insert and locking. The interactions with xfs_buf_get_map() require it to be called twice - once for a pure lookup, and again on lookup failure so the insert path can be run. We want to simplify this down a lot, so split it into a fast path lookup, a slow path insert and a "lock the found buffer" helper. This will then let us integrate these operations more effectively into xfs_buf_get_map() in future patches. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong --- fs/xfs/xfs_buf.c | 158 ++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 104 insertions(+), 54 deletions(-) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 143e1c70df5d..9a44048d352b 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -503,6 +503,98 @@ xfs_buf_hash_destroy( rhashtable_destroy(&pag->pag_buf_hash); } +static int +xfs_buf_map_verify( + struct xfs_buftarg *btp, + struct xfs_buf_map *map) +{ + xfs_daddr_t eofs; + + /* Check for IOs smaller than the sector size / not sector aligned */ + ASSERT(!(BBTOB(map->bm_len) < btp->bt_meta_sectorsize)); + ASSERT(!(BBTOB(map->bm_bn) & (xfs_off_t)btp->bt_meta_sectormask)); + + /* + * Corrupted block numbers can get through to here, unfortunately, so we + * have to check that the buffer falls within the filesystem bounds. + */ + eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks); + if (map->bm_bn < 0 || map->bm_bn >= eofs) { + xfs_alert(btp->bt_mount, + "%s: daddr 0x%llx out of range, EOFS 0x%llx", + __func__, map->bm_bn, eofs); + WARN_ON(1); + return -EFSCORRUPTED; + } + return 0; +} + +static int +xfs_buf_find_lock( + struct xfs_buf *bp, + xfs_buf_flags_t flags) +{ + if (!xfs_buf_trylock(bp)) { + if (flags & XBF_TRYLOCK) { + XFS_STATS_INC(bp->b_mount, xb_busy_locked); + xfs_buf_rele(bp); + return -EAGAIN; + } + xfs_buf_lock(bp); + XFS_STATS_INC(bp->b_mount, xb_get_locked_waited); + } + + /* + * if the buffer is stale, clear all the external state associated with + * it. We need to keep flags such as how we allocated the buffer memory + * intact here. + */ + if (bp->b_flags & XBF_STALE) { + ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0); + bp->b_flags &= _XBF_KMEM | _XBF_PAGES; + bp->b_ops = NULL; + } + return 0; +} + +static inline struct xfs_buf * +xfs_buf_lookup( + struct xfs_perag *pag, + struct xfs_buf_map *map) +{ + struct xfs_buf *bp; + + bp = rhashtable_lookup(&pag->pag_buf_hash, map, xfs_buf_hash_params); + if (!bp) + return NULL; + atomic_inc(&bp->b_hold); + return bp; +} + +/* + * Insert the new_bp into the hash table. This consumes the perag reference + * taken for the lookup. + */ +static int +xfs_buf_find_insert( + struct xfs_buftarg *btp, + struct xfs_perag *pag, + struct xfs_buf *new_bp) +{ + /* No match found */ + if (!new_bp) { + xfs_perag_put(pag); + XFS_STATS_INC(btp->bt_mount, xb_miss_locked); + return -ENOENT; + } + + /* the buffer keeps the perag reference until it is freed */ + new_bp->b_pag = pag; + rhashtable_insert_fast(&pag->pag_buf_hash, &new_bp->b_rhash_head, + xfs_buf_hash_params); + return 0; +} + /* * Look up a buffer in the buffer cache and return it referenced and locked * in @found_bp. @@ -533,7 +625,7 @@ xfs_buf_find( struct xfs_perag *pag; struct xfs_buf *bp; struct xfs_buf_map cmap = { .bm_bn = map[0].bm_bn }; - xfs_daddr_t eofs; + int error; int i; *found_bp = NULL; @@ -541,47 +633,22 @@ xfs_buf_find( for (i = 0; i < nmaps; i++) cmap.bm_len += map[i].bm_len; - /* Check for IOs smaller than the sector size / not sector aligned */ - ASSERT(!(BBTOB(cmap.bm_len) < btp->bt_meta_sectorsize)); - ASSERT(!(BBTOB(cmap.bm_bn) & (xfs_off_t)btp->bt_meta_sectormask)); - - /* - * Corrupted block numbers can get through to here, unfortunately, so we - * have to check that the buffer falls within the filesystem bounds. - */ - eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks); - if (cmap.bm_bn < 0 || cmap.bm_bn >= eofs) { - xfs_alert(btp->bt_mount, - "%s: daddr 0x%llx out of range, EOFS 0x%llx", - __func__, cmap.bm_bn, eofs); - WARN_ON(1); - return -EFSCORRUPTED; - } + error = xfs_buf_map_verify(btp, &cmap); + if (error) + return error; pag = xfs_perag_get(btp->bt_mount, xfs_daddr_to_agno(btp->bt_mount, cmap.bm_bn)); spin_lock(&pag->pag_buf_lock); - bp = rhashtable_lookup_fast(&pag->pag_buf_hash, &cmap, - xfs_buf_hash_params); - if (bp) { - atomic_inc(&bp->b_hold); + bp = xfs_buf_lookup(pag, &cmap); + if (bp) goto found; - } - - /* No match found */ - if (!new_bp) { - XFS_STATS_INC(btp->bt_mount, xb_miss_locked); - spin_unlock(&pag->pag_buf_lock); - xfs_perag_put(pag); - return -ENOENT; - } - /* the buffer keeps the perag reference until it is freed */ - new_bp->b_pag = pag; - rhashtable_insert_fast(&pag->pag_buf_hash, &new_bp->b_rhash_head, - xfs_buf_hash_params); + error = xfs_buf_find_insert(btp, pag, new_bp); spin_unlock(&pag->pag_buf_lock); + if (error) + return error; *found_bp = new_bp; return 0; @@ -589,26 +656,9 @@ found: spin_unlock(&pag->pag_buf_lock); xfs_perag_put(pag); - if (!xfs_buf_trylock(bp)) { - if (flags & XBF_TRYLOCK) { - xfs_buf_rele(bp); - XFS_STATS_INC(btp->bt_mount, xb_busy_locked); - return -EAGAIN; - } - xfs_buf_lock(bp); - XFS_STATS_INC(btp->bt_mount, xb_get_locked_waited); - } - - /* - * if the buffer is stale, clear all the external state associated with - * it. We need to keep flags such as how we allocated the buffer memory - * intact here. - */ - if (bp->b_flags & XBF_STALE) { - ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0); - bp->b_flags &= _XBF_KMEM | _XBF_PAGES; - bp->b_ops = NULL; - } + error = xfs_buf_find_lock(bp, flags); + if (error) + return error; trace_xfs_buf_find(bp, flags, _RET_IP_); XFS_STATS_INC(btp->bt_mount, xb_get_locked); -- cgit v1.2.3 From 348000804a0f4dea74219a927e081d6e7dee792f Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 14 Jul 2022 12:04:31 +1000 Subject: xfs: merge xfs_buf_find() and xfs_buf_get_map() Now that we factored xfs_buf_find(), we can start separating into distinct fast and slow paths from xfs_buf_get_map(). We start by moving the lookup map and perag setup to _get_map(), and then move all the specifics of the fast path lookup into xfs_buf_lookup() and call it directly from _get_map(). We the move all the slow path code to xfs_buf_find_insert(), which is now also called directly from _get_map(). As such, xfs_buf_find() now goes away. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong --- fs/xfs/xfs_buf.c | 202 +++++++++++++++++++++++++------------------------------ 1 file changed, 93 insertions(+), 109 deletions(-) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 9a44048d352b..81ca951b451a 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -537,7 +537,6 @@ xfs_buf_find_lock( if (!xfs_buf_trylock(bp)) { if (flags & XBF_TRYLOCK) { XFS_STATS_INC(bp->b_mount, xb_busy_locked); - xfs_buf_rele(bp); return -EAGAIN; } xfs_buf_lock(bp); @@ -557,113 +556,97 @@ xfs_buf_find_lock( return 0; } -static inline struct xfs_buf * +static inline int xfs_buf_lookup( struct xfs_perag *pag, - struct xfs_buf_map *map) + struct xfs_buf_map *map, + xfs_buf_flags_t flags, + struct xfs_buf **bpp) { struct xfs_buf *bp; + int error; + spin_lock(&pag->pag_buf_lock); bp = rhashtable_lookup(&pag->pag_buf_hash, map, xfs_buf_hash_params); - if (!bp) - return NULL; + if (!bp) { + spin_unlock(&pag->pag_buf_lock); + return -ENOENT; + } atomic_inc(&bp->b_hold); - return bp; -} + spin_unlock(&pag->pag_buf_lock); -/* - * Insert the new_bp into the hash table. This consumes the perag reference - * taken for the lookup. - */ -static int -xfs_buf_find_insert( - struct xfs_buftarg *btp, - struct xfs_perag *pag, - struct xfs_buf *new_bp) -{ - /* No match found */ - if (!new_bp) { - xfs_perag_put(pag); - XFS_STATS_INC(btp->bt_mount, xb_miss_locked); - return -ENOENT; + error = xfs_buf_find_lock(bp, flags); + if (error) { + xfs_buf_rele(bp); + return error; } - /* the buffer keeps the perag reference until it is freed */ - new_bp->b_pag = pag; - rhashtable_insert_fast(&pag->pag_buf_hash, &new_bp->b_rhash_head, - xfs_buf_hash_params); + trace_xfs_buf_find(bp, flags, _RET_IP_); + *bpp = bp; return 0; } /* - * Look up a buffer in the buffer cache and return it referenced and locked - * in @found_bp. - * - * If @new_bp is supplied and we have a lookup miss, insert @new_bp into the - * cache. - * - * If XBF_TRYLOCK is set in @flags, only try to lock the buffer and return - * -EAGAIN if we fail to lock it. - * - * Return values are: - * -EFSCORRUPTED if have been supplied with an invalid address - * -EAGAIN on trylock failure - * -ENOENT if we fail to find a match and @new_bp was NULL - * 0, with @found_bp: - * - @new_bp if we inserted it into the cache - * - the buffer we found and locked. + * Insert the new_bp into the hash table. This consumes the perag reference + * taken for the lookup regardless of the result of the insert. */ static int -xfs_buf_find( +xfs_buf_find_insert( struct xfs_buftarg *btp, + struct xfs_perag *pag, + struct xfs_buf_map *cmap, struct xfs_buf_map *map, int nmaps, xfs_buf_flags_t flags, - struct xfs_buf *new_bp, - struct xfs_buf **found_bp) + struct xfs_buf **bpp) { - struct xfs_perag *pag; + struct xfs_buf *new_bp; struct xfs_buf *bp; - struct xfs_buf_map cmap = { .bm_bn = map[0].bm_bn }; int error; - int i; - - *found_bp = NULL; - - for (i = 0; i < nmaps; i++) - cmap.bm_len += map[i].bm_len; - error = xfs_buf_map_verify(btp, &cmap); + error = _xfs_buf_alloc(btp, map, nmaps, flags, &new_bp); if (error) - return error; + goto out_drop_pag; - pag = xfs_perag_get(btp->bt_mount, - xfs_daddr_to_agno(btp->bt_mount, cmap.bm_bn)); + /* + * For buffers that fit entirely within a single page, first attempt to + * allocate the memory from the heap to minimise memory usage. If we + * can't get heap memory for these small buffers, we fall back to using + * the page allocator. + */ + if (BBTOB(new_bp->b_length) >= PAGE_SIZE || + xfs_buf_alloc_kmem(new_bp, flags) < 0) { + error = xfs_buf_alloc_pages(new_bp, flags); + if (error) + goto out_free_buf; + } spin_lock(&pag->pag_buf_lock); - bp = xfs_buf_lookup(pag, &cmap); - if (bp) - goto found; + bp = rhashtable_lookup(&pag->pag_buf_hash, cmap, xfs_buf_hash_params); + if (bp) { + atomic_inc(&bp->b_hold); + spin_unlock(&pag->pag_buf_lock); + error = xfs_buf_find_lock(bp, flags); + if (error) + xfs_buf_rele(bp); + else + *bpp = bp; + goto out_free_buf; + } - error = xfs_buf_find_insert(btp, pag, new_bp); + /* The buffer keeps the perag reference until it is freed. */ + new_bp->b_pag = pag; + rhashtable_insert_fast(&pag->pag_buf_hash, &new_bp->b_rhash_head, + xfs_buf_hash_params); spin_unlock(&pag->pag_buf_lock); - if (error) - return error; - *found_bp = new_bp; + *bpp = new_bp; return 0; -found: - spin_unlock(&pag->pag_buf_lock); +out_free_buf: + xfs_buf_free(new_bp); +out_drop_pag: xfs_perag_put(pag); - - error = xfs_buf_find_lock(bp, flags); - if (error) - return error; - - trace_xfs_buf_find(bp, flags, _RET_IP_); - XFS_STATS_INC(btp->bt_mount, xb_get_locked); - *found_bp = bp; - return 0; + return error; } /* @@ -673,54 +656,54 @@ found: */ int xfs_buf_get_map( - struct xfs_buftarg *target, + struct xfs_buftarg *btp, struct xfs_buf_map *map, int nmaps, xfs_buf_flags_t flags, struct xfs_buf **bpp) { - struct xfs_buf *bp; - struct xfs_buf *new_bp; + struct xfs_perag *pag; + struct xfs_buf *bp = NULL; + struct xfs_buf_map cmap = { .bm_bn = map[0].bm_bn }; int error; + int i; - *bpp = NULL; - error = xfs_buf_find(target, map, nmaps, flags, NULL, &bp); - if (!error) - goto found; - if (error != -ENOENT) - return error; - if (flags & XBF_INCORE) - return -ENOENT; + for (i = 0; i < nmaps; i++) + cmap.bm_len += map[i].bm_len; - error = _xfs_buf_alloc(target, map, nmaps, flags, &new_bp); + error = xfs_buf_map_verify(btp, &cmap); if (error) return error; - /* - * For buffers that fit entirely within a single page, first attempt to - * allocate the memory from the heap to minimise memory usage. If we - * can't get heap memory for these small buffers, we fall back to using - * the page allocator. - */ - if (BBTOB(new_bp->b_length) >= PAGE_SIZE || - xfs_buf_alloc_kmem(new_bp, flags) < 0) { - error = xfs_buf_alloc_pages(new_bp, flags); - if (error) - goto out_free_buf; - } + pag = xfs_perag_get(btp->bt_mount, + xfs_daddr_to_agno(btp->bt_mount, cmap.bm_bn)); - error = xfs_buf_find(target, map, nmaps, flags, new_bp, &bp); - if (error) - goto out_free_buf; + error = xfs_buf_lookup(pag, &cmap, flags, &bp); + if (error && error != -ENOENT) + goto out_put_perag; - if (bp != new_bp) - xfs_buf_free(new_bp); + /* cache hits always outnumber misses by at least 10:1 */ + if (unlikely(!bp)) { + XFS_STATS_INC(btp->bt_mount, xb_miss_locked); + + if (flags & XBF_INCORE) + goto out_put_perag; + + /* xfs_buf_find_insert() consumes the perag reference. */ + error = xfs_buf_find_insert(btp, pag, &cmap, map, nmaps, + flags, &bp); + if (error) + return error; + } else { + XFS_STATS_INC(btp->bt_mount, xb_get_locked); + xfs_perag_put(pag); + } -found: + /* We do not hold a perag reference anymore. */ if (!bp->b_addr) { error = _xfs_buf_map_pages(bp, flags); if (unlikely(error)) { - xfs_warn_ratelimited(target->bt_mount, + xfs_warn_ratelimited(btp->bt_mount, "%s: failed to map %u pages", __func__, bp->b_page_count); xfs_buf_relse(bp); @@ -735,12 +718,13 @@ found: if (!(flags & XBF_READ)) xfs_buf_ioerror(bp, 0); - XFS_STATS_INC(target->bt_mount, xb_get); + XFS_STATS_INC(btp->bt_mount, xb_get); trace_xfs_buf_get(bp, flags, _RET_IP_); *bpp = bp; return 0; -out_free_buf: - xfs_buf_free(new_bp); + +out_put_perag: + xfs_perag_put(pag); return error; } -- cgit v1.2.3 From d8d9bbb0ee6c79191b704d88c8ae712b89e0d2bb Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 14 Jul 2022 12:04:38 +1000 Subject: xfs: reduce the number of atomic when locking a buffer after lookup Avoid an extra atomic operation in the non-trylock case by only doing a trylock if the XBF_TRYLOCK flag is set. This follows the pattern in the IO path with NOWAIT semantics where the "trylock-fail-lock" path showed 5-10% reduced throughput compared to just using single lock call when not under NOWAIT conditions. So make that same change here, too. See commit 942491c9e6d6 ("xfs: fix AIM7 regression") for details. Signed-off-by: Dave Chinner [hch: split from a larger patch] Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong --- fs/xfs/xfs_buf.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 81ca951b451a..374c4e508b12 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -534,11 +534,12 @@ xfs_buf_find_lock( struct xfs_buf *bp, xfs_buf_flags_t flags) { - if (!xfs_buf_trylock(bp)) { - if (flags & XBF_TRYLOCK) { + if (flags & XBF_TRYLOCK) { + if (!xfs_buf_trylock(bp)) { XFS_STATS_INC(bp->b_mount, xb_busy_locked); return -EAGAIN; } + } else { xfs_buf_lock(bp); XFS_STATS_INC(bp->b_mount, xb_get_locked_waited); } -- cgit v1.2.3 From 32dd4f9c506b1bf147c24cf05423cd893bc06e38 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 14 Jul 2022 12:04:43 +1000 Subject: xfs: remove a superflous hash lookup when inserting new buffers Currently on the slow path insert we repeat the initial hash table lookup before we attempt the insert, resulting in a two traversals of the hash table to ensure the insert is valid. The rhashtable API provides a method for an atomic lookup and insert operation, so we can avoid one of the hash table traversals by using this method. Adapted from a large patch containing this optimisation by Christoph Hellwig. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong --- fs/xfs/xfs_buf.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 374c4e508b12..1a6542e01bec 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -623,8 +623,15 @@ xfs_buf_find_insert( } spin_lock(&pag->pag_buf_lock); - bp = rhashtable_lookup(&pag->pag_buf_hash, cmap, xfs_buf_hash_params); + bp = rhashtable_lookup_get_insert_fast(&pag->pag_buf_hash, + &new_bp->b_rhash_head, xfs_buf_hash_params); + if (IS_ERR(bp)) { + error = PTR_ERR(bp); + spin_unlock(&pag->pag_buf_lock); + goto out_free_buf; + } if (bp) { + /* found an existing buffer */ atomic_inc(&bp->b_hold); spin_unlock(&pag->pag_buf_lock); error = xfs_buf_find_lock(bp, flags); @@ -635,10 +642,8 @@ xfs_buf_find_insert( goto out_free_buf; } - /* The buffer keeps the perag reference until it is freed. */ + /* The new buffer keeps the perag reference until it is freed. */ new_bp->b_pag = pag; - rhashtable_insert_fast(&pag->pag_buf_hash, &new_bp->b_rhash_head, - xfs_buf_hash_params); spin_unlock(&pag->pag_buf_lock); *bpp = new_bp; return 0; -- cgit v1.2.3 From 298f342245066309189d8637ca7339d56840c3e1 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 14 Jul 2022 12:05:07 +1000 Subject: xfs: lockless buffer lookup Now that we have a standalone fast path for buffer lookup, we can easily convert it to use rcu lookups. When we continually hammer the buffer cache with trylock lookups, we end up with a huge amount of lock contention on the per-ag buffer hash locks: - 92.71% 0.05% [kernel] [k] xfs_inodegc_worker - 92.67% xfs_inodegc_worker - 92.13% xfs_inode_unlink - 91.52% xfs_inactive_ifree - 85.63% xfs_read_agi - 85.61% xfs_trans_read_buf_map - 85.59% xfs_buf_read_map - xfs_buf_get_map - 85.55% xfs_buf_find - 72.87% _raw_spin_lock - do_raw_spin_lock 71.86% __pv_queued_spin_lock_slowpath - 8.74% xfs_buf_rele - 7.88% _raw_spin_lock - 7.88% do_raw_spin_lock 7.63% __pv_queued_spin_lock_slowpath - 1.70% xfs_buf_trylock - 1.68% down_trylock - 1.41% _raw_spin_lock_irqsave - 1.39% do_raw_spin_lock __pv_queued_spin_lock_slowpath - 0.76% _raw_spin_unlock 0.75% do_raw_spin_unlock This is basically hammering the pag->pag_buf_lock from lots of CPUs doing trylocks at the same time. Most of the buffer trylock operations ultimately fail after we've done the lookup, so we're really hammering the buf hash lock whilst making no progress. We can also see significant spinlock traffic on the same lock just under normal operation when lots of tasks are accessing metadata from the same AG, so let's avoid all this by converting the lookup fast path to leverages the rhashtable's ability to do rcu protected lookups. We avoid races with the buffer release path by using atomic_inc_not_zero() on the buffer hold count. Any buffer that is in the LRU will have a non-zero count, thereby allowing the lockless fast path to be taken in most cache hit situations. If the buffer hold count is zero, then it is likely going through the release path so in that case we fall back to the existing lookup miss slow path. The slow path will then do an atomic lookup and insert under the buffer hash lock and hence serialise correctly against buffer release freeing the buffer. The use of rcu protected lookups means that buffer handles now need to be freed by RCU callbacks (same as inodes). We still free the buffer pages before the RCU callback - we won't be trying to access them at all on a buffer that has zero references - but we need the buffer handle itself to be present for the entire rcu protected read side to detect a zero hold count correctly. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong --- fs/xfs/xfs_buf.c | 22 +++++++++++++++------- fs/xfs/xfs_buf.h | 1 + 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 1a6542e01bec..6dac5583977f 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -294,6 +294,16 @@ xfs_buf_free_pages( bp->b_flags &= ~_XBF_PAGES; } +static void +xfs_buf_free_callback( + struct callback_head *cb) +{ + struct xfs_buf *bp = container_of(cb, struct xfs_buf, b_rcu); + + xfs_buf_free_maps(bp); + kmem_cache_free(xfs_buf_cache, bp); +} + static void xfs_buf_free( struct xfs_buf *bp) @@ -307,8 +317,7 @@ xfs_buf_free( else if (bp->b_flags & _XBF_KMEM) kmem_free(bp->b_addr); - xfs_buf_free_maps(bp); - kmem_cache_free(xfs_buf_cache, bp); + call_rcu(&bp->b_rcu, xfs_buf_free_callback); } static int @@ -567,14 +576,13 @@ xfs_buf_lookup( struct xfs_buf *bp; int error; - spin_lock(&pag->pag_buf_lock); + rcu_read_lock(); bp = rhashtable_lookup(&pag->pag_buf_hash, map, xfs_buf_hash_params); - if (!bp) { - spin_unlock(&pag->pag_buf_lock); + if (!bp || !atomic_inc_not_zero(&bp->b_hold)) { + rcu_read_unlock(); return -ENOENT; } - atomic_inc(&bp->b_hold); - spin_unlock(&pag->pag_buf_lock); + rcu_read_unlock(); error = xfs_buf_find_lock(bp, flags); if (error) { diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 58e9034d51bd..02b3c1635ec3 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -196,6 +196,7 @@ struct xfs_buf { int b_last_error; const struct xfs_buf_ops *b_ops; + struct rcu_head b_rcu; }; /* Finding and Reading Buffers */ -- cgit v1.2.3 From 95ff0363f3f6ae70c21a0f2b0603e54438e5988b Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 14 Jul 2022 09:22:53 -0700 Subject: xfs: fix use-after-free in xattr node block inactivation The kernel build robot reported a UAF error while running xfs/433 (edited somewhat for brevity): BUG: KASAN: use-after-free in xfs_attr3_node_inactive (fs/xfs/xfs_attr_inactive.c:214) xfs Read of size 4 at addr ffff88820ac2bd44 by task kworker/0:2/139 CPU: 0 PID: 139 Comm: kworker/0:2 Tainted: G S 5.19.0-rc2-00004-g7cf2b0f9611b #1 Hardware name: Hewlett-Packard p6-1451cx/2ADA, BIOS 8.15 02/05/2013 Workqueue: xfs-inodegc/sdb4 xfs_inodegc_worker [xfs] Call Trace: dump_stack_lvl (lib/dump_stack.c:107 (discriminator 1)) print_address_description+0x1f/0x200 print_report.cold (mm/kasan/report.c:430) kasan_report (mm/kasan/report.c:162 mm/kasan/report.c:493) xfs_attr3_node_inactive (fs/xfs/xfs_attr_inactive.c:214) xfs xfs_attr3_root_inactive (fs/xfs/xfs_attr_inactive.c:296) xfs xfs_attr_inactive (fs/xfs/xfs_attr_inactive.c:371) xfs xfs_inactive (fs/xfs/xfs_inode.c:1781) xfs xfs_inodegc_worker (fs/xfs/xfs_icache.c:1837 fs/xfs/xfs_icache.c:1860) xfs process_one_work worker_thread kthread ret_from_fork Allocated by task 139: kasan_save_stack (mm/kasan/common.c:39) __kasan_slab_alloc (mm/kasan/common.c:45 mm/kasan/common.c:436 mm/kasan/common.c:469) kmem_cache_alloc (mm/slab.h:750 mm/slub.c:3214 mm/slub.c:3222 mm/slub.c:3229 mm/slub.c:3239) _xfs_buf_alloc (include/linux/instrumented.h:86 include/linux/atomic/atomic-instrumented.h:41 fs/xfs/xfs_buf.c:232) xfs xfs_buf_get_map (fs/xfs/xfs_buf.c:660) xfs xfs_buf_read_map (fs/xfs/xfs_buf.c:777) xfs xfs_trans_read_buf_map (fs/xfs/xfs_trans_buf.c:289) xfs xfs_da_read_buf (fs/xfs/libxfs/xfs_da_btree.c:2652) xfs xfs_da3_node_read (fs/xfs/libxfs/xfs_da_btree.c:392) xfs xfs_attr3_root_inactive (fs/xfs/xfs_attr_inactive.c:272) xfs xfs_attr_inactive (fs/xfs/xfs_attr_inactive.c:371) xfs xfs_inactive (fs/xfs/xfs_inode.c:1781) xfs xfs_inodegc_worker (fs/xfs/xfs_icache.c:1837 fs/xfs/xfs_icache.c:1860) xfs process_one_work worker_thread kthread ret_from_fork Freed by task 139: kasan_save_stack (mm/kasan/common.c:39) kasan_set_track (mm/kasan/common.c:45) kasan_set_free_info (mm/kasan/generic.c:372) __kasan_slab_free (mm/kasan/common.c:368 mm/kasan/common.c:328 mm/kasan/common.c:374) kmem_cache_free (mm/slub.c:1753 mm/slub.c:3507 mm/slub.c:3524) xfs_buf_rele (fs/xfs/xfs_buf.c:1040) xfs xfs_attr3_node_inactive (fs/xfs/xfs_attr_inactive.c:210) xfs xfs_attr3_root_inactive (fs/xfs/xfs_attr_inactive.c:296) xfs xfs_attr_inactive (fs/xfs/xfs_attr_inactive.c:371) xfs xfs_inactive (fs/xfs/xfs_inode.c:1781) xfs xfs_inodegc_worker (fs/xfs/xfs_icache.c:1837 fs/xfs/xfs_icache.c:1860) xfs process_one_work worker_thread kthread ret_from_fork I reproduced this for my own satisfaction, and got the same report, along with an extra morsel: The buggy address belongs to the object at ffff88802103a800 which belongs to the cache xfs_buf of size 432 The buggy address is located 396 bytes inside of 432-byte region [ffff88802103a800, ffff88802103a9b0) I tracked this code down to: error = xfs_trans_get_buf(*trans, mp->m_ddev_targp, child_blkno, XFS_FSB_TO_BB(mp, mp->m_attr_geo->fsbcount), 0, &child_bp); if (error) return error; error = bp->b_error; That doesn't look right -- I think this should be dereferencing child_bp, not bp. Looking through the codebase history, I think this was added by commit 2911edb653b9 ("xfs: remove the mappedbno argument to xfs_da_get_buf"), which replaced a call to xfs_da_get_buf with the current call to xfs_trans_get_buf. Not sure why we trans_brelse'd @bp earlier in the function, but I'm guessing it's to avoid pinning too many buffers in memory while we inactivate the bottom of the attr tree. Hence we now have to get the buffer back. I /think/ this was supposed to check child_bp->b_error and fail the rest of the invalidation if child_bp had experienced any kind of IO or corruption error. I bet the xfs_da3_node_read earlier in the loop will catch most cases of incoming on-disk corruption which makes this check mostly moot unless someone corrupts the buffer and the AIL pushes it out to disk while the buffer's unlocked. In the first case we'll never get to the bad check, and in the second case the AIL will shut down the log, at which point there's no reason to check b_error. Remove the check, and null out @bp to avoid this problem in the future. Cc: hch@lst.de Reported-by: kernel test robot Fixes: 2911edb653b9 ("xfs: remove the mappedbno argument to xfs_da_get_buf") Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_attr_inactive.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c index 0e83cab9cdde..895ba8b7a26c 100644 --- a/fs/xfs/xfs_attr_inactive.c +++ b/fs/xfs/xfs_attr_inactive.c @@ -158,6 +158,7 @@ xfs_attr3_node_inactive( } child_fsb = be32_to_cpu(ichdr.btree[0].before); xfs_trans_brelse(*trans, bp); /* no locks for later trans */ + bp = NULL; /* * If this is the node level just above the leaves, simply loop @@ -211,12 +212,8 @@ xfs_attr3_node_inactive( &child_bp); if (error) return error; - error = bp->b_error; - if (error) { - xfs_trans_brelse(*trans, child_bp); - return error; - } xfs_trans_binval(*trans, child_bp); + child_bp = NULL; /* * If we're not done, re-read the parent to get the next @@ -233,6 +230,7 @@ xfs_attr3_node_inactive( bp->b_addr); child_fsb = be32_to_cpu(phdr.btree[i + 1].before); xfs_trans_brelse(*trans, bp); + bp = NULL; } /* * Atomically commit the whole invalidate stuff. -- cgit v1.2.3 From fdbae121b4369fe49eb5f8efbd23604ab4c50116 Mon Sep 17 00:00:00 2001 From: Xiaole He Date: Mon, 18 Jul 2022 10:13:47 -0700 Subject: xfs: fix comment for start time value of inode with bigtime enabled The 'ctime', 'mtime', and 'atime' for inode is the type of 'xfs_timestamp_t', which is a 64-bit type: /* fs/xfs/libxfs/xfs_format.h begin */ typedef __be64 xfs_timestamp_t; /* fs/xfs/libxfs/xfs_format.h end */ When the 'bigtime' feature is disabled, this 64-bit type is splitted into two parts of 32-bit, one part is encoded for seconds since 1970-01-01 00:00:00 UTC, the other part is encoded for nanoseconds above the seconds, this two parts are the type of 'xfs_legacy_timestamp' and the min and max time value of this type are defined as macros 'XFS_LEGACY_TIME_MIN' and 'XFS_LEGACY_TIME_MAX': /* fs/xfs/libxfs/xfs_format.h begin */ struct xfs_legacy_timestamp { __be32 t_sec; /* timestamp seconds */ __be32 t_nsec; /* timestamp nanoseconds */ }; #define XFS_LEGACY_TIME_MIN ((int64_t)S32_MIN) #define XFS_LEGACY_TIME_MAX ((int64_t)S32_MAX) /* fs/xfs/libxfs/xfs_format.h end */ /* include/linux/limits.h begin */ #define U32_MAX ((u32)~0U) #define S32_MAX ((s32)(U32_MAX >> 1)) #define S32_MIN ((s32)(-S32_MAX - 1)) /* include/linux/limits.h end */ 'XFS_LEGACY_TIME_MIN' is the min time value of the 'xfs_legacy_timestamp', that is -(2^31) seconds relative to the 1970-01-01 00:00:00 UTC, it can be converted to human-friendly time value by 'date' command: /* command begin */ [root@~]# date --utc -d '@0' +'%Y-%m-%d %H:%M:%S' 1970-01-01 00:00:00 [root@~]# date --utc -d "@`echo '-(2^31)'|bc`" +'%Y-%m-%d %H:%M:%S' 1901-12-13 20:45:52 [root@~]# /* command end */ When 'bigtime' feature is enabled, this 64-bit type becomes a 64-bit nanoseconds counter, with the start time value is the min time value of 'xfs_legacy_timestamp'(start time means the value of 64-bit nanoseconds counter is 0). We have already caculated the min time value of 'xfs_legacy_timestamp', that is 1901-12-13 20:45:52 UTC, but the comment for the start time value of inode with 'bigtime' feature enabled writes the value is 1901-12-31 20:45:52 UTC: /* fs/xfs/libxfs/xfs_format.h begin */ /* * XFS Timestamps * ============== * When the bigtime feature is enabled, ondisk inode timestamps become an * unsigned 64-bit nanoseconds counter. This means that the bigtime inode * timestamp epoch is the start of the classic timestamp range, which is * Dec 31 20:45:52 UTC 1901. ... ... */ /* fs/xfs/libxfs/xfs_format.h end */ That is a typo, and this patch corrects the typo, from 'Dec 31' to 'Dec 13'. Suggested-by: Darrick J. Wong Signed-off-by: Xiaole He Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_format.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index afdfc8108c5f..b55bdfa9c8a8 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -704,7 +704,7 @@ struct xfs_agfl { * When the bigtime feature is enabled, ondisk inode timestamps become an * unsigned 64-bit nanoseconds counter. This means that the bigtime inode * timestamp epoch is the start of the classic timestamp range, which is - * Dec 31 20:45:52 UTC 1901. Because the epochs are not the same, callers + * Dec 13 20:45:52 UTC 1901. Because the epochs are not the same, callers * /must/ use the bigtime conversion functions when encoding and decoding raw * timestamps. */ -- cgit v1.2.3 From 3f52e016af600982989b5dee958d313c52483c92 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 18 Jul 2022 10:13:48 -0700 Subject: xfs: delete unnecessary NULL checks These NULL check are no long needed after commit 2ed5b09b3e8f ("xfs: make inode attribute forks a permanent part of struct xfs_inode"). Signed-off-by: Dan Carpenter Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_inode_fork.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 7c34184448e6..fa699f3792bf 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -724,8 +724,7 @@ xfs_ifork_verify_local_attr( if (fa) { xfs_inode_verifier_error(ip, -EFSCORRUPTED, "attr fork", - ifp ? ifp->if_u1.if_data : NULL, - ifp ? ifp->if_bytes : 0, fa); + ifp->if_u1.if_data, ifp->if_bytes, fa); return -EFSCORRUPTED; } -- cgit v1.2.3 From 231f91ab504ecebcb88e942341b3d7dd91de45f1 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 18 Jul 2022 18:20:37 -0700 Subject: xfs: xfs_buf cache destroy isn't RCU safe Darrick and Sachin Sant reported that xfs/435 and xfs/436 would report an non-empty xfs_buf slab on module remove. This isn't easily to reproduce, but is clearly a side effect of converting the buffer caceh to RUC freeing and lockless lookups. Sachin bisected and Darrick hit it when testing the patchset directly. Turns out that the xfs_buf slab is not destroyed when all the other XFS slab caches are destroyed. Instead, it's got it's own little wrapper function that gets called separately, and so it doesn't have an rcu_barrier() call in it that is needed to drain all the rcu callbacks before the slab is destroyed. Fix it by removing the xfs_buf_init/terminate wrappers that just allocate and destroy the xfs_buf slab, and move them to the same place that all the other slab caches are set up and destroyed. Reported-and-tested-by: Sachin Sant Fixes: 298f34224506 ("xfs: lockless buffer lookup") Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_buf.c | 25 +------------------------ fs/xfs/xfs_buf.h | 6 ++---- fs/xfs/xfs_super.c | 22 +++++++++++++--------- 3 files changed, 16 insertions(+), 37 deletions(-) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 6dac5583977f..ecceef8b4af3 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -21,7 +21,7 @@ #include "xfs_error.h" #include "xfs_ag.h" -static struct kmem_cache *xfs_buf_cache; +struct kmem_cache *xfs_buf_cache; /* * Locking orders @@ -2308,29 +2308,6 @@ xfs_buf_delwri_pushbuf( return error; } -int __init -xfs_buf_init(void) -{ - xfs_buf_cache = kmem_cache_create("xfs_buf", sizeof(struct xfs_buf), 0, - SLAB_HWCACHE_ALIGN | - SLAB_RECLAIM_ACCOUNT | - SLAB_MEM_SPREAD, - NULL); - if (!xfs_buf_cache) - goto out; - - return 0; - - out: - return -ENOMEM; -} - -void -xfs_buf_terminate(void) -{ - kmem_cache_destroy(xfs_buf_cache); -} - void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref) { /* diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 02b3c1635ec3..549c60942208 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -15,6 +15,8 @@ #include #include +extern struct kmem_cache *xfs_buf_cache; + /* * Base types */ @@ -307,10 +309,6 @@ extern int xfs_buf_delwri_submit(struct list_head *); extern int xfs_buf_delwri_submit_nowait(struct list_head *); extern int xfs_buf_delwri_pushbuf(struct xfs_buf *, struct list_head *); -/* Buffer Daemon Setup Routines */ -extern int xfs_buf_init(void); -extern void xfs_buf_terminate(void); - static inline xfs_daddr_t xfs_buf_daddr(struct xfs_buf *bp) { return bp->b_maps[0].bm_bn; diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 4edee1d3784a..3d27ba1295c9 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1967,11 +1967,19 @@ xfs_init_caches(void) { int error; + xfs_buf_cache = kmem_cache_create("xfs_buf", sizeof(struct xfs_buf), 0, + SLAB_HWCACHE_ALIGN | + SLAB_RECLAIM_ACCOUNT | + SLAB_MEM_SPREAD, + NULL); + if (!xfs_buf_cache) + goto out; + xfs_log_ticket_cache = kmem_cache_create("xfs_log_ticket", sizeof(struct xlog_ticket), 0, 0, NULL); if (!xfs_log_ticket_cache) - goto out; + goto out_destroy_buf_cache; error = xfs_btree_init_cur_caches(); if (error) @@ -2145,6 +2153,8 @@ xfs_init_caches(void) xfs_btree_destroy_cur_caches(); out_destroy_log_ticket_cache: kmem_cache_destroy(xfs_log_ticket_cache); + out_destroy_buf_cache: + kmem_cache_destroy(xfs_buf_cache); out: return -ENOMEM; } @@ -2178,6 +2188,7 @@ xfs_destroy_caches(void) xfs_defer_destroy_item_caches(); xfs_btree_destroy_cur_caches(); kmem_cache_destroy(xfs_log_ticket_cache); + kmem_cache_destroy(xfs_buf_cache); } STATIC int __init @@ -2283,13 +2294,9 @@ init_xfs_fs(void) if (error) goto out_destroy_wq; - error = xfs_buf_init(); - if (error) - goto out_mru_cache_uninit; - error = xfs_init_procfs(); if (error) - goto out_buf_terminate; + goto out_mru_cache_uninit; error = xfs_sysctl_register(); if (error) @@ -2346,8 +2353,6 @@ init_xfs_fs(void) xfs_sysctl_unregister(); out_cleanup_procfs: xfs_cleanup_procfs(); - out_buf_terminate: - xfs_buf_terminate(); out_mru_cache_uninit: xfs_mru_cache_uninit(); out_destroy_wq: @@ -2373,7 +2378,6 @@ exit_xfs_fs(void) kset_unregister(xfs_kset); xfs_sysctl_unregister(); xfs_cleanup_procfs(); - xfs_buf_terminate(); xfs_mru_cache_uninit(); xfs_destroy_workqueues(); xfs_destroy_caches(); -- cgit v1.2.3 From 1a53d3d426411326c22e591f0dfa8958db56485a Mon Sep 17 00:00:00 2001 From: sunliming Date: Mon, 18 Jul 2022 18:59:03 -0700 Subject: xfs: fix for variable set but not used warning Fix below kernel warning: fs/xfs/scrub/repair.c:539:19: warning: variable 'agno' set but not used [-Wunused-but-set-variable] Reported-by: kernel test robot Signed-off-by: sunliming Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/scrub/repair.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index 5b5273505931..c18bd039fce9 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -536,14 +536,12 @@ xrep_reap_block( { struct xfs_btree_cur *cur; struct xfs_buf *agf_bp = NULL; - xfs_agnumber_t agno; xfs_agblock_t agbno; bool has_other_rmap; int error; - agno = XFS_FSB_TO_AGNO(sc->mp, fsbno); agbno = XFS_FSB_TO_AGBNO(sc->mp, fsbno); - ASSERT(agno == sc->sa.pag->pag_agno); + ASSERT(XFS_FSB_TO_AGNO(sc->mp, fsbno) == sc->sa.pag->pag_agno); /* * If we are repairing per-inode metadata, we need to read in the AGF -- cgit v1.2.3 From c78c2d0903183a41beb90c56a923e30f90fa91b9 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 19 Jul 2022 09:14:55 -0700 Subject: xfs: don't leak memory when attr fork loading fails I observed the following evidence of a memory leak while running xfs/399 from the xfs fsck test suite (edited for brevity): XFS (sde): Metadata corruption detected at xfs_attr_shortform_verify_struct.part.0+0x7b/0xb0 [xfs], inode 0x1172 attr fork XFS: Assertion failed: ip->i_af.if_u1.if_data == NULL, file: fs/xfs/libxfs/xfs_inode_fork.c, line: 315 ------------[ cut here ]------------ WARNING: CPU: 2 PID: 91635 at fs/xfs/xfs_message.c:104 assfail+0x46/0x4a [xfs] CPU: 2 PID: 91635 Comm: xfs_scrub Tainted: G W 5.19.0-rc7-xfsx #rc7 6e6475eb29fd9dda3181f81b7ca7ff961d277a40 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.15.0-1 04/01/2014 RIP: 0010:assfail+0x46/0x4a [xfs] Call Trace: xfs_ifork_zap_attr+0x7c/0xb0 xfs_iformat_attr_fork+0x86/0x110 xfs_inode_from_disk+0x41d/0x480 xfs_iget+0x389/0xd70 xfs_bulkstat_one_int+0x5b/0x540 xfs_bulkstat_iwalk+0x1e/0x30 xfs_iwalk_ag_recs+0xd1/0x160 xfs_iwalk_run_callbacks+0xb9/0x180 xfs_iwalk_ag+0x1d8/0x2e0 xfs_iwalk+0x141/0x220 xfs_bulkstat+0x105/0x180 xfs_ioc_bulkstat.constprop.0.isra.0+0xc5/0x130 xfs_file_ioctl+0xa5f/0xef0 __x64_sys_ioctl+0x82/0xa0 do_syscall_64+0x2b/0x80 entry_SYSCALL_64_after_hwframe+0x46/0xb0 This newly-added assertion checks that there aren't any incore data structures hanging off the incore fork when we're trying to reset its contents. From the call trace, it is evident that iget was trying to construct an incore inode from the ondisk inode, but the attr fork verifier failed and we were trying to undo all the memory allocations that we had done earlier. The three assertions in xfs_ifork_zap_attr check that the caller has already called xfs_idestroy_fork, which clearly has not been done here. As the zap function then zeroes the pointers, we've effectively leaked the memory. The shortest change would have been to insert an extra call to xfs_idestroy_fork, but it makes more sense to bundle the _idestroy_fork call into _zap_attr, since all other callsites call _idestroy_fork immediately prior to calling _zap_attr. IOWs, it eliminates one way to fail. Note: This change only applies cleanly to 2ed5b09b3e8f, since we just reworked the attr fork lifetime. However, I think this memory leak has existed since 0f45a1b20cd8, since the chain xfs_iformat_attr_fork -> xfs_iformat_local -> xfs_init_local_fork will allocate ifp->if_u1.if_data, but if xfs_ifork_verify_local_attr fails, xfs_iformat_attr_fork will free i_afp without freeing any of the stuff hanging off i_afp. The solution for older kernels I think is to add the missing call to xfs_idestroy_fork just prior to calling kmem_cache_free. Found by fuzzing a.sfattr.hdr.totsize = lastbit in xfs/399. Fixes: 2ed5b09b3e8f ("xfs: make inode attribute forks a permanent part of struct xfs_inode") Probably-Fixes: 0f45a1b20cd8 ("xfs: improve local fork verification") Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/libxfs/xfs_attr_leaf.c | 1 - fs/xfs/libxfs/xfs_inode_fork.c | 5 +---- fs/xfs/xfs_attr_inactive.c | 1 - fs/xfs/xfs_icache.c | 1 - 4 files changed, 1 insertion(+), 7 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 5bd554b88d99..beee51ad75ce 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -799,7 +799,6 @@ xfs_attr_fork_remove( { ASSERT(ip->i_af.if_nextents == 0); - xfs_idestroy_fork(&ip->i_af); xfs_ifork_zap_attr(ip); ip->i_forkoff = 0; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index fa699f3792bf..9327a4f39206 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -290,10 +290,7 @@ void xfs_ifork_zap_attr( struct xfs_inode *ip) { - ASSERT(ip->i_af.if_broot == NULL); - ASSERT(ip->i_af.if_u1.if_data == NULL); - ASSERT(ip->i_af.if_height == 0); - + xfs_idestroy_fork(&ip->i_af); memset(&ip->i_af, 0, sizeof(struct xfs_ifork)); ip->i_af.if_format = XFS_DINODE_FMT_EXTENTS; } diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c index 895ba8b7a26c..5db87b34fb6e 100644 --- a/fs/xfs/xfs_attr_inactive.c +++ b/fs/xfs/xfs_attr_inactive.c @@ -385,7 +385,6 @@ out_cancel: xfs_trans_cancel(trans); out_destroy_fork: /* kill the in-core attr fork before we drop the inode lock */ - xfs_idestroy_fork(&dp->i_af); xfs_ifork_zap_attr(dp); if (lock_mode) xfs_iunlock(dp, lock_mode); diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 1f07b5b8ba3f..e3b2304bb4d2 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -133,7 +133,6 @@ xfs_inode_free_callback( break; } - xfs_idestroy_fork(&ip->i_af); xfs_ifork_zap_attr(ip); if (ip->i_cowfp) { -- cgit v1.2.3 From 29d286d0ce10c43ec9e2fa2902ed4eff29d15cd2 Mon Sep 17 00:00:00 2001 From: Xin Gao Date: Thu, 21 Jul 2022 19:47:13 -0700 Subject: xfs: Fix comment typo The double `the' is duplicated in line 552, remove one. Signed-off-by: Xin Gao Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_dquot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 5a6c3c3c4de2..8fb90da89787 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -549,7 +549,7 @@ xfs_dquot_check_type( * at the same time. The non-user quota file can be switched between * group and project quota uses depending on the mount options, which * means that we can encounter the other type when we try to load quota - * defaults. Quotacheck will soon reset the the entire quota file + * defaults. Quotacheck will soon reset the entire quota file * (including the root dquot) anyway, but don't log scary corruption * reports to dmesg. */ -- cgit v1.2.3 From 4869b6e84a23076eca813d5739d6717976aeae66 Mon Sep 17 00:00:00 2001 From: Slark Xiao Date: Fri, 22 Jul 2022 10:58:17 -0700 Subject: xfs: Fix typo 'the the' in comment Replace 'the the' with 'the' in the comment. Signed-off-by: Slark Xiao Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 41557c430cb6..e2bdf089c0a3 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -84,7 +84,7 @@ xfs_prealloc_blocks( /* * The number of blocks per AG that we withhold from xfs_mod_fdblocks to * guarantee that we can refill the AGFL prior to allocating space in a nearly - * full AG. Although the the space described by the free space btrees, the + * full AG. Although the space described by the free space btrees, the * blocks used by the freesp btrees themselves, and the blocks owned by the * AGFL are counted in the ondisk fdblocks, it's a mistake to let the ondisk * free space in the AG drop so low that the free space btrees cannot refill an -- cgit v1.2.3 From 001c179c4e26d04db8c9f5e3fef9558b58356be6 Mon Sep 17 00:00:00 2001 From: ChenXiaoSong Date: Wed, 27 Jul 2022 17:21:52 -0700 Subject: xfs: fix NULL pointer dereference in xfs_getbmap() Reproducer: 1. fallocate -l 100M image 2. mkfs.xfs -f image 3. mount image /mnt 4. setxattr("/mnt", "trusted.overlay.upper", NULL, 0, XATTR_CREATE) 5. char arg[32] = "\x01\xff\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00" "\x00\x00\x00\x00\x00\x08\x00\x00\x00\xc6\x2a\xf7"; fd = open("/mnt", O_RDONLY|O_DIRECTORY); ioctl(fd, _IOC(_IOC_READ|_IOC_WRITE, 0x58, 0x2c, 0x20), arg); NULL pointer dereference will occur when race happens between xfs_getbmap() and xfs_bmap_set_attrforkoff(): ioctl | setxattr ----------------------------|--------------------------- xfs_getbmap | xfs_ifork_ptr | xfs_inode_has_attr_fork | ip->i_forkoff == 0 | return NULL | ifp == NULL | | xfs_bmap_set_attrforkoff | ip->i_forkoff > 0 xfs_inode_has_attr_fork | ip->i_forkoff > 0 | ifp == NULL | ifp->if_format | Fix this by locking i_lock before xfs_ifork_ptr(). Fixes: abbf9e8a4507 ("xfs: rewrite getbmap using the xfs_iext_* helpers") Signed-off-by: ChenXiaoSong Signed-off-by: Guo Xuenan Reviewed-by: Darrick J. Wong [djwong: added fixes tag] Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_bmap_util.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 74f96e1aa5cd..04d0c2bff67c 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -439,29 +439,28 @@ xfs_getbmap( whichfork = XFS_COW_FORK; else whichfork = XFS_DATA_FORK; - ifp = xfs_ifork_ptr(ip, whichfork); xfs_ilock(ip, XFS_IOLOCK_SHARED); switch (whichfork) { case XFS_ATTR_FORK: + lock = xfs_ilock_attr_map_shared(ip); if (!xfs_inode_has_attr_fork(ip)) - goto out_unlock_iolock; + goto out_unlock_ilock; max_len = 1LL << 32; - lock = xfs_ilock_attr_map_shared(ip); break; case XFS_COW_FORK: + lock = XFS_ILOCK_SHARED; + xfs_ilock(ip, lock); + /* No CoW fork? Just return */ - if (!ifp) - goto out_unlock_iolock; + if (!xfs_ifork_ptr(ip, whichfork)) + goto out_unlock_ilock; if (xfs_get_cowextsz_hint(ip)) max_len = mp->m_super->s_maxbytes; else max_len = XFS_ISIZE(ip); - - lock = XFS_ILOCK_SHARED; - xfs_ilock(ip, lock); break; case XFS_DATA_FORK: if (!(iflags & BMV_IF_DELALLOC) && @@ -491,6 +490,8 @@ xfs_getbmap( break; } + ifp = xfs_ifork_ptr(ip, whichfork); + switch (ifp->if_format) { case XFS_DINODE_FMT_EXTENTS: case XFS_DINODE_FMT_BTREE: -- cgit v1.2.3 From 5e9466a5d0604e20082d828008047b3165592caf Mon Sep 17 00:00:00 2001 From: Xie Shaowen Date: Sun, 31 Jul 2022 09:20:25 -0700 Subject: xfs: delete extra space and tab in blank line delete extra space and tab in blank line, there is no functional change. Reported-by: Hacash Robot Signed-off-by: Xie Shaowen Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_extfree_item.c | 12 ++++++------ fs/xfs/xfs_log.c | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 0d0a0b37d8c5..27ccfcd82f04 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -188,12 +188,12 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt) { xfs_efi_log_format_t *src_efi_fmt = buf->i_addr; uint i; - uint len = sizeof(xfs_efi_log_format_t) + - (src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_t); - uint len32 = sizeof(xfs_efi_log_format_32_t) + - (src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_32_t); - uint len64 = sizeof(xfs_efi_log_format_64_t) + - (src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_64_t); + uint len = sizeof(xfs_efi_log_format_t) + + (src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_t); + uint len32 = sizeof(xfs_efi_log_format_32_t) + + (src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_32_t); + uint len64 = sizeof(xfs_efi_log_format_64_t) + + (src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_64_t); if (buf->i_len == len) { memcpy((char *)dst_efi_fmt, (char*)src_efi_fmt, len); diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 498cbb49392b..4b1c0a9c6368 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -2004,7 +2004,7 @@ xlog_calc_iclog_size( } /* - * Flush out the in-core log (iclog) to the on-disk log in an asynchronous + * Flush out the in-core log (iclog) to the on-disk log in an asynchronous * fashion. Previously, we should have moved the current iclog * ptr in the log to point to the next available iclog. This allows further * write to continue while this code syncs out an iclog ready to go. -- cgit v1.2.3