From 93defd5a15dd74791532692cc59be3a1aaa045fe Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 16 Mar 2022 11:38:27 -0700 Subject: xfs: document the XFS_ALLOC_AGFL_RESERVE constant Currently, we use this undocumented macro to encode the minimum number of blocks needed to replenish a completely empty AGFL when an AG is nearly full. This has led to confusion on the part of the maintainers, so let's document what the value actually means, and move it to xfs_alloc.c since it's not used outside of that module. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/libxfs/xfs_alloc.c | 28 +++++++++++++++++++++++----- fs/xfs/libxfs/xfs_alloc.h | 1 - 2 files changed, 23 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 353e53b892e6..b52ed339727f 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -82,6 +82,24 @@ xfs_prealloc_blocks( } /* + * The number of blocks per AG that we withhold from xfs_mod_fdblocks to + * guarantee that we can refill the AGFL prior to allocating space in a nearly + * full AG. Although the space described by the free space btrees, the + * blocks used by the freesp btrees themselves, and the blocks owned by the + * AGFL are counted in the ondisk fdblocks, it's a mistake to let the ondisk + * free space in the AG drop so low that the free space btrees cannot refill an + * empty AGFL up to the minimum level. Rather than grind through empty AGs + * until the fs goes down, we subtract this many AG blocks from the incore + * fdblocks to ensure user allocation does not overcommit the space the + * filesystem needs for the AGFLs. The rmap btree uses a per-AG reservation to + * withhold space from xfs_mod_fdblocks, so we do not account for that here. + */ +#define XFS_ALLOCBT_AGFL_RESERVE 4 + +/* + * Compute the number of blocks that we set aside to guarantee the ability to + * refill the AGFL and handle a full bmap btree split. + * * In order to avoid ENOSPC-related deadlock caused by out-of-order locking of * AGF buffer (PV 947395), we place constraints on the relationship among * actual allocations for data blocks, freelist blocks, and potential file data @@ -93,14 +111,14 @@ xfs_prealloc_blocks( * extents need to be actually allocated. To get around this, we explicitly set * aside a few blocks which will not be reserved in delayed allocation. * - * We need to reserve 4 fsbs _per AG_ for the freelist and 4 more to handle a - * potential split of the file's bmap btree. + * For each AG, we need to reserve enough blocks to replenish a totally empty + * AGFL and 4 more to handle a potential split of the file's bmap btree.
*/ unsigned int xfs_alloc_set_aside( struct xfs_mount *mp) { - return mp->m_sb.sb_agcount * (XFS_ALLOC_AGFL_RESERVE + 4); + return mp->m_sb.sb_agcount * (XFS_ALLOCBT_AGFL_RESERVE + 4); } /* @@ -124,12 +142,12 @@ xfs_alloc_ag_max_usable( unsigned int blocks; blocks = XFS_BB_TO_FSB(mp, XFS_FSS_TO_BB(mp, 4)); /* ag headers */ - blocks += XFS_ALLOC_AGFL_RESERVE; + blocks += XFS_ALLOCBT_AGFL_RESERVE; blocks += 3; /* AGF, AGI btree root blocks */ if (xfs_has_finobt(mp)) blocks++; /* finobt root block */ if (xfs_has_rmapbt(mp)) - blocks++; /* rmap root block */ + blocks++; /* rmap root block */ if (xfs_has_reflink(mp)) blocks++; /* refcount root block */ diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index 1c14a0b1abea..d4c057b764f9 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -88,7 +88,6 @@ typedef struct xfs_alloc_arg { #define XFS_ALLOC_NOBUSY (1 << 2)/* Busy extents not allowed */ /* freespace limit calculations */ -#define XFS_ALLOC_AGFL_RESERVE 4 unsigned int xfs_alloc_set_aside(struct xfs_mount *mp); unsigned int xfs_alloc_ag_max_usable(struct xfs_mount *mp); -- cgit v1.2.3 From c8c568259772751a14e969b7230990508de73d9d Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 16 Mar 2022 11:54:18 -0700 Subject: xfs: don't include bnobt blocks when reserving free block pool xfs_reserve_blocks controls the size of the user-visible free space reserve pool. Given the difference between the current and requested pool sizes, it will try to reserve free space from fdblocks. However, the amount requested from fdblocks is also constrained by the amount of space that we think xfs_mod_fdblocks will give us. If we forget to subtract m_allocbt_blks before calling xfs_mod_fdblocks, it will return ENOSPC and we'll hang the kernel at mount due to the infinite loop. In commit fd43cf600cf6, we decided that xfs_mod_fdblocks should not hand out the "free space" used by the free space btrees, because some portion of the free space btrees holds space in reserve for future btree expansion. Unfortunately, xfs_reserve_blocks' estimation of the number of blocks that it could request from xfs_mod_fdblocks was not updated to include m_allocbt_blks, so if space is extremely low, the caller hangs. Fix this by creating a function to estimate the number of blocks that can be reserved from fdblocks, which needs to exclude the set-aside and m_allocbt_blks. Found by running xfs/306 (which formats a single-AG 20MB filesystem) with an fstests configuration that specifies a 1k blocksize and a specially crafted log size that will consume 7/8 of the space (17920 blocks, specifically) in that AG. Cc: Brian Foster Fixes: fd43cf600cf6 ("xfs: set aside allocation btree blocks from block reservation") Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster Reviewed-by: Dave Chinner --- fs/xfs/xfs_fsops.c | 2 +- fs/xfs/xfs_mount.c | 2 +- fs/xfs/xfs_mount.h | 15 +++++++++++++++ 3 files changed, 17 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 33e26690a8c4..710e857bb825 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -434,7 +434,7 @@ xfs_reserve_blocks( error = -ENOSPC; do { free = percpu_counter_sum(&mp->m_fdblocks) - - mp->m_alloc_set_aside; + xfs_fdblocks_unavailable(mp); if (free <= 0) break; diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index bed73e8002a5..29ffa8c42795 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1146,7 +1146,7 @@ xfs_mod_fdblocks( * problems (i.e.
transaction abort, pagecache discards, etc.) than * slightly premature -ENOSPC. */ - set_aside = mp->m_alloc_set_aside + atomic64_read(&mp->m_allocbt_blks); + set_aside = xfs_fdblocks_unavailable(mp); percpu_counter_add_batch(&mp->m_fdblocks, delta, batch); if (__percpu_counter_compare(&mp->m_fdblocks, set_aside, XFS_FDBLOCKS_BATCH) >= 0) { diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 00720a02e761..f6dc19de8322 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -479,6 +479,21 @@ extern void xfs_unmountfs(xfs_mount_t *); */ #define XFS_FDBLOCKS_BATCH 1024 +/* + * Estimate the amount of free space that is not available to userspace and is + * not explicitly reserved from the incore fdblocks. This includes: + * + * - The minimum number of blocks needed to support splitting a bmap btree + * - The blocks currently in use by the freespace btrees because they record + * the actual blocks that will fill per-AG metadata space reservations + */ +static inline uint64_t +xfs_fdblocks_unavailable( + struct xfs_mount *mp) +{ + return mp->m_alloc_set_aside + atomic64_read(&mp->m_allocbt_blks); +} + extern int xfs_mod_fdblocks(struct xfs_mount *mp, int64_t delta, bool reserved); extern int xfs_mod_frextents(struct xfs_mount *mp, int64_t delta); -- cgit v1.2.3 From 15f04fdc75aaaa1cccb0b8b3af1be290e118a7bc Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 11 Mar 2022 10:56:01 -0800 Subject: xfs: remove infinite loop when reserving free block pool Infinite loops in kernel code are scary. Calls to xfs_reserve_blocks should be rare (people should just use the defaults!) so we really don't need to try so hard. Simplify the logic here by removing the infinite loop. Cc: Brian Foster Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/xfs_fsops.c | 50 ++++++++++++++++++++------------------------------ 1 file changed, 20 insertions(+), 30 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 710e857bb825..3c6d9d6836ef 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -430,46 +430,36 @@ xfs_reserve_blocks( * If the request is larger than the current reservation, reserve the * blocks before we update the reserve counters. Sample m_fdblocks and * perform a partial reservation if the request exceeds free space. + * + * The code below estimates how many blocks it can request from + * fdblocks to stash in the reserve pool. This is a classic TOCTOU + * race since fdblocks updates are not always coordinated via + * m_sb_lock. */ - error = -ENOSPC; - do { - free = percpu_counter_sum(&mp->m_fdblocks) - + free = percpu_counter_sum(&mp->m_fdblocks) - xfs_fdblocks_unavailable(mp); - if (free <= 0) - break; - - delta = request - mp->m_resblks; - lcounter = free - delta; - if (lcounter < 0) - /* We can't satisfy the request, just get what we can */ - fdblks_delta = free; - else - fdblks_delta = delta; - + delta = request - mp->m_resblks; + if (delta > 0 && free > 0) { /* * We'll either succeed in getting space from the free block - * count or we'll get an ENOSPC. If we get a ENOSPC, it means - * things changed while we were calculating fdblks_delta and so - * we should try again to see if there is anything left to - * reserve. - * - * Don't set the reserved flag here - we don't want to reserve - * the extra reserve blocks from the reserve..... + * count or we'll get an ENOSPC. Don't set the reserved flag + * here - we don't want to reserve the extra reserve blocks + * from the reserve. 
*/ + fdblks_delta = min(free, delta); spin_unlock(&mp->m_sb_lock); error = xfs_mod_fdblocks(mp, -fdblks_delta, 0); spin_lock(&mp->m_sb_lock); - } while (error == -ENOSPC); - /* - * Update the reserve counters if blocks have been successfully - * allocated. - */ - if (!error && fdblks_delta) { - mp->m_resblks += fdblks_delta; - mp->m_resblks_avail += fdblks_delta; + /* + * Update the reserve counters if blocks have been successfully + * allocated. + */ + if (!error) { + mp->m_resblks += fdblks_delta; + mp->m_resblks_avail += fdblks_delta; + } } - out: if (outval) { outval->resblks = mp->m_resblks; -- cgit v1.2.3 From 0baa2657dc4d79202148be79a3dc36c35f425060 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 24 Mar 2022 12:43:32 -0700 Subject: xfs: always succeed at setting the reserve pool size Nowadays, xfs_mod_fdblocks will always choose to fill the reserve pool with freed blocks before adding to fdblocks. Therefore, we can change the behavior of xfs_reserve_blocks slightly -- setting the target size of the pool should always succeed, since a deficiency will eventually be made up as blocks get freed. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/xfs_fsops.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 3c6d9d6836ef..5c2bea1e12a8 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -434,11 +434,14 @@ xfs_reserve_blocks( * The code below estimates how many blocks it can request from * fdblocks to stash in the reserve pool. This is a classic TOCTOU * race since fdblocks updates are not always coordinated via - * m_sb_lock. + * m_sb_lock. Set the reserve size even if there's not enough free + * space to fill it because mod_fdblocks will refill an undersized + * reserve when it can. */ free = percpu_counter_sum(&mp->m_fdblocks) - xfs_fdblocks_unavailable(mp); delta = request - mp->m_resblks; + mp->m_resblks = request; if (delta > 0 && free > 0) { /* * We'll either succeed in getting space from the free block @@ -455,10 +458,8 @@ xfs_reserve_blocks( * Update the reserve counters if blocks have been successfully * allocated. */ - if (!error) { - mp->m_resblks += fdblks_delta; + if (!error) mp->m_resblks_avail += fdblks_delta; - } } out: if (outval) { -- cgit v1.2.3 From 82be38bcf8a2e056b4c99ce79a3827fa743df6ec Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 24 Mar 2022 10:57:07 -0700 Subject: xfs: fix overfilling of reserve pool Due to cycling of m_sb_lock, it's possible for multiple callers of xfs_reserve_blocks to race at changing the pool size, subtracting blocks from fdblocks, and actually putting it in the pool. The result of all this is that we can overfill the reserve pool to hilarious levels. xfs_mod_fdblocks, when called with a positive value, already knows how to take freed blocks and either fill the reserve until it's full, or put them in fdblocks. Use that instead of setting m_resblks_avail directly. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/xfs_fsops.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 5c2bea1e12a8..5b5b68affe66 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -448,18 +448,17 @@ xfs_reserve_blocks( * count or we'll get an ENOSPC. Don't set the reserved flag * here - we don't want to reserve the extra reserve blocks * from the reserve. 
+ * + * The desired reserve size can change after we drop the lock. + * Use mod_fdblocks to put the space into the reserve or into + * fdblocks as appropriate. */ fdblks_delta = min(free, delta); spin_unlock(&mp->m_sb_lock); error = xfs_mod_fdblocks(mp, -fdblks_delta, 0); - spin_lock(&mp->m_sb_lock); - - /* - * Update the reserve counters if blocks have been successfully - * allocated. - */ if (!error) - mp->m_resblks_avail += fdblks_delta; + xfs_mod_fdblocks(mp, fdblks_delta, 0); + spin_lock(&mp->m_sb_lock); } out: if (outval) { -- cgit v1.2.3 From 85bcfa26f9a3782be37d4feafd49668b98b8bdbe Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 16 Mar 2022 13:38:43 -0700 Subject: xfs: don't report reserved bnobt space as available On a modern filesystem, we don't allow userspace to allocate blocks for data storage from the per-AG space reservations, the user-controlled reservation pool that prevents ENOSPC in the middle of internal operations, or the internal per-AG set-aside that prevents unwanted filesystem shutdowns due to ENOSPC during a bmap btree split. Since we now consider freespace btree blocks as unavailable for allocation for data storage, we shouldn't report those blocks via statfs either. This makes the numbers that we return via the statfs f_bavail and f_bfree fields a more conservative estimate of actual free space. Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster Reviewed-by: Dave Chinner --- fs/xfs/xfs_fsops.c | 2 +- fs/xfs/xfs_super.c | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 5b5b68affe66..196e2c51309c 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -347,7 +347,7 @@ xfs_fs_counts( cnt->allocino = percpu_counter_read_positive(&mp->m_icount); cnt->freeino = percpu_counter_read_positive(&mp->m_ifree); cnt->freedata = percpu_counter_read_positive(&mp->m_fdblocks) - - mp->m_alloc_set_aside; + xfs_fdblocks_unavailable(mp); spin_lock(&mp->m_sb_lock); cnt->freertx = mp->m_sb.sb_frextents; diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index d84714e4e46a..54be9d64093e 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -815,7 +815,8 @@ xfs_fs_statfs( spin_unlock(&mp->m_sb_lock); /* make sure statp->f_bfree does not underflow */ - statp->f_bfree = max_t(int64_t, fdblocks - mp->m_alloc_set_aside, 0); + statp->f_bfree = max_t(int64_t, 0, + fdblocks - xfs_fdblocks_unavailable(mp)); statp->f_bavail = statp->f_bfree; fakeinos = XFS_FSB_TO_INO(mp, statp->f_bfree); -- cgit v1.2.3 From d2d7c0473586d2f22e85d615275f34cf19f94447 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 29 Mar 2022 18:21:59 -0700 Subject: xfs: aborting inodes on shutdown may need buffer lock Most buffer io list operations are run with the bp->b_lock held, but xfs_iflush_abort() can be called without the buffer lock being held resulting in inodes being removed from the buffer list while other list operations are occurring. This causes problems with corrupted bp->b_io_list inode lists during filesystem shutdown, leading to traversals that never end, double removals from the AIL, etc. Fix this by passing the buffer to xfs_iflush_abort() if we have it locked. If the inode is attached to the buffer, we're going to have to remove it from the buffer list and we'd have to get the buffer off the inode log item to do that anyway. If we don't have a buffer passed in (e.g. 
from xfs_reclaim_inode()) then we can determine if the inode has a log item and if it is attached to a buffer before we do anything else. If it does have an attached buffer, we can lock it safely (because the inode has a reference to it) and then perform the inode abort. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_icache.c | 2 +- fs/xfs/xfs_inode.c | 2 +- fs/xfs/xfs_inode_item.c | 162 +++++++++++++++++++++++++++++++++++++++--------- fs/xfs/xfs_inode_item.h | 1 + 4 files changed, 136 insertions(+), 31 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 4148cdf7ce4a..6c7267451b82 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -883,7 +883,7 @@ xfs_reclaim_inode( */ if (xlog_is_shutdown(ip->i_mount->m_log)) { xfs_iunpin_wait(ip); - xfs_iflush_abort(ip); + xfs_iflush_shutdown_abort(ip); goto reclaim; } if (xfs_ipincount(ip)) diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 26227d26f274..9de6205fe134 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -3631,7 +3631,7 @@ xfs_iflush_cluster( /* * We must use the safe variant here as on shutdown xfs_iflush_abort() - * can remove itself from the list. + * will remove itself from the list. */ list_for_each_entry_safe(lip, n, &bp->b_li_list, li_bio_list) { iip = (struct xfs_inode_log_item *)lip; diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 11158fa81a09..9e6ef55cf29e 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -544,10 +544,17 @@ xfs_inode_item_push( uint rval = XFS_ITEM_SUCCESS; int error; - ASSERT(iip->ili_item.li_buf); + if (!bp || (ip->i_flags & XFS_ISTALE)) { + /* + * Inode item/buffer is being aborted due to cluster + * buffer deletion. Trigger a log force to have that operation + * completed and items removed from the AIL before the next push + * attempt. + */ + return XFS_ITEM_PINNED; + } - if (xfs_ipincount(ip) > 0 || xfs_buf_ispinned(bp) || - (ip->i_flags & XFS_ISTALE)) + if (xfs_ipincount(ip) > 0 || xfs_buf_ispinned(bp)) return XFS_ITEM_PINNED; if (xfs_iflags_test(ip, XFS_IFLUSHING)) @@ -834,46 +841,143 @@ xfs_buf_inode_io_fail( } /* - * This is the inode flushing abort routine. It is called when - * the filesystem is shutting down to clean up the inode state. It is - * responsible for removing the inode item from the AIL if it has not been - * re-logged and clearing the inode's flush state. + * Clear the inode logging fields so no more flushes are attempted. If we are + * on a buffer list, it is now safe to remove it because the buffer is + * guaranteed to be locked. The caller will drop the reference to the buffer + * the log item held. + */ +static void +xfs_iflush_abort_clean( + struct xfs_inode_log_item *iip) +{ + iip->ili_last_fields = 0; + iip->ili_fields = 0; + iip->ili_fsync_fields = 0; + iip->ili_flush_lsn = 0; + iip->ili_item.li_buf = NULL; + list_del_init(&iip->ili_item.li_bio_list); +} + +/* + * Abort flushing the inode from a context holding the cluster buffer locked. + * + * This is the normal runtime method of aborting writeback of an inode that is + * attached to a cluster buffer. It occurs when the inode and the backing + * cluster buffer have been freed (i.e. inode is XFS_ISTALE), or when cluster + * flushing or buffer IO completion encounters a log shutdown situation.
+ * + * If we need to abort inode writeback and we don't already hold the buffer + * locked, call xfs_iflush_shutdown_abort() instead as this should only ever be + * necessary in a shutdown situation. */ void xfs_iflush_abort( struct xfs_inode *ip) { struct xfs_inode_log_item *iip = ip->i_itemp; - struct xfs_buf *bp = NULL; + struct xfs_buf *bp; - if (iip) { - /* - * Clear the failed bit before removing the item from the AIL so - * xfs_trans_ail_delete() doesn't try to clear and release the - * buffer attached to the log item before we are done with it. - */ - clear_bit(XFS_LI_FAILED, &iip->ili_item.li_flags); - xfs_trans_ail_delete(&iip->ili_item, 0); + if (!iip) { + /* clean inode, nothing to do */ + xfs_iflags_clear(ip, XFS_IFLUSHING); + return; + } + + /* + * Remove the inode item from the AIL before we clear its internal + * state. Whilst the inode is in the AIL, it should have a valid buffer + * pointer for push operations to access - it is only safe to remove the + * inode from the buffer once it has been removed from the AIL. + * + * We also clear the failed bit before removing the item from the AIL + * as xfs_trans_ail_delete()->xfs_clear_li_failed() will release buffer + * references the inode item owns and needs to hold until we've fully + * aborted the inode log item and detached it from the buffer. + */ + clear_bit(XFS_LI_FAILED, &iip->ili_item.li_flags); + xfs_trans_ail_delete(&iip->ili_item, 0); + + /* + * Grab the inode buffer so we can release the reference the inode log + * item holds on it. + */ + spin_lock(&iip->ili_lock); + bp = iip->ili_item.li_buf; + xfs_iflush_abort_clean(iip); + spin_unlock(&iip->ili_lock); + xfs_iflags_clear(ip, XFS_IFLUSHING); + if (bp) + xfs_buf_rele(bp); +} + +/* + * Abort an inode flush in the case of a shutdown filesystem. This can be called + * from anywhere with just an inode reference and does not require holding the + * inode cluster buffer locked. If the inode is attached to a cluster buffer, + * it will grab and lock it safely, then abort the inode flush. + */ +void +xfs_iflush_shutdown_abort( + struct xfs_inode *ip) +{ + struct xfs_inode_log_item *iip = ip->i_itemp; + struct xfs_buf *bp; + + if (!iip) { + /* clean inode, nothing to do */ + xfs_iflags_clear(ip, XFS_IFLUSHING); + return; + } + + spin_lock(&iip->ili_lock); + bp = iip->ili_item.li_buf; + if (!bp) { + spin_unlock(&iip->ili_lock); + xfs_iflush_abort(ip); + return; + } + + /* + * We have to take a reference to the buffer so that it doesn't get + * freed when we drop the ili_lock and then wait to lock the buffer. + * We'll clean up the extra reference after we pick up the ili_lock + * again. + */ + xfs_buf_hold(bp); + spin_unlock(&iip->ili_lock); + xfs_buf_lock(bp); + + spin_lock(&iip->ili_lock); + if (!iip->ili_item.li_buf) { /* - * Clear the inode logging fields so no more flushes are - * attempted. + * Raced with another removal, hold the only reference + * to bp now.
Inode should not be in the AIL now, so just clean * up and return. */ + ASSERT(list_empty(&iip->ili_item.li_bio_list)); + ASSERT(!test_bit(XFS_LI_IN_AIL, &iip->ili_item.li_flags)); + xfs_iflush_abort_clean(iip); spin_unlock(&iip->ili_lock); + xfs_iflags_clear(ip, XFS_IFLUSHING); + xfs_buf_relse(bp); + return; } - xfs_iflags_clear(ip, XFS_IFLUSHING); - if (bp) - xfs_buf_rele(bp); + + /* + * Got two references to bp. The first will get dropped by + * xfs_iflush_abort() when the item is removed from the buffer list, but + * we can't drop our reference until _abort() returns because we have to + * unlock the buffer as well. Hence we abort and then unlock and release + * our reference to the buffer. + */ + ASSERT(iip->ili_item.li_buf == bp); + spin_unlock(&iip->ili_lock); + xfs_iflush_abort(ip); + xfs_buf_relse(bp); } + /* * convert an xfs_inode_log_format struct from the old 32 bit version * (which can have different field alignments) to the native 64 bit version diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h index 1a302000d604..bbd836a44ff0 100644 --- a/fs/xfs/xfs_inode_item.h +++ b/fs/xfs/xfs_inode_item.h @@ -44,6 +44,7 @@ static inline int xfs_inode_clean(struct xfs_inode *ip) extern void xfs_inode_item_init(struct xfs_inode *, struct xfs_mount *); extern void xfs_inode_item_destroy(struct xfs_inode *); extern void xfs_iflush_abort(struct xfs_inode *); +extern void xfs_iflush_shutdown_abort(struct xfs_inode *); extern int xfs_inode_item_format_convert(xfs_log_iovec_t *, struct xfs_inode_log_format *); -- cgit v1.2.3 From ab9c81ef321f90dd208b1d4809c196c2794e4b15 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 29 Mar 2022 18:22:00 -0700 Subject: xfs: shutdown in intent recovery has non-intent items in the AIL generic/388 triggered a failure in RUI recovery due to a corrupted btree record and the system then locked up hard due to a subsequent assert failure while holding a spinlock cancelling intents: XFS (pmem1): Corruption of in-memory data (0x8) detected at xfs_do_force_shutdown+0x1a/0x20 (fs/xfs/xfs_trans.c:964). Shutting down filesystem. XFS (pmem1): Please unmount the filesystem and rectify the problem(s) XFS: Assertion failed: !xlog_item_is_intent(lip), file: fs/xfs/xfs_log_recover.c, line: 2632 Call Trace: xlog_recover_cancel_intents.isra.0+0xd1/0x120 xlog_recover_finish+0xb9/0x110 xfs_log_mount_finish+0x15a/0x1e0 xfs_mountfs+0x540/0x910 xfs_fs_fill_super+0x476/0x830 get_tree_bdev+0x171/0x270 ? xfs_init_fs_context+0x1e0/0x1e0 xfs_fs_get_tree+0x15/0x20 vfs_get_tree+0x24/0xc0 path_mount+0x304/0xba0 ? putname+0x55/0x60 __x64_sys_mount+0x108/0x140 do_syscall_64+0x35/0x80 entry_SYSCALL_64_after_hwframe+0x44/0xae Essentially, there's dirty metadata in the AIL from intent recovery transactions, so when we go to cancel the remaining intents we assume that all objects after the first non-intent log item in the AIL are not intents. This is not true. Intent recovery can log new intents to continue the operations the original intent could not complete in a single transaction. The new intents are committed before they are deferred, which means if the CIL commits in the background they will get inserted into the AIL at the head.
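To make that ordering concrete, here is a minimal userspace sketch (the toy_item type and the item names are invented for illustration; this is not kernel code) of an AIL where an ordinary item already sits between two intents, so a walker may stop at the first non-intent item but can assert nothing about what follows it:

#include <stdbool.h>
#include <stdio.h>

struct toy_item {
	bool		is_intent;
	const char	*name;
};

int main(void)
{
	/* Intents logged by recovery land at the head, interleaved with
	 * ordinary metadata dirtied by recovery transactions. */
	struct toy_item ail[] = {
		{ true,  "new RUI logged by recovery" },
		{ false, "inode dirtied by recovery" },
		{ true,  "EFI from the crashed mount" },
	};
	unsigned int i;

	for (i = 0; i < sizeof(ail) / sizeof(ail[0]); i++) {
		if (!ail[i].is_intent)
			break;	/* stop here; later items may still be intents */
		printf("cancel %s\n", ail[i].name);
	}
	return 0;
}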
Hence if we shut down the filesystem while processing intent recovery, the AIL may have new intents active at the current head. Hence this check: /* * We're done when we see something other than an intent. * There should be no intents left in the AIL now. */ if (!xlog_item_is_intent(lip)) { #ifdef DEBUG for (; lip; lip = xfs_trans_ail_cursor_next(ailp, &cur)) ASSERT(!xlog_item_is_intent(lip)); #endif break; } in both xlog_recover_process_intents() and xlog_recover_cancel_intents() is simply not valid. It was valid back when we only had EFI/EFD intents and didn't chain intents, but it hasn't been valid ever since intent recovery could create and commit new intents. Given that crashing the mount task like this pretty much prevents diagnosing what went wrong that led to the initial failure that triggered intent cancellation, just remove the checks altogether. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_log_recover.c | 50 ++++++++++++++++-------------------------------- 1 file changed, 17 insertions(+), 33 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 96c997ed2ec8..7758a6706b8c 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2519,21 +2519,22 @@ xlog_abort_defer_ops( xfs_defer_ops_capture_free(mp, dfc); } } + /* * When this is called, all of the log intent items which did not have - * corresponding log done items should be in the AIL. What we do now - * is update the data structures associated with each one. + * corresponding log done items should be in the AIL. What we do now is update + * the data structures associated with each one. * - * Since we process the log intent items in normal transactions, they - * will be removed at some point after the commit. This prevents us - * from just walking down the list processing each one. We'll use a - * flag in the intent item to skip those that we've already processed - * and use the AIL iteration mechanism's generation count to try to - * speed this up at least a bit. + * Since we process the log intent items in normal transactions, they will be + * removed at some point after the commit. This prevents us from just walking + * down the list processing each one. We'll use a flag in the intent item to + * skip those that we've already processed and use the AIL iteration mechanism's + * generation count to try to speed this up at least a bit. * - * When we start, we know that the intents are the only things in the - * AIL. As we process them, however, other items are added to the - * AIL. + * When we start, we know that the intents are the only things in the AIL. As we + * process them, however, other items are added to the AIL. Hence we know we + * have started recovery on all the pending intents when we find a non-intent + * item in the AIL. */ STATIC int xlog_recover_process_intents( @@ -2556,17 +2557,8 @@ xlog_recover_process_intents( for (lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); lip != NULL; lip = xfs_trans_ail_cursor_next(ailp, &cur)) { - /* - * We're done when we see something other than an intent. - * There should be no intents left in the AIL now.
- */ - if (!xlog_item_is_intent(lip)) { -#ifdef DEBUG - for (; lip; lip = xfs_trans_ail_cursor_next(ailp, &cur)) - ASSERT(!xlog_item_is_intent(lip)); -#endif + if (!xlog_item_is_intent(lip)) break; - } /* * We should never see a redo item with a LSN higher than @@ -2607,8 +2599,9 @@ err: } /* - * A cancel occurs when the mount has failed and we're bailing out. - * Release all pending log intent items so they don't pin the AIL. + * A cancel occurs when the mount has failed and we're bailing out. Release all + * pending log intent items that we haven't started recovery on so they don't + * pin the AIL. */ STATIC void xlog_recover_cancel_intents( @@ -2622,17 +2615,8 @@ xlog_recover_cancel_intents( spin_lock(&ailp->ail_lock); lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); while (lip != NULL) { - /* - * We're done when we see something other than an intent. - * There should be no intents left in the AIL now. - */ - if (!xlog_item_is_intent(lip)) { -#ifdef DEBUG - for (; lip; lip = xfs_trans_ail_cursor_next(ailp, &cur)) - ASSERT(!xlog_item_is_intent(lip)); -#endif + if (!xlog_item_is_intent(lip)) break; - } spin_unlock(&ailp->ail_lock); lip->li_ops->iop_release(lip); -- cgit v1.2.3 From cd6f79d1fb324968a3bae92f82eeb7d28ca1fd22 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 29 Mar 2022 18:22:00 -0700 Subject: xfs: run callbacks before waking waiters in xlog_state_shutdown_callbacks Brian reported a null pointer dereference failure during unmount in xfs/006. He tracked the problem down to the AIL being torn down before a log shutdown had completed and removed all the items from the AIL. The failure occurred in this path while unmount was proceeding in another task: xfs_trans_ail_delete+0x102/0x130 [xfs] xfs_buf_item_done+0x22/0x30 [xfs] xfs_buf_ioend+0x73/0x4d0 [xfs] xfs_trans_committed_bulk+0x17e/0x2f0 [xfs] xlog_cil_committed+0x2a9/0x300 [xfs] xlog_cil_process_committed+0x69/0x80 [xfs] xlog_state_shutdown_callbacks+0xce/0xf0 [xfs] xlog_force_shutdown+0xdf/0x150 [xfs] xfs_do_force_shutdown+0x5f/0x150 [xfs] xlog_ioend_work+0x71/0x80 [xfs] process_one_work+0x1c5/0x390 worker_thread+0x30/0x350 kthread+0xd7/0x100 ret_from_fork+0x1f/0x30 This is processing an EIO error to a log write, and it's triggering a force shutdown. This causes the log to be shut down, and then it is running attached iclog callbacks from the shutdown context. That means the fs and log have already been marked as xfs_is_shutdown/xlog_is_shutdown and so high level code will abort (e.g. xfs_trans_commit(), xfs_log_force(), etc) with an error because of shutdown. The umount would have been blocked waiting for a log force completion inside xfs_log_cover() -> xfs_sync_sb(). For this situation to occur, xfs_sync_sb() must first exit without waiting for the iclog buffer to be committed to disk. The above trace is the completion routine for the iclog buffer, and it is shutting down the filesystem. xlog_state_shutdown_callbacks() does this: { struct xlog_in_core *iclog; LIST_HEAD(cb_list); spin_lock(&log->l_icloglock); iclog = log->l_iclog; do { if (atomic_read(&iclog->ic_refcnt)) { /* Reference holder will re-run iclog callbacks.
*/ continue; } list_splice_init(&iclog->ic_callbacks, &cb_list); >>>>>> wake_up_all(&iclog->ic_write_wait); >>>>>> wake_up_all(&iclog->ic_force_wait); } while ((iclog = iclog->ic_next) != log->l_iclog); wake_up_all(&log->l_flush_wait); spin_unlock(&log->l_icloglock); >>>>>> xlog_cil_process_committed(&cb_list); } This wakes any thread waiting on IO completion of the iclog (in this case the umount log force) before shutdown processes all the pending callbacks. That means the xfs_sync_sb() waiting on a sync transaction in xfs_log_force() on iclog->ic_force_wait will get woken before the callbacks attached to that iclog are run. This results in xfs_sync_sb() returning an error, and so unmount unblocks and continues to run whilst the log shutdown is still in progress. Normally this is just fine because the force waiter has nothing to do with AIL operations. But in the case of this unmount path, the log force waiter goes on to tear down the AIL because the log is now shut down and so nothing ever blocks it again from the wait point in xfs_log_cover(). Hence it's a race to see who gets to the AIL first - the unmount code or xlog_cil_process_committed() killing the superblock buffer. To fix this, we just have to change the order of processing in xlog_state_shutdown_callbacks() to run the callbacks before it wakes any task waiting on completion of the iclog. Reported-by: Brian Foster Fixes: aad7272a9208 ("xfs: separate out log shutdown callback processing") Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_log.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 5010ce712a3e..e0d47e74c540 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -487,7 +487,10 @@ out_error: * Run all the pending iclog callbacks and wake log force waiters and iclog * space waiters so they can process the newly set shutdown state. We really * don't care what order we process callbacks here because the log is shut down - * and so state cannot change on disk anymore. + * and so state cannot change on disk anymore. However, we cannot wake waiters + * until the callbacks have been processed because we may be in unmount and + * we must ensure that all AIL operations the callbacks perform have completed + * before we tear down the AIL. * * We avoid processing actively referenced iclogs so that we don't run callbacks * while the iclog owner might still be preparing the iclog for IO submission. @@ -501,7 +504,6 @@ xlog_state_shutdown_callbacks( struct xlog_in_core *iclog; LIST_HEAD(cb_list); - spin_lock(&log->l_icloglock); iclog = log->l_iclog; do { if (atomic_read(&iclog->ic_refcnt)) { @@ -509,14 +511,16 @@ list_splice_init(&iclog->ic_callbacks, &cb_list); + spin_unlock(&log->l_icloglock); + + xlog_cil_process_committed(&cb_list); + + spin_lock(&log->l_icloglock); wake_up_all(&iclog->ic_write_wait); wake_up_all(&iclog->ic_force_wait); } while ((iclog = iclog->ic_next) != log->l_iclog); wake_up_all(&log->l_flush_wait); - spin_unlock(&log->l_icloglock); - - xlog_cil_process_committed(&cb_list); } /* @@ -583,11 +587,8 @@ xlog_state_release_iclog( * pending iclog callbacks that were waiting on the release of * this iclog.
*/ - if (last_ref) { - spin_unlock(&log->l_icloglock); + if (last_ref) xlog_state_shutdown_callbacks(log); - spin_lock(&log->l_icloglock); - } return -EIO; } @@ -3903,7 +3904,10 @@ xlog_force_shutdown( wake_up_all(&log->l_cilp->xc_start_wait); wake_up_all(&log->l_cilp->xc_commit_wait); spin_unlock(&log->l_cilp->xc_push_lock); + + spin_lock(&log->l_icloglock); xlog_state_shutdown_callbacks(log); + spin_unlock(&log->l_icloglock); return log_error; } -- cgit v1.2.3 From b5f17bec1213a3ed2f4d79ad4c566e00cabe2a9b Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 29 Mar 2022 18:22:01 -0700 Subject: xfs: log shutdown triggers should only shut down the log We've got a mess on our hands. 1. xfs_trans_commit() cannot cancel transactions because the mount is shut down - that causes dirty, aborted, unlogged log items to sit unpinned in memory and potentially get written to disk before the log is shut down. Hence xfs_trans_commit() can only abort transactions when xlog_is_shutdown() is true. 2. xfs_force_shutdown() is used in places to cause the current modification to be aborted via xfs_trans_commit() because it may be impractical or impossible to cancel the transaction directly, and hence xfs_trans_commit() must cancel transactions when xfs_is_shutdown() is true in this situation. But we can't do that because of #1. 3. Log IO errors cause log shutdowns by calling xfs_force_shutdown() to shut down the mount and then the log from log IO completion. 4. xfs_force_shutdown() can result in a log force being issued, which has to wait for log IO completion before it will mark the log as shut down. If #3 races with some other shutdown trigger that runs a log force, we rely on xfs_force_shutdown() silently ignoring #3 and avoiding shutting down the log until the failed log force completes. 5. To ensure #2 always works, we have to ensure that xfs_force_shutdown() does not return until the log is shut down. But in the case of #4, this will result in a deadlock because the log IO completion will block waiting for a log force to complete which is blocked waiting for log IO to complete.... So the very first thing we have to do here to untangle this mess is dissociate log shutdown triggers from mount shutdowns. We already have xlog_force_shutdown(), which will atomically transition the log to a shutdown state. Due to internal asserts it cannot be called multiple times, but that restriction existed simply because the only place that could call it was xfs_do_force_shutdown() (i.e. the mount shutdown!) and that could only call it once and once only. So the first thing we do is remove the asserts. We then convert all the internal log shutdown triggers to call xlog_force_shutdown() directly instead of xfs_force_shutdown(). This allows the log shutdown triggers to shut down the log without needing to care about mount based shutdown constraints. This means we shut down the log independently of the mount and the mount may not notice this until its next attempt to read or modify metadata. At that point (e.g. xfs_trans_commit()) it will see that the log is shut down, error out and shut down the mount. To ensure that all the unmount behaviours and asserts track correctly as a result of a log shutdown, propagate the shutdown up to the mount if it is not already set. This keeps the mount and log state in sync, and saves a huge amount of hassle where code fails because of a log shutdown but only checks for mount shutdowns and hence ends up doing the wrong thing. Cleaning up that mess is an exercise for another day.
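The shape of that dissociation can be sketched in a few lines of userspace C; the atomic flags below stand in for the kernel's test_and_set_bit() on XLOG_IO_ERROR and XFS_OPSTATE_SHUTDOWN, and all names are illustrative rather than the actual implementation:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Stand-ins for the log and mount shutdown state bits. */
static atomic_flag log_shut = ATOMIC_FLAG_INIT;
static atomic_flag mount_shut = ATOMIC_FLAG_INIT;

/* Returns true only for the caller that performs the transition. */
static bool toy_log_force_shutdown(void)
{
	if (atomic_flag_test_and_set(&log_shut))
		return false;	/* already shut down; idempotent */

	/* Propagate the state up so mount and log stay in sync; only
	 * the first mount transition emits the warning. */
	if (!atomic_flag_test_and_set(&mount_shut))
		printf("Filesystem has been shut down due to log error.\n");
	return true;
}

int main(void)
{
	printf("first:  %d\n", toy_log_force_shutdown());	/* 1 */
	printf("second: %d\n", toy_log_force_shutdown());	/* 0 */
	return 0;
}

Each transition happens exactly once no matter how many triggers race, which is what lets the log shut itself down without caring about mount-level constraints.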
This enables us to address the other problems noted above in followup patches. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_log.c | 32 +++++++++++++++++++++++--------- fs/xfs/xfs_log_cil.c | 4 ++-- fs/xfs/xfs_log_recover.c | 6 +++--- fs/xfs/xfs_mount.c | 1 + fs/xfs/xfs_trans_ail.c | 8 ++++---- 5 files changed, 33 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index e0d47e74c540..4188ed752169 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1374,7 +1374,7 @@ xlog_ioend_work( */ if (XFS_TEST_ERROR(error, log->l_mp, XFS_ERRTAG_IODONE_IOERR)) { xfs_alert(log->l_mp, "log I/O error %d", error); - xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); + xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); } xlog_state_done_syncing(iclog); @@ -1913,7 +1913,7 @@ xlog_write_iclog( iclog->ic_flags &= ~(XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA); if (xlog_map_iclog_data(&iclog->ic_bio, iclog->ic_data, count)) { - xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); + xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); return; } if (is_vmalloc_addr(iclog->ic_data)) @@ -2488,7 +2488,7 @@ xlog_write( xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES, "ctx ticket reservation ran out. Need to up reservation"); xlog_print_tic_res(log->l_mp, ticket); - xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); + xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); } len = xlog_write_calc_vec_length(ticket, log_vector, optype); @@ -3822,9 +3822,10 @@ xlog_verify_iclog( #endif /* - * Perform a forced shutdown on the log. This should be called once and once - * only by the high level filesystem shutdown code to shut the log subsystem - * down cleanly. + * Perform a forced shutdown on the log. + * + * This can be called from low level log code to trigger a shutdown, or from the + * high level mount shutdown code when the mount shuts down. * * Our main objectives here are to make sure that: * a. if the shutdown was not due to a log IO error, flush the logs to @@ -3833,6 +3834,8 @@ xlog_verify_iclog( * parties to find out. Nothing new gets queued after this is done. * c. Tasks sleeping on log reservations, pinned objects and * other resources get woken up. + * d. The mount is also marked as shut down so that log triggered shutdowns + * still behave the same as if they called xfs_forced_shutdown(). * * Return true if the shutdown cause was a log IO error and we actually shut the * log down. @@ -3851,8 +3854,6 @@ xlog_force_shutdown( if (!log || xlog_in_recovery(log)) return false; - ASSERT(!xlog_is_shutdown(log)); - /* * Flush all the completed transactions to disk before marking the log * being shut down. We need to do this first as shutting down the log @@ -3879,11 +3880,24 @@ xlog_force_shutdown( spin_lock(&log->l_icloglock); if (test_and_set_bit(XLOG_IO_ERROR, &log->l_opstate)) { spin_unlock(&log->l_icloglock); - ASSERT(0); return false; } spin_unlock(&log->l_icloglock); + /* + * If this log shutdown also sets the mount shutdown state, issue a + * shutdown warning message. + */ + if (!test_and_set_bit(XFS_OPSTATE_SHUTDOWN, &log->l_mp->m_opstate)) { + xfs_alert_tag(log->l_mp, XFS_PTAG_SHUTDOWN_LOGERROR, +"Filesystem has been shut down due to log error (0x%x).", + shutdown_flags); + xfs_alert(log->l_mp, +"Please unmount the filesystem and rectify the problem(s)."); + if (xfs_error_level >= XFS_ERRLEVEL_HIGH) + xfs_stack_trace(); + } + /* * We don't want anybody waiting for log reservations after this. 
That * means we have to wake up everybody queued up on reserveq as well as diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 796e4464f809..767c386ed4ce 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -540,7 +540,7 @@ xlog_cil_insert_items( spin_unlock(&cil->xc_cil_lock); if (tp->t_ticket->t_curr_res < 0) - xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); + xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); } static void @@ -854,7 +854,7 @@ xlog_cil_write_commit_record( error = xlog_write(log, ctx, &vec, ctx->ticket, XLOG_COMMIT_TRANS); if (error) - xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); + xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); return error; } diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 7758a6706b8c..c4ad4296c540 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2485,7 +2485,7 @@ xlog_finish_defer_ops( error = xfs_trans_alloc(mp, &resv, dfc->dfc_blkres, dfc->dfc_rtxres, XFS_TRANS_RESERVE, &tp); if (error) { - xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); + xlog_force_shutdown(mp->m_log, SHUTDOWN_LOG_IO_ERROR); return error; } @@ -3454,7 +3454,7 @@ xlog_recover_finish( */ xlog_recover_cancel_intents(log); xfs_alert(log->l_mp, "Failed to recover intents"); - xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); + xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); return error; } @@ -3501,7 +3501,7 @@ xlog_recover_finish( * end of intents processing can be pushed through the CIL * and AIL. */ - xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); + xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); } return 0; diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 29ffa8c42795..c5f153c3693f 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -21,6 +21,7 @@ #include "xfs_trans.h" #include "xfs_trans_priv.h" #include "xfs_log.h" +#include "xfs_log_priv.h" #include "xfs_error.h" #include "xfs_quota.h" #include "xfs_fsops.h" diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index c2ccb98c7bcd..d3a97a028560 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -873,17 +873,17 @@ xfs_trans_ail_delete( int shutdown_type) { struct xfs_ail *ailp = lip->li_ailp; - struct xfs_mount *mp = ailp->ail_log->l_mp; + struct xlog *log = ailp->ail_log; xfs_lsn_t tail_lsn; spin_lock(&ailp->ail_lock); if (!test_bit(XFS_LI_IN_AIL, &lip->li_flags)) { spin_unlock(&ailp->ail_lock); - if (shutdown_type && !xlog_is_shutdown(ailp->ail_log)) { - xfs_alert_tag(mp, XFS_PTAG_AILDELETE, + if (shutdown_type && !xlog_is_shutdown(log)) { + xfs_alert_tag(log->l_mp, XFS_PTAG_AILDELETE, "%s: attempting to delete a log item that is not in the AIL", __func__); - xfs_force_shutdown(mp, shutdown_type); + xlog_force_shutdown(log, shutdown_type); } return; } -- cgit v1.2.3 From 41e6362183589afd2cd51d653e277d256daab11f Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 29 Mar 2022 18:22:01 -0700 Subject: xfs: xfs_do_force_shutdown needs to block racing shutdowns When we call xfs_force_shutdown(), the caller often expects the filesystem to be completely shut down when it returns. However, if we have racing xfs_force_shutdown() calls, the first caller sets the mount shutdown flag then goes to shut down the log. The second caller sees the mount shutdown flag and returns immediately - it does not wait for the log to be shut down. Unfortunately, xfs_force_shutdown() is used in some places that expect it to completely shut down the filesystem before it returns (e.g. xfs_trans_log_inode()).
As such, returning before the log has been shut down leaves us in a place where the transaction failed to complete correctly but we still call xfs_trans_commit(). This situation arises because xfs_trans_log_inode() does not return an error and instead calls xfs_force_shutdown() to ensure that the transaction being committed is aborted. Unfortunately, we have a race condition where xfs_trans_commit() needs to check xlog_is_shutdown() because it can't abort log items before the log is shut down, but it needs to use xfs_is_shutdown() because xfs_force_shutdown() does not block waiting for the log to shut down. To fix this conundrum, first we make all calls to xfs_force_shutdown() block until the log is also shut down. This means we can then safely use xfs_force_shutdown() as a mechanism that ensures the currently running transaction will be aborted by xfs_trans_commit() regardless of the shutdown check it uses. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_fsops.c | 6 +++++- fs/xfs/xfs_log.c | 1 + fs/xfs/xfs_log_priv.h | 11 +++++++++++ 3 files changed, 17 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 196e2c51309c..68f74549fa22 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -17,6 +17,7 @@ #include "xfs_fsops.h" #include "xfs_trans_space.h" #include "xfs_log.h" +#include "xfs_log_priv.h" #include "xfs_ag.h" #include "xfs_ag_resv.h" #include "xfs_trace.h" @@ -518,8 +519,11 @@ xfs_do_force_shutdown( int tag; const char *why; - if (test_and_set_bit(XFS_OPSTATE_SHUTDOWN, &mp->m_opstate)) + + if (test_and_set_bit(XFS_OPSTATE_SHUTDOWN, &mp->m_opstate)) { + xlog_shutdown_wait(mp->m_log); return; + } if (mp->m_sb_bp) mp->m_sb_bp->b_flags |= XBF_DONE; diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 4188ed752169..678ca01047e1 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -3923,6 +3923,7 @@ xlog_force_shutdown( xlog_state_shutdown_callbacks(log); spin_unlock(&log->l_icloglock); + wake_up_var(&log->l_opstate); return log_error; } diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 23103d68423c..cd0508e26fec 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -484,6 +484,17 @@ xlog_is_shutdown(struct xlog *log) return test_bit(XLOG_IO_ERROR, &log->l_opstate); } +/* + * Wait until xlog_force_shutdown() has marked the log as shut down + * so xlog_is_shutdown() will always return true. + */ +static inline void +xlog_shutdown_wait( + struct xlog *log) +{ + wait_var_event(&log->l_opstate, xlog_is_shutdown(log)); +} + /* common routines */ extern int xlog_recover( -- cgit v1.2.3 From 3c4cb76bce4380aee99c275b3920049350939e47 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 29 Mar 2022 18:22:01 -0700 Subject: xfs: xfs_trans_commit() path must check for log shutdown If a shutdown races with xfs_trans_commit() and we have shut down the filesystem but not the log, we will still cancel the transaction. This can result in aborting dirty log items instead of committing and pinning them whilst the log is still running. Hence we can end up with dirty, unlogged metadata in memory that isn't in the AIL and can be flushed to disk via writeback clustering. This was discovered from a g/388 trace where an inode log item was having IO completed on it and it wasn't in the AIL, hence tripping the asserts in xfs_ail_check().
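The window that creates can be modelled with a short sketch (illustrative types and strings, not the kernel API): between the mount transition and the log transition, only the log state can tell the commit path whether dirty items may be safely aborted:

#include <stdbool.h>
#include <stdio.h>

struct toy_fs {
	bool mount_shutdown;	/* set first by the mount shutdown path */
	bool log_shutdown;	/* set later, once the log is truly dead */
};

/* While the log still runs, a dirty transaction must commit and pin its
 * items; only a dead log makes aborting them safe. */
static const char *toy_trans_commit(const struct toy_fs *fs, bool dirty)
{
	if (!dirty)
		return "clean: unreserve and free";
	if (fs->log_shutdown)
		return "abort with -EIO";
	return "commit to CIL and pin items";
}

int main(void)
{
	/* The race window: mount flagged shut down, log still alive. */
	struct toy_fs racing = { .mount_shutdown = true, .log_shutdown = false };

	puts(toy_trans_commit(&racing, true));
	return 0;
}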
Inode cluster writeback started long after the filesystem shutdown started, and long after the transaction containing the dirty inode was aborted and the log item marked XFS_LI_ABORTED. The inode was seen as dirty and unpinned, so it was flushed. IO completion tried to remove the inode from the AIL, at which point stuff went bad: XFS (pmem1): Log I/O Error (0x6) detected at xfs_fs_goingdown+0xa3/0xf0 (fs/xfs/xfs_fsops.c:500). Shutting down filesystem. XFS: Assertion failed: in_ail, file: fs/xfs/xfs_trans_ail.c, line: 67 XFS (pmem1): Please unmount the filesystem and rectify the problem(s) Workqueue: xfs-buf/pmem1 xfs_buf_ioend_work RIP: 0010:assfail+0x27/0x2d Call Trace: xfs_ail_check+0xa8/0x180 xfs_ail_delete_one+0x3b/0xf0 xfs_buf_inode_iodone+0x329/0x3f0 xfs_buf_ioend+0x1f8/0x530 xfs_buf_ioend_work+0x15/0x20 process_one_work+0x1ac/0x390 worker_thread+0x56/0x3c0 kthread+0xf6/0x120 ret_from_fork+0x1f/0x30 xfs_trans_commit() needs to check log state for shutdown, not mount state. It cannot abort dirty log items while the log is still running as dirty items must remain pinned in memory until they are either committed to the journal or the log has shut down and they can be safely tossed away. Hence if the log has not shut down, the xfs_trans_commit() path must allow completed transactions to commit to the CIL and pin the dirty items even if a mount shutdown has started. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_trans.c | 48 +++++++++++++++++++++++++++++++++--------------- 1 file changed, 33 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 917a69f0a6ff..0ac717aad380 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -836,6 +836,7 @@ __xfs_trans_commit( bool regrant) { struct xfs_mount *mp = tp->t_mountp; + struct xlog *log = mp->m_log; xfs_csn_t commit_seq = 0; int error = 0; int sync = tp->t_flags & XFS_TRANS_SYNC; @@ -864,7 +865,13 @@ __xfs_trans_commit( if (!(tp->t_flags & XFS_TRANS_DIRTY)) goto out_unreserve; - if (xfs_is_shutdown(mp)) { + /* + * We must check against log shutdown here because we cannot abort log + * items and leave them dirty, inconsistent and unpinned in memory while + * the log is active. This leaves them open to being written back to + * disk, and that will lead to on-disk corruption. + */ + if (xlog_is_shutdown(log)) { error = -EIO; goto out_unreserve; } @@ -878,7 +885,7 @@ __xfs_trans_commit( xfs_trans_apply_sb_deltas(tp); xfs_trans_apply_dquot_deltas(tp); - xlog_cil_commit(mp->m_log, tp, &commit_seq, regrant); + xlog_cil_commit(log, tp, &commit_seq, regrant); xfs_trans_free(tp); @@ -905,10 +912,10 @@ out_unreserve: */ xfs_trans_unreserve_and_mod_dquots(tp); if (tp->t_ticket) { - if (regrant && !xlog_is_shutdown(mp->m_log)) - xfs_log_ticket_regrant(mp->m_log, tp->t_ticket); + if (regrant && !xlog_is_shutdown(log)) + xfs_log_ticket_regrant(log, tp->t_ticket); else - xfs_log_ticket_ungrant(mp->m_log, tp->t_ticket); + xfs_log_ticket_ungrant(log, tp->t_ticket); tp->t_ticket = NULL; } xfs_trans_free_items(tp, !!error); @@ -926,18 +933,27 @@ xfs_trans_commit( } /* - * Unlock all of the transaction's items and free the transaction. - * The transaction must not have modified any of its items, because - * there is no way to restore them to their previous state. + * Unlock all of the transaction's items and free the transaction.
If the + * transaction is dirty, we must shut down the filesystem because there is no + * way to restore them to their previous state. * - * If the transaction has made a log reservation, make sure to release - * it as well. + * If the transaction has made a log reservation, make sure to release it as + * well. + * + * This is a high level function (equivalent to xfs_trans_commit()) and so can + * be called after the transaction has effectively been aborted due to the mount + * being shut down. However, if the mount has not been shut down and the + * transaction is dirty we will shut the mount down and, in doing so, that + * guarantees that the log is shut down, too. Hence we don't need to be as + * careful with shutdown state and dirty items here as we need to be in + * xfs_trans_commit(). */ void xfs_trans_cancel( struct xfs_trans *tp) { struct xfs_mount *mp = tp->t_mountp; + struct xlog *log = mp->m_log; bool dirty = (tp->t_flags & XFS_TRANS_DIRTY); trace_xfs_trans_cancel(tp, _RET_IP_); @@ -955,16 +971,18 @@ xfs_trans_cancel( } /* - * See if the caller is relying on us to shut down the - * filesystem. This happens in paths where we detect - * corruption and decide to give up. + * See if the caller is relying on us to shut down the filesystem. We + * only want an error report if there isn't already a shutdown in + * progress, so we only need to check against the mount shutdown state + * here. */ if (dirty && !xfs_is_shutdown(mp)) { XFS_ERROR_REPORT("xfs_trans_cancel", XFS_ERRLEVEL_LOW, mp); xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); } #ifdef DEBUG - if (!dirty && !xfs_is_shutdown(mp)) { + /* Log items need to be consistent until the log is shut down. */ + if (!dirty && !xlog_is_shutdown(log)) { struct xfs_log_item *lip; list_for_each_entry(lip, &tp->t_items, li_trans) @@ -975,7 +993,7 @@ xfs_trans_cancel( xfs_trans_unreserve_and_mod_dquots(tp); if (tp->t_ticket) { - xfs_log_ticket_ungrant(mp->m_log, tp->t_ticket); + xfs_log_ticket_ungrant(log, tp->t_ticket); tp->t_ticket = NULL; } -- cgit v1.2.3 From 5652ef31705f240e1528fe5a45d99229752e1ec8 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 29 Mar 2022 18:22:02 -0700 Subject: xfs: shutdown during log recovery needs to mark the log shutdown When a checkpoint writeback is run by log recovery, corruption propagated from the log can result in writeback verifiers failing and calling xfs_force_shutdown() from xfs_buf_delwri_submit_buffers(). This results in the mount being marked as shutdown, but the log does not get marked as shut down because: /* * If this happens during log recovery then we aren't using the runtime * log mechanisms yet so there's nothing to shut down. */ if (!log || xlog_in_recovery(log)) return false; If there are other buffers that then fail (say due to detecting the mount shutdown), they will now hang in xfs_do_force_shutdown() waiting for the log to shut down like this: __schedule+0x30d/0x9e0 schedule+0x55/0xd0 xfs_do_force_shutdown+0x1cd/0x200 ? init_wait_var_entry+0x50/0x50 xfs_buf_ioend+0x47e/0x530 __xfs_buf_submit+0xb0/0x240 xfs_buf_delwri_submit_buffers+0xfe/0x270 xfs_buf_delwri_submit+0x3a/0xc0 xlog_do_recovery_pass+0x474/0x7b0 ? 
do_raw_spin_unlock+0x30/0xb0 xlog_do_log_recovery+0x91/0x140 xlog_do_recover+0x38/0x1e0 xlog_recover+0xdd/0x170 xfs_log_mount+0x17e/0x2e0 xfs_mountfs+0x457/0x930 xfs_fs_fill_super+0x476/0x830 xlog_force_shutdown() always needs to mark the log as shut down, regardless of whether recovery is in progress or not, so that multiple calls to xfs_force_shutdown() during recovery don't end up waiting for the log to be shut down like this. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_log.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 678ca01047e1..fc893d370b91 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -3847,11 +3847,7 @@ xlog_force_shutdown( { bool log_error = (shutdown_flags & SHUTDOWN_LOG_IO_ERROR); - /* - * If this happens during log recovery then we aren't using the runtime - * log mechanisms yet so there's nothing to shut down. - */ - if (!log || xlog_in_recovery(log)) + if (!log) return false; /* @@ -3860,10 +3856,16 @@ * before the force will prevent the log force from flushing the iclogs * to disk. * - * Re-entry due to a log IO error shutdown during the log force is - * prevented by the atomicity of higher level shutdown code. + * When we are in recovery, there are no transactions to flush, and + * we don't want to touch the log because we don't want to perturb the + * current head/tail for future recovery attempts. Hence we need to + * avoid a log force in this case. + * + * If we are shutting down due to a log IO error, then we must avoid + * trying to write the log as that may just result in more IO errors and + * an endless shutdown/force loop. */ - if (!log_error) + if (!log_error && !xlog_in_recovery(log)) xfs_log_force(log->l_mp, XFS_LOG_SYNC); /* -- cgit v1.2.3 From 919edbadebe17a67193533f531c2920c03e40fa4 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 29 Mar 2022 18:22:02 -0700 Subject: xfs: drop async cache flushes from CIL commits. Jan Kara reported a performance regression in dbench that he bisected down to commit bad77c375e8d ("xfs: CIL checkpoint flushes caches unconditionally"). Whilst developing the journal flush/fua optimisations this cache flush was part of, it appeared to make a significant difference to performance. However, now that this patchset has settled and all the correctness issues are fixed, there does not appear to be any significant performance benefit to asynchronous cache flushes. In fact, the opposite is true on some storage types and workloads, where additional cache flushes that can occur from fsync heavy workloads have a measurable and significant impact on overall throughput. Local dbench testing shows little difference on dbench runs with sync vs async cache flushes on either fast or slow SSD storage, and no difference in streaming concurrent async transaction workloads like fs-mark. Fast NVME storage.
From 919edbadebe17a67193533f531c2920c03e40fa4 Mon Sep 17 00:00:00 2001
From: Dave Chinner
Date: Tue, 29 Mar 2022 18:22:02 -0700
Subject: xfs: drop async cache flushes from CIL commits.

Jan Kara reported a performance regression in dbench that he bisected
down to commit bad77c375e8d ("xfs: CIL checkpoint flushes caches
unconditionally").

Whilst developing the journal flush/fua optimisations this cache flush
was part of, it appeared to make a significant difference to
performance. However, now that this patchset has settled and all the
correctness issues fixed, there does not appear to be any significant
performance benefit to asynchronous cache flushes.

In fact, the opposite is true on some storage types and workloads,
where additional cache flushes that can occur from fsync heavy
workloads have measurable and significant impact on overall throughput.

Local dbench testing shows little difference on dbench runs with sync
vs async cache flushes on either fast or slow SSD storage, and no
difference in streaming concurrent async transaction workloads like
fs-mark.

Fast NVME storage:

From `dbench -t 30`, CIL scale:

 clients         async                   sync
                BW      Latency         BW      Latency
      1         935.18   0.855          915.64   0.903
      8        2404.51   6.873         2341.77   6.511
     16        3003.42   6.460         2931.57   6.529
     32        3697.23   7.939         3596.28   7.894
    128        7237.43  15.495         7217.74  11.588
    512        5079.24  90.587         5167.08  95.822

fsmark, 32 threads, create w/ 64 byte xattr w/32k logbsize

        create          chown           unlink
async   1m41s           1m16s           2m03s
sync    1m40s           1m19s           1m54s

Slower SATA SSD storage:

From `dbench -t 30`, CIL scale:

 clients         async                   sync
                BW      Latency         BW      Latency
      1          78.59  15.792           83.78  10.729
      8         367.88  92.067          404.63  59.943
     16         564.51  72.524          602.71  76.089
     32         831.66 105.984          870.26 110.482
    128        1659.76 102.969         1624.73  91.356
    512        2135.91 223.054         2603.07 161.160

fsmark, 16 threads, create w/32k logbsize

        create          unlink
async   5m06s           4m15s
sync    5m00s           4m22s

And on Jan's test machine:

                  5.18-rc8-vanilla        5.18-rc8-patched
Amean     1        71.22 (   0.00%)       64.94 *   8.81%*
Amean     2        93.03 (   0.00%)       84.80 *   8.85%*
Amean     4       150.54 (   0.00%)      137.51 *   8.66%*
Amean     8       252.53 (   0.00%)      242.24 *   4.08%*
Amean    16       454.13 (   0.00%)      439.08 *   3.31%*
Amean    32       835.24 (   0.00%)      829.74 *   0.66%*
Amean    64      1740.59 (   0.00%)     1686.73 *   3.09%*

Performance and cache flush behaviour is restored to pre-regression
levels.

As such, we can now consider the async cache flush mechanism an
unnecessary exercise in premature optimisation, and hence we can remove
it and the infrastructure it requires completely.

Fixes: bad77c375e8d ("xfs: CIL checkpoint flushes caches unconditionally")
Reported-and-tested-by: Jan Kara
Signed-off-by: Dave Chinner
Reviewed-by: Darrick J. Wong
Signed-off-by: Darrick J. Wong
---
 fs/xfs/xfs_bio_io.c   | 35 -----------------------------------
 fs/xfs/xfs_linux.h    |  2 --
 fs/xfs/xfs_log.c      | 36 +++++++++++-------------------------
 fs/xfs/xfs_log_cil.c  | 42 +++++++++++++-----------------------------
 fs/xfs/xfs_log_priv.h |  3 +--
 5 files changed, 25 insertions(+), 93 deletions(-)

(limited to 'fs')
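
With the asynchronous helper below removed, checkpoint ordering relies on the REQ_PREFLUSH flag set on the first iclog write (see the xfs_log_cil.c hunk further down). For comparison, the synchronous equivalent of the deleted helper is a single empty pre-flush bio submitted and waited on. This is a hedged sketch only, mirroring the removed code with submit_bio_wait() in place of the completion dance; the function name is hypothetical, and in-tree code would simply call blkdev_issue_flush():

        /*
         * Sketch only: the synchronous analogue of xfs_flush_bdev_async().
         * One empty bio with REQ_PREFLUSH, submitted and waited upon.
         */
        static int sketch_flush_bdev_sync(struct block_device *bdev)
        {
                struct bio      bio;

                bio_init(&bio, NULL, 0);
                bio_set_dev(&bio, bdev);
                bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;

                return submit_bio_wait(&bio);
        }
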
diff --git a/fs/xfs/xfs_bio_io.c b/fs/xfs/xfs_bio_io.c
index 667e297f59b1..17f36db2f792 100644
--- a/fs/xfs/xfs_bio_io.c
+++ b/fs/xfs/xfs_bio_io.c
@@ -9,41 +9,6 @@ static inline unsigned int bio_max_vecs(unsigned int count)
        return bio_max_segs(howmany(count, PAGE_SIZE));
 }
 
-static void
-xfs_flush_bdev_async_endio(
-       struct bio              *bio)
-{
-       complete(bio->bi_private);
-}
-
-/*
- * Submit a request for an async cache flush to run. If the request queue does
- * not require flush operations, just skip it altogether. If the caller needs
- * to wait for the flush completion at a later point in time, they must supply a
- * valid completion. This will be signalled when the flush completes. The
- * caller never sees the bio that is issued here.
- */
-void
-xfs_flush_bdev_async(
-       struct bio              *bio,
-       struct block_device     *bdev,
-       struct completion       *done)
-{
-       struct request_queue    *q = bdev->bd_disk->queue;
-
-       if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags)) {
-               complete(done);
-               return;
-       }
-
-       bio_init(bio, NULL, 0);
-       bio_set_dev(bio, bdev);
-       bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;
-       bio->bi_private = done;
-       bio->bi_end_io = xfs_flush_bdev_async_endio;
-
-       submit_bio(bio);
-}
 int
 xfs_rw_bdev(
        struct block_device     *bdev,
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index 09a8fba84ff9..cb9105d667db 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -197,8 +197,6 @@ static inline uint64_t howmany_64(uint64_t x, uint32_t y)
 
 int xfs_rw_bdev(struct block_device *bdev, sector_t sector, unsigned int count,
                char *data, unsigned int op);
-void xfs_flush_bdev_async(struct bio *bio, struct block_device *bdev,
-               struct completion *done);
 
 #define ASSERT_ALWAYS(expr)    \
        (likely(expr) ? (void)0 : assfail(NULL, #expr, __FILE__, __LINE__))
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index fc893d370b91..af22cbae9124 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -527,12 +527,6 @@ xlog_state_shutdown_callbacks(
  * Flush iclog to disk if this is the last reference to the given iclog and
  * it is in the WANT_SYNC state.
  *
- * If the caller passes in a non-zero @old_tail_lsn and the current log tail
- * does not match, there may be metadata on disk that must be persisted before
- * this iclog is written. To satisfy that requirement, set the
- * XLOG_ICL_NEED_FLUSH flag as a condition for writing this iclog with the new
- * log tail value.
- *
  * If XLOG_ICL_NEED_FUA is already set on the iclog, we need to ensure that the
  * log tail is updated correctly. NEED_FUA indicates that the iclog will be
  * written to stable storage, and implies that a commit record is contained
@@ -549,12 +543,10 @@ xlog_state_shutdown_callbacks(
  * always capture the tail lsn on the iclog on the first NEED_FUA release
  * regardless of the number of active reference counts on this iclog.
  */
-
 int
 xlog_state_release_iclog(
        struct xlog             *log,
-       struct xlog_in_core     *iclog,
-       xfs_lsn_t               old_tail_lsn)
+       struct xlog_in_core     *iclog)
 {
        xfs_lsn_t               tail_lsn;
        bool                    last_ref;
 
@@ -565,18 +557,14 @@ xlog_state_release_iclog(
        /*
         * Grabbing the current log tail needs to be atomic w.r.t. the writing
         * of the tail LSN into the iclog so we guarantee that the log tail does
-        * not move between deciding if a cache flush is required and writing
-        * the LSN into the iclog below.
+        * not move between the first time we know that the iclog needs to be
+        * made stable and when we eventually submit it.
        */
-       if (old_tail_lsn || iclog->ic_state == XLOG_STATE_WANT_SYNC) {
+       if ((iclog->ic_state == XLOG_STATE_WANT_SYNC ||
+            (iclog->ic_flags & XLOG_ICL_NEED_FUA)) &&
+           !iclog->ic_header.h_tail_lsn) {
                tail_lsn = xlog_assign_tail_lsn(log->l_mp);
-
-               if (old_tail_lsn && tail_lsn != old_tail_lsn)
-                       iclog->ic_flags |= XLOG_ICL_NEED_FLUSH;
-
-               if ((iclog->ic_flags & XLOG_ICL_NEED_FUA) &&
-                   !iclog->ic_header.h_tail_lsn)
-                       iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn);
+               iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn);
        }
 
        last_ref = atomic_dec_and_test(&iclog->ic_refcnt);
@@ -601,8 +589,6 @@ xlog_state_release_iclog(
        }
 
        iclog->ic_state = XLOG_STATE_SYNCING;
-       if (!iclog->ic_header.h_tail_lsn)
-               iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn);
        xlog_verify_tail_lsn(log, iclog);
        trace_xlog_iclog_syncing(iclog, _RET_IP_);
 
@@ -874,7 +860,7 @@ xlog_force_iclog(
        iclog->ic_flags |= XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA;
        if (iclog->ic_state == XLOG_STATE_ACTIVE)
                xlog_state_switch_iclogs(iclog->ic_log, iclog, 0);
-       return xlog_state_release_iclog(iclog->ic_log, iclog, 0);
+       return xlog_state_release_iclog(iclog->ic_log, iclog);
 }
 
 /*
@@ -2412,7 +2398,7 @@ xlog_write_copy_finish(
                ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC ||
                        xlog_is_shutdown(log));
 release_iclog:
-       error = xlog_state_release_iclog(log, iclog, 0);
+       error = xlog_state_release_iclog(log, iclog);
        spin_unlock(&log->l_icloglock);
        return error;
 }
@@ -2629,7 +2615,7 @@ next_lv:
 
        spin_lock(&log->l_icloglock);
        xlog_state_finish_copy(log, iclog, record_cnt, data_cnt);
-       error = xlog_state_release_iclog(log, iclog, 0);
+       error = xlog_state_release_iclog(log, iclog);
        spin_unlock(&log->l_icloglock);
 
        return error;
@@ -3053,7 +3039,7 @@ restart:
         * reference to the iclog.
         */
        if (!atomic_add_unless(&iclog->ic_refcnt, -1, 1))
-               error = xlog_state_release_iclog(log, iclog, 0);
+               error = xlog_state_release_iclog(log, iclog);
        spin_unlock(&log->l_icloglock);
        if (error)
                return error;
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 767c386ed4ce..ba57323bfdce 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -705,11 +705,21 @@ xlog_cil_set_ctx_write_state(
                 * The LSN we need to pass to the log items on transaction
                 * commit is the LSN reported by the first log vector write, not
                 * the commit lsn. If we use the commit record lsn then we can
-                * move the tail beyond the grant write head.
+                * move the grant write head beyond the tail LSN and overwrite
+                * it.
                 */
                ctx->start_lsn = lsn;
                wake_up_all(&cil->xc_start_wait);
                spin_unlock(&cil->xc_push_lock);
+
+               /*
+                * Make sure the metadata we are about to overwrite in the log
+                * has been flushed to stable storage before this iclog is
+                * issued.
+                */
+               spin_lock(&cil->xc_log->l_icloglock);
+               iclog->ic_flags |= XLOG_ICL_NEED_FLUSH;
+               spin_unlock(&cil->xc_log->l_icloglock);
                return;
        }
 
@@ -888,10 +898,7 @@ xlog_cil_push_work(
        struct xfs_trans_header thdr;
        struct xfs_log_iovec    lhdr;
        struct xfs_log_vec      lvhdr = { NULL };
-       xfs_lsn_t               preflush_tail_lsn;
        xfs_csn_t               push_seq;
-       struct bio              bio;
-       DECLARE_COMPLETION_ONSTACK(bdev_flush);
        bool                    push_commit_stable;
 
        new_ctx = xlog_cil_ctx_alloc();
@@ -961,23 +968,6 @@ xlog_cil_push_work(
        list_add(&ctx->committing, &cil->xc_committing);
        spin_unlock(&cil->xc_push_lock);
 
-       /*
-        * The CIL is stable at this point - nothing new will be added to it
-        * because we hold the flush lock exclusively. Hence we can now issue
-        * a cache flush to ensure all the completed metadata in the journal we
-        * are about to overwrite is on stable storage.
-        *
-        * Because we are issuing this cache flush before we've written the
-        * tail lsn to the iclog, we can have metadata IO completions move the
-        * tail forwards between the completion of this flush and the iclog
-        * being written. In this case, we need to re-issue the cache flush
-        * before the iclog write. To detect whether the log tail moves, sample
-        * the tail LSN *before* we issue the flush.
-        */
-       preflush_tail_lsn = atomic64_read(&log->l_tail_lsn);
-       xfs_flush_bdev_async(&bio, log->l_mp->m_ddev_targp->bt_bdev,
-                               &bdev_flush);
-
        /*
         * Pull all the log vectors off the items in the CIL, and remove the
         * items from the CIL. We don't need the CIL lock here because it's only
@@ -1054,12 +1044,6 @@ xlog_cil_push_work(
        lvhdr.lv_iovecp = &lhdr;
        lvhdr.lv_next = ctx->lv_chain;
 
-       /*
-        * Before we format and submit the first iclog, we have to ensure that
-        * the metadata writeback ordering cache flush is complete.
-        */
-       wait_for_completion(&bdev_flush);
-
        error = xlog_cil_write_chain(ctx, &lvhdr);
        if (error)
                goto out_abort_free_ticket;
@@ -1118,7 +1102,7 @@ xlog_cil_push_work(
        if (push_commit_stable &&
            ctx->commit_iclog->ic_state == XLOG_STATE_ACTIVE)
                xlog_state_switch_iclogs(log, ctx->commit_iclog, 0);
-       xlog_state_release_iclog(log, ctx->commit_iclog, preflush_tail_lsn);
+       xlog_state_release_iclog(log, ctx->commit_iclog);
 
        /* Not safe to reference ctx now! */
 
@@ -1139,7 +1123,7 @@ out_abort_free_ticket:
                return;
        }
        spin_lock(&log->l_icloglock);
-       xlog_state_release_iclog(log, ctx->commit_iclog, 0);
+       xlog_state_release_iclog(log, ctx->commit_iclog);
        /* Not safe to reference ctx now! */
        spin_unlock(&log->l_icloglock);
 }
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index cd0508e26fec..401cdc400980 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -535,8 +535,7 @@ void        xfs_log_ticket_regrant(struct xlog *log, struct xlog_ticket *ticket);
 
 void   xlog_state_switch_iclogs(struct xlog *log, struct xlog_in_core *iclog,
                int eventual_size);
-int    xlog_state_release_iclog(struct xlog *log, struct xlog_in_core *iclog,
-               xfs_lsn_t log_tail_lsn);
+int    xlog_state_release_iclog(struct xlog *log, struct xlog_in_core *iclog);
 
 /*
  * When we crack an atomic LSN, we sample it first so that the value will not
--
cgit v1.2.3
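
The net effect of the xlog_state_release_iclog() change is easiest to see in isolation: the tail LSN is captured exactly once, the first time we know the iclog will be made stable, and it never moves afterwards. Here is a minimal userspace sketch of that rule, with invented names standing in for the iclog fields:

        #include <stdbool.h>
        #include <stdint.h>
        #include <stdio.h>

        enum sketch_state { SKETCH_ACTIVE, SKETCH_WANT_SYNC };

        /* Invented stand-in for the iclog fields this logic touches. */
        struct sketch_iclog {
                enum sketch_state       state;
                bool                    need_fua;
                uint64_t                tail_lsn;       /* 0 means "not yet captured" */
        };

        static uint64_t sketch_assign_tail_lsn(void)
        {
                static uint64_t tail = 100;

                return tail++;          /* the real tail can move between calls */
        }

        /* Model of the tail capture in xlog_state_release_iclog(). */
        static void sketch_release_iclog(struct sketch_iclog *iclog)
        {
                if ((iclog->state == SKETCH_WANT_SYNC || iclog->need_fua) &&
                    !iclog->tail_lsn)
                        iclog->tail_lsn = sketch_assign_tail_lsn();
        }

        int main(void)
        {
                struct sketch_iclog iclog = { SKETCH_WANT_SYNC, true, 0 };

                sketch_release_iclog(&iclog);
                sketch_release_iclog(&iclog);   /* must not move the captured tail */
                printf("tail lsn %llu\n", (unsigned long long)iclog.tail_lsn);
                return 0;
        }
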