summaryrefslogtreecommitdiff
path: root/fs/xfs
diff options
context:
space:
mode:
authorDave Chinner <david@fromorbit.com>2023-04-14 00:10:41 +0300
committerDave Chinner <dchinner@redhat.com>2023-04-14 00:10:41 +0300
commit1e7912349ebcc194aba463b2c8128ba809ee4b64 (patch)
tree6d1702bd340430ff01fcb8fefc2635c4f4fca6d9 /fs/xfs
parenta44667226d32a168a0953f9382cd8503ec947d7d (diff)
parent1fc7a0597d237c17b6501f8c33b76d3eaaae9079 (diff)
downloadlinux-1e7912349ebcc194aba463b2c8128ba809ee4b64.tar.xz
Merge tag 'scrub-iget-fixes-6.4_2023-04-12' of git://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux into guilt/xfs-for-next
xfs: fix iget/irele usage in online fsck [v24.5] This patchset fixes a handful of problems relating to how we get and release incore inodes in the online scrub code. The first patch fixes how we handle DONTCACHE -- our reasons for setting (or clearing) it depend entirely on the runtime environment at irele time. Hence we can refactor iget and irele to use our own wrappers that set that context appropriately. The second patch fixes a race between the iget call in the inode core scrubber and other writer threads that are allocating or freeing inodes in the same AG by changing the behavior of xchk_iget (and the inode core scrub setup function) to return either an incore inode or the AGI buffer so that we can be sure that the inode cannot disappear on us. The final patch elides MMAPLOCK from scrub paths when possible. It did not fit anywhere else. Signed-off-by: Darrick J. Wong <djwong@kernel.org> Signed-off-by: Dave Chinner <david@fromorbit.com>
Diffstat (limited to 'fs/xfs')
-rw-r--r--fs/xfs/scrub/bmap.c9
-rw-r--r--fs/xfs/scrub/common.c300
-rw-r--r--fs/xfs/scrub/common.h10
-rw-r--r--fs/xfs/scrub/dir.c14
-rw-r--r--fs/xfs/scrub/inode.c177
-rw-r--r--fs/xfs/scrub/parent.c13
-rw-r--r--fs/xfs/scrub/scrub.c2
-rw-r--r--fs/xfs/xfs_icache.c3
-rw-r--r--fs/xfs/xfs_icache.h11
9 files changed, 438 insertions, 101 deletions
diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c
index 2412dcf0fa9a..e485a546a758 100644
--- a/fs/xfs/scrub/bmap.c
+++ b/fs/xfs/scrub/bmap.c
@@ -34,12 +34,12 @@ xchk_setup_inode_bmap(
if (xchk_need_intent_drain(sc))
xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN);
- error = xchk_get_inode(sc);
+ error = xchk_iget_for_scrubbing(sc);
if (error)
goto out;
- sc->ilock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
- xfs_ilock(sc->ip, sc->ilock_flags);
+ sc->ilock_flags = XFS_IOLOCK_EXCL;
+ xfs_ilock(sc->ip, XFS_IOLOCK_EXCL);
/*
* We don't want any ephemeral data fork updates sitting around
@@ -50,6 +50,9 @@ xchk_setup_inode_bmap(
sc->sm->sm_type == XFS_SCRUB_TYPE_BMBTD) {
struct address_space *mapping = VFS_I(sc->ip)->i_mapping;
+ sc->ilock_flags |= XFS_MMAPLOCK_EXCL;
+ xfs_ilock(sc->ip, XFS_MMAPLOCK_EXCL);
+
inode_dio_wait(VFS_I(sc->ip));
/*
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
index 813ded91661b..9aa79665c608 100644
--- a/fs/xfs/scrub/common.c
+++ b/fs/xfs/scrub/common.c
@@ -643,6 +643,14 @@ xchk_ag_init(
/* Per-scrubber setup functions */
+void
+xchk_trans_cancel(
+ struct xfs_scrub *sc)
+{
+ xfs_trans_cancel(sc->tp);
+ sc->tp = NULL;
+}
+
/*
* Grab an empty transaction so that we can re-grab locked buffers if
* one of our btrees turns out to be cyclic.
@@ -718,80 +726,273 @@ xchk_checkpoint_log(
return 0;
}
+/* Verify that an inode is allocated ondisk, then return its cached inode. */
+int
+xchk_iget(
+ struct xfs_scrub *sc,
+ xfs_ino_t inum,
+ struct xfs_inode **ipp)
+{
+ return xfs_iget(sc->mp, sc->tp, inum, XFS_IGET_UNTRUSTED, 0, ipp);
+}
+
+/*
+ * Try to grab an inode in a manner that avoids races with physical inode
+ * allocation. If we can't, return the locked AGI buffer so that the caller
+ * can single-step the loading process to see where things went wrong.
+ * Callers must have a valid scrub transaction.
+ *
+ * If the iget succeeds, return 0, a NULL AGI, and the inode.
+ *
+ * If the iget fails, return the error, the locked AGI, and a NULL inode. This
+ * can include -EINVAL and -ENOENT for invalid inode numbers or inodes that are
+ * no longer allocated; or any other corruption or runtime error.
+ *
+ * If the AGI read fails, return the error, a NULL AGI, and NULL inode.
+ *
+ * If a fatal signal is pending, return -EINTR, a NULL AGI, and a NULL inode.
+ */
+int
+xchk_iget_agi(
+ struct xfs_scrub *sc,
+ xfs_ino_t inum,
+ struct xfs_buf **agi_bpp,
+ struct xfs_inode **ipp)
+{
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_trans *tp = sc->tp;
+ struct xfs_perag *pag;
+ int error;
+
+ ASSERT(sc->tp != NULL);
+
+again:
+ *agi_bpp = NULL;
+ *ipp = NULL;
+ error = 0;
+
+ if (xchk_should_terminate(sc, &error))
+ return error;
+
+ /*
+ * Attach the AGI buffer to the scrub transaction to avoid deadlocks
+ * in the iget cache miss path.
+ */
+ pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum));
+ error = xfs_ialloc_read_agi(pag, tp, agi_bpp);
+ xfs_perag_put(pag);
+ if (error)
+ return error;
+
+ error = xfs_iget(mp, tp, inum,
+ XFS_IGET_NORETRY | XFS_IGET_UNTRUSTED, 0, ipp);
+ if (error == -EAGAIN) {
+ /*
+ * The inode may be in core but temporarily unavailable and may
+ * require the AGI buffer before it can be returned. Drop the
+ * AGI buffer and retry the lookup.
+ *
+ * Incore lookup will fail with EAGAIN on a cache hit if the
+ * inode is queued to the inactivation list. The inactivation
+ * worker may remove the inode from the unlinked list and hence
+ * needs the AGI.
+ *
+ * Hence xchk_iget_agi() needs to drop the AGI lock on EAGAIN
+ * to allow inodegc to make progress and move the inode to
+ * IRECLAIMABLE state where xfs_iget will be able to return it
+ * again if it can lock the inode.
+ */
+ xfs_trans_brelse(tp, *agi_bpp);
+ delay(1);
+ goto again;
+ }
+ if (error)
+ return error;
+
+ /* We got the inode, so we can release the AGI. */
+ ASSERT(*ipp != NULL);
+ xfs_trans_brelse(tp, *agi_bpp);
+ *agi_bpp = NULL;
+ return 0;
+}
+
+/* Install an inode that we opened by handle for scrubbing. */
+int
+xchk_install_handle_inode(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip)
+{
+ if (VFS_I(ip)->i_generation != sc->sm->sm_gen) {
+ xchk_irele(sc, ip);
+ return -ENOENT;
+ }
+
+ sc->ip = ip;
+ return 0;
+}
+
/*
- * Given an inode and the scrub control structure, grab either the
- * inode referenced in the control structure or the inode passed in.
- * The inode is not locked.
+ * In preparation to scrub metadata structures that hang off of an inode,
+ * grab either the inode referenced in the scrub control structure or the
+ * inode passed in. If the inumber does not reference an allocated inode
+ * record, the function returns ENOENT to end the scrub early. The inode
+ * is not locked.
*/
int
-xchk_get_inode(
+xchk_iget_for_scrubbing(
struct xfs_scrub *sc)
{
struct xfs_imap imap;
struct xfs_mount *mp = sc->mp;
struct xfs_perag *pag;
+ struct xfs_buf *agi_bp;
struct xfs_inode *ip_in = XFS_I(file_inode(sc->file));
struct xfs_inode *ip = NULL;
+ xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, sc->sm->sm_ino);
int error;
+ ASSERT(sc->tp == NULL);
+
/* We want to scan the inode we already had opened. */
if (sc->sm->sm_ino == 0 || sc->sm->sm_ino == ip_in->i_ino) {
sc->ip = ip_in;
return 0;
}
- /* Look up the inode, see if the generation number matches. */
+ /* Reject internal metadata files and obviously bad inode numbers. */
if (xfs_internal_inum(mp, sc->sm->sm_ino))
return -ENOENT;
- error = xfs_iget(mp, NULL, sc->sm->sm_ino,
- XFS_IGET_UNTRUSTED | XFS_IGET_DONTCACHE, 0, &ip);
- switch (error) {
- case -ENOENT:
- /* Inode doesn't exist, just bail out. */
+ if (!xfs_verify_ino(sc->mp, sc->sm->sm_ino))
+ return -ENOENT;
+
+ /* Try a regular untrusted iget. */
+ error = xchk_iget(sc, sc->sm->sm_ino, &ip);
+ if (!error)
+ return xchk_install_handle_inode(sc, ip);
+ if (error == -ENOENT)
return error;
- case 0:
- /* Got an inode, continue. */
- break;
- case -EINVAL:
+ if (error != -EINVAL)
+ goto out_error;
+
+ /*
+ * EINVAL with IGET_UNTRUSTED probably means one of several things:
+ * userspace gave us an inode number that doesn't correspond to fs
+ * space; the inode btree lacks a record for this inode; or there is a
+ * record, and it says this inode is free.
+ *
+ * We want to look up this inode in the inobt to distinguish two
+ * scenarios: (1) the inobt says the inode is free, in which case
+ * there's nothing to do; and (2) the inobt says the inode is
+ * allocated, but loading it failed due to corruption.
+ *
+ * Allocate a transaction and grab the AGI to prevent inobt activity
+ * in this AG. Retry the iget in case someone allocated a new inode
+ * after the first iget failed.
+ */
+ error = xchk_trans_alloc(sc, 0);
+ if (error)
+ goto out_error;
+
+ error = xchk_iget_agi(sc, sc->sm->sm_ino, &agi_bp, &ip);
+ if (error == 0) {
+ /* Actually got the inode, so install it. */
+ xchk_trans_cancel(sc);
+ return xchk_install_handle_inode(sc, ip);
+ }
+ if (error == -ENOENT)
+ goto out_gone;
+ if (error != -EINVAL)
+ goto out_cancel;
+
+ /* Ensure that we have protected against inode allocation/freeing. */
+ if (agi_bp == NULL) {
+ ASSERT(agi_bp != NULL);
+ error = -ECANCELED;
+ goto out_cancel;
+ }
+
+ /*
+ * Untrusted iget failed a second time. Let's try an inobt lookup.
+ * If the inobt thinks this inode neither can exist inside the
+ * filesystem nor is allocated, return ENOENT to signal that the check
+ * can be skipped.
+ *
+ * If the lookup returns corruption, we'll mark this inode corrupt and
+ * exit to userspace. There's little chance of fixing anything until
+ * the inobt is straightened out, but there's nothing we can do here.
+ *
+ * If the lookup encounters any other error, exit to userspace.
+ *
+ * If the lookup succeeds, something else must be very wrong in the fs
+ * such that setting up the incore inode failed in some strange way.
+ * Treat those as corruptions.
+ */
+ pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, sc->sm->sm_ino));
+ if (!pag) {
+ error = -EFSCORRUPTED;
+ goto out_cancel;
+ }
+
+ error = xfs_imap(pag, sc->tp, sc->sm->sm_ino, &imap,
+ XFS_IGET_UNTRUSTED);
+ xfs_perag_put(pag);
+ if (error == -EINVAL || error == -ENOENT)
+ goto out_gone;
+ if (!error)
+ error = -EFSCORRUPTED;
+
+out_cancel:
+ xchk_trans_cancel(sc);
+out_error:
+ trace_xchk_op_error(sc, agno, XFS_INO_TO_AGBNO(mp, sc->sm->sm_ino),
+ error, __return_address);
+ return error;
+out_gone:
+ /* The file is gone, so there's nothing to check. */
+ xchk_trans_cancel(sc);
+ return -ENOENT;
+}
+
+/* Release an inode, possibly dropping it in the process. */
+void
+xchk_irele(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip)
+{
+ if (current->journal_info != NULL) {
+ ASSERT(current->journal_info == sc->tp);
+
/*
- * -EINVAL with IGET_UNTRUSTED could mean one of several
- * things: userspace gave us an inode number that doesn't
- * correspond to fs space, or doesn't have an inobt entry;
- * or it could simply mean that the inode buffer failed the
- * read verifiers.
+ * If we are in a transaction, we /cannot/ drop the inode
+ * ourselves, because the VFS will trigger writeback, which
+ * can require a transaction. Clear DONTCACHE to force the
+ * inode to the LRU, where someone else can take care of
+ * dropping it.
*
- * Try just the inode mapping lookup -- if it succeeds, then
- * the inode buffer verifier failed and something needs fixing.
- * Otherwise, we really couldn't find it so tell userspace
- * that it no longer exists.
+ * Note that when we grabbed our reference to the inode, it
+ * could have had an active ref and DONTCACHE set if a sysadmin
+ * is trying to coerce a change in file access mode. icache
+ * hits do not clear DONTCACHE, so we must do it here.
*/
- pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, sc->sm->sm_ino));
- if (pag) {
- error = xfs_imap(pag, sc->tp, sc->sm->sm_ino, &imap,
- XFS_IGET_UNTRUSTED | XFS_IGET_DONTCACHE);
- xfs_perag_put(pag);
- if (error)
- return -ENOENT;
- }
- error = -EFSCORRUPTED;
- fallthrough;
- default:
- trace_xchk_op_error(sc,
- XFS_INO_TO_AGNO(mp, sc->sm->sm_ino),
- XFS_INO_TO_AGBNO(mp, sc->sm->sm_ino),
- error, __return_address);
- return error;
- }
- if (VFS_I(ip)->i_generation != sc->sm->sm_gen) {
- xfs_irele(ip);
- return -ENOENT;
+ spin_lock(&VFS_I(ip)->i_lock);
+ VFS_I(ip)->i_state &= ~I_DONTCACHE;
+ spin_unlock(&VFS_I(ip)->i_lock);
+ } else if (atomic_read(&VFS_I(ip)->i_count) == 1) {
+ /*
+ * If this is the last reference to the inode and the caller
+ * permits it, set DONTCACHE to avoid thrashing.
+ */
+ d_mark_dontcache(VFS_I(ip));
}
- sc->ip = ip;
- return 0;
+ xfs_irele(ip);
}
-/* Set us up to scrub a file's contents. */
+/*
+ * Set us up to scrub metadata mapped by a file's fork. Callers must not use
+ * this to operate on user-accessible regular file data because the MMAPLOCK is
+ * not taken.
+ */
int
xchk_setup_inode_contents(
struct xfs_scrub *sc,
@@ -799,13 +1000,14 @@ xchk_setup_inode_contents(
{
int error;
- error = xchk_get_inode(sc);
+ error = xchk_iget_for_scrubbing(sc);
if (error)
return error;
- /* Got the inode, lock it and we're ready to go. */
- sc->ilock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
+ /* Lock the inode so the VFS cannot touch this file. */
+ sc->ilock_flags = XFS_IOLOCK_EXCL;
xfs_ilock(sc->ip, sc->ilock_flags);
+
error = xchk_trans_alloc(sc, resblks);
if (error)
goto out;
diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h
index 544f86ff8d1d..18b5f2b62f13 100644
--- a/fs/xfs/scrub/common.h
+++ b/fs/xfs/scrub/common.h
@@ -32,6 +32,8 @@ xchk_should_terminate(
}
int xchk_trans_alloc(struct xfs_scrub *sc, uint resblks);
+void xchk_trans_cancel(struct xfs_scrub *sc);
+
bool xchk_process_error(struct xfs_scrub *sc, xfs_agnumber_t agno,
xfs_agblock_t bno, int *error);
bool xchk_fblock_process_error(struct xfs_scrub *sc, int whichfork,
@@ -133,10 +135,16 @@ int xchk_count_rmap_ownedby_ag(struct xfs_scrub *sc, struct xfs_btree_cur *cur,
const struct xfs_owner_info *oinfo, xfs_filblks_t *blocks);
int xchk_setup_ag_btree(struct xfs_scrub *sc, bool force_log);
-int xchk_get_inode(struct xfs_scrub *sc);
+int xchk_iget_for_scrubbing(struct xfs_scrub *sc);
int xchk_setup_inode_contents(struct xfs_scrub *sc, unsigned int resblks);
void xchk_buffer_recheck(struct xfs_scrub *sc, struct xfs_buf *bp);
+int xchk_iget(struct xfs_scrub *sc, xfs_ino_t inum, struct xfs_inode **ipp);
+int xchk_iget_agi(struct xfs_scrub *sc, xfs_ino_t inum,
+ struct xfs_buf **agi_bpp, struct xfs_inode **ipp);
+void xchk_irele(struct xfs_scrub *sc, struct xfs_inode *ip);
+int xchk_install_handle_inode(struct xfs_scrub *sc, struct xfs_inode *ip);
+
/*
* Don't bother cross-referencing if we already found corruption or cross
* referencing discrepancies.
diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c
index 6404201d3d36..0b491784b759 100644
--- a/fs/xfs/scrub/dir.c
+++ b/fs/xfs/scrub/dir.c
@@ -117,21 +117,15 @@ xchk_dir_actor(
}
/*
- * Grab the inode pointed to by the dirent. We release the
- * inode before we cancel the scrub transaction. Since we're
- * don't know a priori that releasing the inode won't trigger
- * eofblocks cleanup (which allocates what would be a nested
- * transaction), we can't use DONTCACHE here because DONTCACHE
- * inodes can trigger immediate inactive cleanup of the inode.
- * Use UNTRUSTED here to check the allocation status of the inode in
- * the inode btrees.
+ * Grab the inode pointed to by the dirent. We release the inode
+ * before we cancel the scrub transaction.
*
* If _iget returns -EINVAL or -ENOENT then the child inode number is
* garbage and the directory is corrupt. If the _iget returns
* -EFSCORRUPTED or -EFSBADCRC then the child is corrupt which is a
* cross referencing error. Any other error is an operational error.
*/
- error = xfs_iget(mp, sc->tp, ino, XFS_IGET_UNTRUSTED, 0, &ip);
+ error = xchk_iget(sc, ino, &ip);
if (error == -EINVAL || error == -ENOENT) {
error = -EFSCORRUPTED;
xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error);
@@ -141,7 +135,7 @@ xchk_dir_actor(
goto out;
xchk_dir_check_ftype(sc, offset, ip, name->type);
- xfs_irele(ip);
+ xchk_irele(sc, ip);
out:
if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
return -ECANCELED;
diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c
index 2db96c8a71dc..3e1e02e340a6 100644
--- a/fs/xfs/scrub/inode.c
+++ b/fs/xfs/scrub/inode.c
@@ -11,8 +11,11 @@
#include "xfs_mount.h"
#include "xfs_btree.h"
#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_ag.h"
#include "xfs_inode.h"
#include "xfs_ialloc.h"
+#include "xfs_icache.h"
#include "xfs_da_format.h"
#include "xfs_reflink.h"
#include "xfs_rmap.h"
@@ -20,48 +23,176 @@
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/btree.h"
+#include "scrub/trace.h"
+
+/* Prepare the attached inode for scrubbing. */
+static inline int
+xchk_prepare_iscrub(
+ struct xfs_scrub *sc)
+{
+ int error;
+
+ sc->ilock_flags = XFS_IOLOCK_EXCL;
+ xfs_ilock(sc->ip, sc->ilock_flags);
+
+ error = xchk_trans_alloc(sc, 0);
+ if (error)
+ return error;
+
+ sc->ilock_flags |= XFS_ILOCK_EXCL;
+ xfs_ilock(sc->ip, XFS_ILOCK_EXCL);
+ return 0;
+}
+
+/* Install this scrub-by-handle inode and prepare it for scrubbing. */
+static inline int
+xchk_install_handle_iscrub(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip)
+{
+ int error;
+
+ error = xchk_install_handle_inode(sc, ip);
+ if (error)
+ return error;
+
+ return xchk_prepare_iscrub(sc);
+}
/*
- * Grab total control of the inode metadata. It doesn't matter here if
- * the file data is still changing; exclusive access to the metadata is
- * the goal.
+ * Grab total control of the inode metadata. In the best case, we grab the
+ * incore inode and take all locks on it. If the incore inode cannot be
+ * constructed due to corruption problems, lock the AGI so that we can single
+ * step the loading process to fix everything that can go wrong.
*/
int
xchk_setup_inode(
struct xfs_scrub *sc)
{
+ struct xfs_imap imap;
+ struct xfs_inode *ip;
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_inode *ip_in = XFS_I(file_inode(sc->file));
+ struct xfs_buf *agi_bp;
+ struct xfs_perag *pag;
+ xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, sc->sm->sm_ino);
int error;
if (xchk_need_intent_drain(sc))
xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN);
+ /* We want to scan the opened inode, so lock it and exit. */
+ if (sc->sm->sm_ino == 0 || sc->sm->sm_ino == ip_in->i_ino) {
+ sc->ip = ip_in;
+ return xchk_prepare_iscrub(sc);
+ }
+
+ /* Reject internal metadata files and obviously bad inode numbers. */
+ if (xfs_internal_inum(mp, sc->sm->sm_ino))
+ return -ENOENT;
+ if (!xfs_verify_ino(sc->mp, sc->sm->sm_ino))
+ return -ENOENT;
+
+ /* Try a regular untrusted iget. */
+ error = xchk_iget(sc, sc->sm->sm_ino, &ip);
+ if (!error)
+ return xchk_install_handle_iscrub(sc, ip);
+ if (error == -ENOENT)
+ return error;
+ if (error != -EFSCORRUPTED && error != -EFSBADCRC && error != -EINVAL)
+ goto out_error;
+
/*
- * Try to get the inode. If the verifiers fail, we try again
- * in raw mode.
+ * EINVAL with IGET_UNTRUSTED probably means one of several things:
+ * userspace gave us an inode number that doesn't correspond to fs
+ * space; the inode btree lacks a record for this inode; or there is
+ * a record, and it says this inode is free.
+ *
+ * EFSCORRUPTED/EFSBADCRC could mean that the inode was mappable, but
+ * some other metadata corruption (e.g. inode forks) prevented
+ * instantiation of the incore inode. Or it could mean the inobt is
+ * corrupt.
+ *
+ * We want to look up this inode in the inobt directly to distinguish
+ * three different scenarios: (1) the inobt says the inode is free,
+ * in which case there's nothing to do; (2) the inobt is corrupt so we
+ * should flag the corruption and exit to userspace to let it fix the
+ * inobt; and (3) the inobt says the inode is allocated, but loading it
+ * failed due to corruption.
+ *
+ * Allocate a transaction and grab the AGI to prevent inobt activity in
+ * this AG. Retry the iget in case someone allocated a new inode after
+ * the first iget failed.
*/
- error = xchk_get_inode(sc);
- switch (error) {
- case 0:
- break;
- case -EFSCORRUPTED:
- case -EFSBADCRC:
- return xchk_trans_alloc(sc, 0);
- default:
- return error;
+ error = xchk_trans_alloc(sc, 0);
+ if (error)
+ goto out_error;
+
+ error = xchk_iget_agi(sc, sc->sm->sm_ino, &agi_bp, &ip);
+ if (error == 0) {
+ /* Actually got the incore inode, so install it and proceed. */
+ xchk_trans_cancel(sc);
+ return xchk_install_handle_iscrub(sc, ip);
+ }
+ if (error == -ENOENT)
+ goto out_gone;
+ if (error != -EFSCORRUPTED && error != -EFSBADCRC && error != -EINVAL)
+ goto out_cancel;
+
+ /* Ensure that we have protected against inode allocation/freeing. */
+ if (agi_bp == NULL) {
+ ASSERT(agi_bp != NULL);
+ error = -ECANCELED;
+ goto out_cancel;
}
- /* Got the inode, lock it and we're ready to go. */
- sc->ilock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
- xfs_ilock(sc->ip, sc->ilock_flags);
- error = xchk_trans_alloc(sc, 0);
+ /*
+ * Untrusted iget failed a second time. Let's try an inobt lookup.
+ * If the inobt doesn't think this is an allocated inode then we'll
+ * return ENOENT to signal that the check can be skipped.
+ *
+ * If the lookup signals corruption, we'll mark this inode corrupt and
+ * exit to userspace. There's little chance of fixing anything until
+ * the inobt is straightened out, but there's nothing we can do here.
+ *
+ * If the lookup encounters a runtime error, exit to userspace.
+ */
+ pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, sc->sm->sm_ino));
+ if (!pag) {
+ error = -EFSCORRUPTED;
+ goto out_cancel;
+ }
+
+ error = xfs_imap(pag, sc->tp, sc->sm->sm_ino, &imap,
+ XFS_IGET_UNTRUSTED);
+ xfs_perag_put(pag);
+ if (error == -EINVAL || error == -ENOENT)
+ goto out_gone;
if (error)
- goto out;
- sc->ilock_flags |= XFS_ILOCK_EXCL;
- xfs_ilock(sc->ip, XFS_ILOCK_EXCL);
+ goto out_cancel;
-out:
- /* scrub teardown will unlock and release the inode for us */
+ /*
+ * The lookup succeeded. Chances are the ondisk inode is corrupt and
+ * preventing iget from reading it. Retain the scrub transaction and
+ * the AGI buffer to prevent anyone from allocating or freeing inodes.
+ * This ensures that we preserve the inconsistency between the inobt
+ * saying the inode is allocated and the icache being unable to load
+ * the inode until we can flag the corruption in xchk_inode. The
+ * scrub function has to note the corruption, since we're not really
+ * supposed to do that from the setup function.
+ */
+ return 0;
+
+out_cancel:
+ xchk_trans_cancel(sc);
+out_error:
+ trace_xchk_op_error(sc, agno, XFS_INO_TO_AGBNO(mp, sc->sm->sm_ino),
+ error, __return_address);
return error;
+out_gone:
+ /* The file is gone, so there's nothing to check. */
+ xchk_trans_cancel(sc);
+ return -ENOENT;
}
/* Inode core */
diff --git a/fs/xfs/scrub/parent.c b/fs/xfs/scrub/parent.c
index b6c8f6dccc8f..58d5dfb7ea21 100644
--- a/fs/xfs/scrub/parent.c
+++ b/fs/xfs/scrub/parent.c
@@ -127,20 +127,15 @@ xchk_parent_validate(
expected_nlink = VFS_I(sc->ip)->i_nlink == 0 ? 0 : 1;
/*
- * Grab this parent inode. We release the inode before we
- * cancel the scrub transaction. Since we're don't know a
- * priori that releasing the inode won't trigger eofblocks
- * cleanup (which allocates what would be a nested transaction)
- * if the parent pointer erroneously points to a file, we
- * can't use DONTCACHE here because DONTCACHE inodes can trigger
- * immediate inactive cleanup of the inode.
+ * Grab the parent directory inode. This must be released before we
+ * cancel the scrub transaction.
*
* If _iget returns -EINVAL or -ENOENT then the parent inode number is
* garbage and the directory is corrupt. If the _iget returns
* -EFSCORRUPTED or -EFSBADCRC then the parent is corrupt which is a
* cross referencing error. Any other error is an operational error.
*/
- error = xfs_iget(mp, sc->tp, parent_ino, XFS_IGET_UNTRUSTED, 0, &dp);
+ error = xchk_iget(sc, parent_ino, &dp);
if (error == -EINVAL || error == -ENOENT) {
error = -EFSCORRUPTED;
xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error);
@@ -176,7 +171,7 @@ xchk_parent_validate(
out_unlock:
xfs_iunlock(dp, lock_mode);
out_rele:
- xfs_irele(dp);
+ xchk_irele(sc, dp);
return error;
}
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index 787a9096ddef..03ec455318f4 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -181,7 +181,7 @@ xchk_teardown(
xfs_iunlock(sc->ip, sc->ilock_flags);
if (sc->ip != ip_in &&
!xfs_internal_inum(sc->mp, sc->ip->i_ino))
- xfs_irele(sc->ip);
+ xchk_irele(sc, sc->ip);
sc->ip = NULL;
}
if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index c9a7e270a428..351849fc18ff 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -767,7 +767,8 @@ again:
return 0;
out_error_or_again:
- if (!(flags & XFS_IGET_INCORE) && error == -EAGAIN) {
+ if (!(flags & (XFS_IGET_INCORE | XFS_IGET_NORETRY)) &&
+ error == -EAGAIN) {
delay(1);
goto again;
}
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
index 6cd180721659..87910191a9dd 100644
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@@ -34,10 +34,13 @@ struct xfs_icwalk {
/*
* Flags for xfs_iget()
*/
-#define XFS_IGET_CREATE 0x1
-#define XFS_IGET_UNTRUSTED 0x2
-#define XFS_IGET_DONTCACHE 0x4
-#define XFS_IGET_INCORE 0x8 /* don't read from disk or reinit */
+#define XFS_IGET_CREATE (1U << 0)
+#define XFS_IGET_UNTRUSTED (1U << 1)
+#define XFS_IGET_DONTCACHE (1U << 2)
+/* don't read from disk or reinit */
+#define XFS_IGET_INCORE (1U << 3)
+/* Return -EAGAIN immediately if the inode is unavailable. */
+#define XFS_IGET_NORETRY (1U << 4)
int xfs_iget(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino,
uint flags, uint lock_flags, xfs_inode_t **ipp);