diff options
author | Maarten Lankhorst <maarten.lankhorst@linux.intel.com> | 2018-11-13 12:58:49 +0300 |
---|---|---|
committer | Maarten Lankhorst <maarten.lankhorst@linux.intel.com> | 2018-11-13 12:59:10 +0300 |
commit | 0ea0397a3a12f9720d6acb78a48da796a54e81aa (patch) | |
tree | e2078363019cb2fde493844d7f25293412c4b6df /fs/xfs | |
parent | 913240696ec64a2af2de250b1591cc45b5cea2f6 (diff) | |
parent | 651022382c7f8da46cb4872a545ee1da6d097d2a (diff) | |
download | linux-0ea0397a3a12f9720d6acb78a48da796a54e81aa.tar.xz |
Merge remote-tracking branch 'drm/drm-next' into drm-misc-next
drm-next is forwarded to v4.20-rc1, and we need this to make
a patch series apply.
Signed-off-by: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Diffstat (limited to 'fs/xfs')
36 files changed, 1040 insertions, 746 deletions
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 1e671d4eb6fa..844ed87b1900 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -191,6 +191,128 @@ xfs_attr_calc_size( return nblks; } +STATIC int +xfs_attr_try_sf_addname( + struct xfs_inode *dp, + struct xfs_da_args *args) +{ + + struct xfs_mount *mp = dp->i_mount; + int error, error2; + + error = xfs_attr_shortform_addname(args); + if (error == -ENOSPC) + return error; + + /* + * Commit the shortform mods, and we're done. + * NOTE: this is also the error path (EEXIST, etc). + */ + if (!error && (args->flags & ATTR_KERNOTIME) == 0) + xfs_trans_ichgtime(args->trans, dp, XFS_ICHGTIME_CHG); + + if (mp->m_flags & XFS_MOUNT_WSYNC) + xfs_trans_set_sync(args->trans); + + error2 = xfs_trans_commit(args->trans); + args->trans = NULL; + return error ? error : error2; +} + +/* + * Set the attribute specified in @args. + */ +int +xfs_attr_set_args( + struct xfs_da_args *args, + struct xfs_buf **leaf_bp) +{ + struct xfs_inode *dp = args->dp; + int error; + + /* + * If the attribute list is non-existent or a shortform list, + * upgrade it to a single-leaf-block attribute list. + */ + if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL || + (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS && + dp->i_d.di_anextents == 0)) { + + /* + * Build initial attribute list (if required). + */ + if (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS) + xfs_attr_shortform_create(args); + + /* + * Try to add the attr to the attribute list in the inode. + */ + error = xfs_attr_try_sf_addname(dp, args); + if (error != -ENOSPC) + return error; + + /* + * It won't fit in the shortform, transform to a leaf block. + * GROT: another possible req'mt for a double-split btree op. + */ + error = xfs_attr_shortform_to_leaf(args, leaf_bp); + if (error) + return error; + + /* + * Prevent the leaf buffer from being unlocked so that a + * concurrent AIL push cannot grab the half-baked leaf + * buffer and run into problems with the write verifier. + */ + xfs_trans_bhold(args->trans, *leaf_bp); + + error = xfs_defer_finish(&args->trans); + if (error) + return error; + + /* + * Commit the leaf transformation. We'll need another + * (linked) transaction to add the new attribute to the + * leaf. + */ + error = xfs_trans_roll_inode(&args->trans, dp); + if (error) + return error; + xfs_trans_bjoin(args->trans, *leaf_bp); + *leaf_bp = NULL; + } + + if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) + error = xfs_attr_leaf_addname(args); + else + error = xfs_attr_node_addname(args); + return error; +} + +/* + * Remove the attribute specified in @args. + */ +int +xfs_attr_remove_args( + struct xfs_da_args *args) +{ + struct xfs_inode *dp = args->dp; + int error; + + if (!xfs_inode_hasattr(dp)) { + error = -ENOATTR; + } else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { + ASSERT(dp->i_afp->if_flags & XFS_IFINLINE); + error = xfs_attr_shortform_remove(args); + } else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) { + error = xfs_attr_leaf_removename(args); + } else { + error = xfs_attr_node_removename(args); + } + + return error; +} + int xfs_attr_set( struct xfs_inode *dp, @@ -204,7 +326,7 @@ xfs_attr_set( struct xfs_da_args args; struct xfs_trans_res tres; int rsvd = (flags & ATTR_ROOT) != 0; - int error, err2, local; + int error, local; XFS_STATS_INC(mp, xs_attr_set); @@ -255,93 +377,17 @@ xfs_attr_set( error = xfs_trans_reserve_quota_nblks(args.trans, dp, args.total, 0, rsvd ? XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES : XFS_QMOPT_RES_REGBLKS); - if (error) { - xfs_iunlock(dp, XFS_ILOCK_EXCL); - xfs_trans_cancel(args.trans); - return error; - } + if (error) + goto out_trans_cancel; xfs_trans_ijoin(args.trans, dp, 0); - - /* - * If the attribute list is non-existent or a shortform list, - * upgrade it to a single-leaf-block attribute list. - */ - if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL || - (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS && - dp->i_d.di_anextents == 0)) { - - /* - * Build initial attribute list (if required). - */ - if (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS) - xfs_attr_shortform_create(&args); - - /* - * Try to add the attr to the attribute list in - * the inode. - */ - error = xfs_attr_shortform_addname(&args); - if (error != -ENOSPC) { - /* - * Commit the shortform mods, and we're done. - * NOTE: this is also the error path (EEXIST, etc). - */ - ASSERT(args.trans != NULL); - - /* - * If this is a synchronous mount, make sure that - * the transaction goes to disk before returning - * to the user. - */ - if (mp->m_flags & XFS_MOUNT_WSYNC) - xfs_trans_set_sync(args.trans); - - if (!error && (flags & ATTR_KERNOTIME) == 0) { - xfs_trans_ichgtime(args.trans, dp, - XFS_ICHGTIME_CHG); - } - err2 = xfs_trans_commit(args.trans); - xfs_iunlock(dp, XFS_ILOCK_EXCL); - - return error ? error : err2; - } - - /* - * It won't fit in the shortform, transform to a leaf block. - * GROT: another possible req'mt for a double-split btree op. - */ - error = xfs_attr_shortform_to_leaf(&args, &leaf_bp); - if (error) - goto out; - /* - * Prevent the leaf buffer from being unlocked so that a - * concurrent AIL push cannot grab the half-baked leaf - * buffer and run into problems with the write verifier. - */ - xfs_trans_bhold(args.trans, leaf_bp); - error = xfs_defer_finish(&args.trans); - if (error) - goto out; - - /* - * Commit the leaf transformation. We'll need another (linked) - * transaction to add the new attribute to the leaf, which - * means that we have to hold & join the leaf buffer here too. - */ - error = xfs_trans_roll_inode(&args.trans, dp); - if (error) - goto out; - xfs_trans_bjoin(args.trans, leaf_bp); - leaf_bp = NULL; - } - - if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) - error = xfs_attr_leaf_addname(&args); - else - error = xfs_attr_node_addname(&args); + error = xfs_attr_set_args(&args, &leaf_bp); if (error) - goto out; + goto out_release_leaf; + if (!args.trans) { + /* shortform attribute has already been committed */ + goto out_unlock; + } /* * If this is a synchronous mount, make sure that the @@ -358,17 +404,17 @@ xfs_attr_set( */ xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE); error = xfs_trans_commit(args.trans); +out_unlock: xfs_iunlock(dp, XFS_ILOCK_EXCL); - return error; -out: +out_release_leaf: if (leaf_bp) xfs_trans_brelse(args.trans, leaf_bp); +out_trans_cancel: if (args.trans) xfs_trans_cancel(args.trans); - xfs_iunlock(dp, XFS_ILOCK_EXCL); - return error; + goto out_unlock; } /* @@ -423,17 +469,7 @@ xfs_attr_remove( */ xfs_trans_ijoin(args.trans, dp, 0); - if (!xfs_inode_hasattr(dp)) { - error = -ENOATTR; - } else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { - ASSERT(dp->i_afp->if_flags & XFS_IFINLINE); - error = xfs_attr_shortform_remove(&args); - } else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) { - error = xfs_attr_leaf_removename(&args); - } else { - error = xfs_attr_node_removename(&args); - } - + error = xfs_attr_remove_args(&args); if (error) goto out; @@ -587,7 +623,7 @@ xfs_attr_leaf_addname( */ error = xfs_attr3_leaf_to_node(args); if (error) - goto out_defer_cancel; + return error; error = xfs_defer_finish(&args->trans); if (error) return error; @@ -675,7 +711,7 @@ xfs_attr_leaf_addname( error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); /* bp is gone due to xfs_da_shrink_inode */ if (error) - goto out_defer_cancel; + return error; error = xfs_defer_finish(&args->trans); if (error) return error; @@ -693,9 +729,6 @@ xfs_attr_leaf_addname( error = xfs_attr3_leaf_clearflag(args); } return error; -out_defer_cancel: - xfs_defer_cancel(args->trans); - return error; } /* @@ -738,15 +771,12 @@ xfs_attr_leaf_removename( error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); /* bp is gone due to xfs_da_shrink_inode */ if (error) - goto out_defer_cancel; + return error; error = xfs_defer_finish(&args->trans); if (error) return error; } return 0; -out_defer_cancel: - xfs_defer_cancel(args->trans); - return error; } /* @@ -864,7 +894,7 @@ restart: state = NULL; error = xfs_attr3_leaf_to_node(args); if (error) - goto out_defer_cancel; + goto out; error = xfs_defer_finish(&args->trans); if (error) goto out; @@ -888,7 +918,7 @@ restart: */ error = xfs_da3_split(state); if (error) - goto out_defer_cancel; + goto out; error = xfs_defer_finish(&args->trans); if (error) goto out; @@ -984,7 +1014,7 @@ restart: if (retval && (state->path.active > 1)) { error = xfs_da3_join(state); if (error) - goto out_defer_cancel; + goto out; error = xfs_defer_finish(&args->trans); if (error) goto out; @@ -1013,9 +1043,6 @@ out: if (error) return error; return retval; -out_defer_cancel: - xfs_defer_cancel(args->trans); - goto out; } /* @@ -1107,7 +1134,7 @@ xfs_attr_node_removename( if (retval && (state->path.active > 1)) { error = xfs_da3_join(state); if (error) - goto out_defer_cancel; + goto out; error = xfs_defer_finish(&args->trans); if (error) goto out; @@ -1138,7 +1165,7 @@ xfs_attr_node_removename( error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); /* bp is gone due to xfs_da_shrink_inode */ if (error) - goto out_defer_cancel; + goto out; error = xfs_defer_finish(&args->trans); if (error) goto out; @@ -1150,9 +1177,6 @@ xfs_attr_node_removename( out: xfs_da_state_free(state); return error; -out_defer_cancel: - xfs_defer_cancel(args->trans); - goto out; } /* diff --git a/fs/xfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h index 033ff8c478e2..bdf52a333f3f 100644 --- a/fs/xfs/xfs_attr.h +++ b/fs/xfs/libxfs/xfs_attr.h @@ -140,7 +140,9 @@ int xfs_attr_get(struct xfs_inode *ip, const unsigned char *name, unsigned char *value, int *valuelenp, int flags); int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name, unsigned char *value, int valuelen, int flags); +int xfs_attr_set_args(struct xfs_da_args *args, struct xfs_buf **leaf_bp); int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags); +int xfs_attr_remove_args(struct xfs_da_args *args); int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize, int flags, struct attrlist_cursor_kern *cursor); diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c index af094063e402..d89363c6b523 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.c +++ b/fs/xfs/libxfs/xfs_attr_remote.c @@ -485,7 +485,7 @@ xfs_attr_rmtval_set( blkcnt, XFS_BMAPI_ATTRFORK, args->total, &map, &nmap); if (error) - goto out_defer_cancel; + return error; error = xfs_defer_finish(&args->trans); if (error) return error; @@ -553,9 +553,6 @@ xfs_attr_rmtval_set( } ASSERT(valuelen == 0); return 0; -out_defer_cancel: - xfs_defer_cancel(args->trans); - return error; } /* @@ -625,7 +622,7 @@ xfs_attr_rmtval_remove( error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt, XFS_BMAPI_ATTRFORK, 1, &done); if (error) - goto out_defer_cancel; + return error; error = xfs_defer_finish(&args->trans); if (error) return error; @@ -638,7 +635,4 @@ xfs_attr_rmtval_remove( return error; } return 0; -out_defer_cancel: - xfs_defer_cancel(args->trans); - return error; } diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 2760314fdf7f..74d7228e755b 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -673,7 +673,8 @@ xfs_bmap_extents_to_btree( ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS); /* - * Make space in the inode incore. + * Make space in the inode incore. This needs to be undone if we fail + * to expand the root. */ xfs_iroot_realloc(ip, 1, whichfork); ifp->if_flags |= XFS_IFBROOT; @@ -711,16 +712,15 @@ xfs_bmap_extents_to_btree( args.minlen = args.maxlen = args.prod = 1; args.wasdel = wasdel; *logflagsp = 0; - if ((error = xfs_alloc_vextent(&args))) { - ASSERT(ifp->if_broot == NULL); - goto err1; - } + error = xfs_alloc_vextent(&args); + if (error) + goto out_root_realloc; if (WARN_ON_ONCE(args.fsbno == NULLFSBLOCK)) { - ASSERT(ifp->if_broot == NULL); error = -ENOSPC; - goto err1; + goto out_root_realloc; } + /* * Allocation can't fail, the space was reserved. */ @@ -732,9 +732,10 @@ xfs_bmap_extents_to_btree( xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1L); abp = xfs_btree_get_bufl(mp, tp, args.fsbno, 0); if (!abp) { - error = -ENOSPC; - goto err2; + error = -EFSCORRUPTED; + goto out_unreserve_dquot; } + /* * Fill in the child block. */ @@ -775,11 +776,12 @@ xfs_bmap_extents_to_btree( *logflagsp = XFS_ILOG_CORE | xfs_ilog_fbroot(whichfork); return 0; -err2: +out_unreserve_dquot: xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L); -err1: +out_root_realloc: xfs_iroot_realloc(ip, -1, whichfork); XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS); + ASSERT(ifp->if_broot == NULL); xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); return error; @@ -1017,6 +1019,34 @@ xfs_bmap_add_attrfork_local( return -EFSCORRUPTED; } +/* Set an inode attr fork off based on the format */ +int +xfs_bmap_set_attrforkoff( + struct xfs_inode *ip, + int size, + int *version) +{ + switch (ip->i_d.di_format) { + case XFS_DINODE_FMT_DEV: + ip->i_d.di_forkoff = roundup(sizeof(xfs_dev_t), 8) >> 3; + break; + case XFS_DINODE_FMT_LOCAL: + case XFS_DINODE_FMT_EXTENTS: + case XFS_DINODE_FMT_BTREE: + ip->i_d.di_forkoff = xfs_attr_shortform_bytesfit(ip, size); + if (!ip->i_d.di_forkoff) + ip->i_d.di_forkoff = xfs_default_attroffset(ip) >> 3; + else if ((ip->i_mount->m_flags & XFS_MOUNT_ATTR2) && version) + *version = 2; + break; + default: + ASSERT(0); + return -EINVAL; + } + + return 0; +} + /* * Convert inode from non-attributed to attributed. * Must not be in a transaction, ip must not be locked. @@ -1068,26 +1098,9 @@ xfs_bmap_add_attrfork( xfs_trans_ijoin(tp, ip, 0); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - - switch (ip->i_d.di_format) { - case XFS_DINODE_FMT_DEV: - ip->i_d.di_forkoff = roundup(sizeof(xfs_dev_t), 8) >> 3; - break; - case XFS_DINODE_FMT_LOCAL: - case XFS_DINODE_FMT_EXTENTS: - case XFS_DINODE_FMT_BTREE: - ip->i_d.di_forkoff = xfs_attr_shortform_bytesfit(ip, size); - if (!ip->i_d.di_forkoff) - ip->i_d.di_forkoff = xfs_default_attroffset(ip) >> 3; - else if (mp->m_flags & XFS_MOUNT_ATTR2) - version = 2; - break; - default: - ASSERT(0); - error = -EINVAL; + error = xfs_bmap_set_attrforkoff(ip, size, &version); + if (error) goto trans_cancel; - } - ASSERT(ip->i_afp == NULL); ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP); ip->i_afp->if_flags = XFS_IFEXTENTS; @@ -4079,8 +4092,7 @@ xfs_bmapi_allocate( * extents to real extents when we're about to write the data. */ if ((!bma->wasdel || (bma->flags & XFS_BMAPI_COWFORK)) && - (bma->flags & XFS_BMAPI_PREALLOC) && - xfs_sb_version_hasextflgbit(&mp->m_sb)) + (bma->flags & XFS_BMAPI_PREALLOC)) bma->got.br_state = XFS_EXT_UNWRITTEN; if (bma->wasdel) @@ -5243,8 +5255,7 @@ __xfs_bunmapi( * unmapping part of it. But we can't really * get rid of part of a realtime extent. */ - if (del.br_state == XFS_EXT_UNWRITTEN || - !xfs_sb_version_hasextflgbit(&mp->m_sb)) { + if (del.br_state == XFS_EXT_UNWRITTEN) { /* * This piece is unwritten, or we're not * using unwritten extents. Skip over it. @@ -5294,10 +5305,9 @@ __xfs_bunmapi( del.br_blockcount -= mod; del.br_startoff += mod; del.br_startblock += mod; - } else if ((del.br_startoff == start && - (del.br_state == XFS_EXT_UNWRITTEN || - tp->t_blk_res == 0)) || - !xfs_sb_version_hasextflgbit(&mp->m_sb)) { + } else if (del.br_startoff == start && + (del.br_state == XFS_EXT_UNWRITTEN || + tp->t_blk_res == 0)) { /* * Can't make it unwritten. There isn't * a full extent here so just skip it. @@ -6112,11 +6122,7 @@ xfs_bmap_validate_extent( XFS_FSB_TO_AGNO(mp, endfsb)) return __this_address; } - if (irec->br_state != XFS_EXT_NORM) { - if (whichfork != XFS_DATA_FORK) - return __this_address; - if (!xfs_sb_version_hasextflgbit(&mp->m_sb)) - return __this_address; - } + if (irec->br_state != XFS_EXT_NORM && whichfork != XFS_DATA_FORK) + return __this_address; return NULL; } diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index b6e9b639e731..488dc8860fd7 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -183,6 +183,7 @@ void xfs_trim_extent(struct xfs_bmbt_irec *irec, xfs_fileoff_t bno, xfs_filblks_t len); void xfs_trim_extent_eof(struct xfs_bmbt_irec *, struct xfs_inode *); int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd); +int xfs_bmap_set_attrforkoff(struct xfs_inode *ip, int size, int *version); void xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork); void __xfs_bmap_add_free(struct xfs_trans *tp, xfs_fsblock_t bno, xfs_filblks_t len, struct xfs_owner_info *oinfo, diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 059bc44c27e8..9995d5ae380b 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -287,6 +287,8 @@ static inline bool xfs_sb_good_v4_features(struct xfs_sb *sbp) { if (!(sbp->sb_versionnum & XFS_SB_VERSION_DIRV2BIT)) return false; + if (!(sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT)) + return false; /* check for unknown features in the fs */ if ((sbp->sb_versionnum & ~XFS_SB_VERSION_OKBITS) || @@ -357,12 +359,6 @@ static inline bool xfs_sb_version_haslogv2(struct xfs_sb *sbp) (sbp->sb_versionnum & XFS_SB_VERSION_LOGV2BIT); } -static inline bool xfs_sb_version_hasextflgbit(struct xfs_sb *sbp) -{ - return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 || - (sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT); -} - static inline bool xfs_sb_version_hassector(struct xfs_sb *sbp) { return (sbp->sb_versionnum & XFS_SB_VERSION_SECTORBIT); @@ -1016,6 +1012,8 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev) #define XFS_DIFLAG_EXTSZINHERIT_BIT 12 /* inherit inode extent size */ #define XFS_DIFLAG_NODEFRAG_BIT 13 /* do not reorganize/defragment */ #define XFS_DIFLAG_FILESTREAM_BIT 14 /* use filestream allocator */ +/* Do not use bit 15, di_flags is legacy and unchanging now */ + #define XFS_DIFLAG_REALTIME (1 << XFS_DIFLAG_REALTIME_BIT) #define XFS_DIFLAG_PREALLOC (1 << XFS_DIFLAG_PREALLOC_BIT) #define XFS_DIFLAG_NEWRTBM (1 << XFS_DIFLAG_NEWRTBM_BIT) diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 30d1d60f1d46..09d9c8cfa4a0 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -415,6 +415,31 @@ xfs_dinode_verify_fork( return NULL; } +static xfs_failaddr_t +xfs_dinode_verify_forkoff( + struct xfs_dinode *dip, + struct xfs_mount *mp) +{ + if (!XFS_DFORK_Q(dip)) + return NULL; + + switch (dip->di_format) { + case XFS_DINODE_FMT_DEV: + if (dip->di_forkoff != (roundup(sizeof(xfs_dev_t), 8) >> 3)) + return __this_address; + break; + case XFS_DINODE_FMT_LOCAL: /* fall through ... */ + case XFS_DINODE_FMT_EXTENTS: /* fall through ... */ + case XFS_DINODE_FMT_BTREE: + if (dip->di_forkoff >= (XFS_LITINO(mp, dip->di_version) >> 3)) + return __this_address; + break; + default: + return __this_address; + } + return NULL; +} + xfs_failaddr_t xfs_dinode_verify( struct xfs_mount *mp, @@ -470,6 +495,11 @@ xfs_dinode_verify( if (mode && (flags & XFS_DIFLAG_REALTIME) && !mp->m_rtdev_targp) return __this_address; + /* check for illegal values of forkoff */ + fa = xfs_dinode_verify_forkoff(dip, mp); + if (fa) + return fa; + /* Do we have appropriate data fork formats for the mode? */ switch (mode & S_IFMT) { case S_IFIFO: diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 081f46e30556..b5a82acd7dfe 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -1115,7 +1115,8 @@ xfs_fs_geometry( geo->version = XFS_FSOP_GEOM_VERSION; geo->flags = XFS_FSOP_GEOM_FLAGS_NLINK | - XFS_FSOP_GEOM_FLAGS_DIRV2; + XFS_FSOP_GEOM_FLAGS_DIRV2 | + XFS_FSOP_GEOM_FLAGS_EXTFLG; if (xfs_sb_version_hasattr(sbp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_ATTR; if (xfs_sb_version_hasquota(sbp)) @@ -1124,8 +1125,6 @@ xfs_fs_geometry( geo->flags |= XFS_FSOP_GEOM_FLAGS_IALIGN; if (xfs_sb_version_hasdalign(sbp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_DALIGN; - if (xfs_sb_version_hasextflgbit(sbp)) - geo->flags |= XFS_FSOP_GEOM_FLAGS_EXTFLG; if (xfs_sb_version_hassector(sbp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_SECTOR; if (xfs_sb_version_hasasciici(sbp)) diff --git a/fs/xfs/scrub/alloc.c b/fs/xfs/scrub/alloc.c index 036b5c7021eb..376bcb585ae6 100644 --- a/fs/xfs/scrub/alloc.c +++ b/fs/xfs/scrub/alloc.c @@ -17,7 +17,6 @@ #include "xfs_sb.h" #include "xfs_alloc.h" #include "xfs_rmap.h" -#include "xfs_alloc.h" #include "scrub/xfs_scrub.h" #include "scrub/scrub.h" #include "scrub/common.h" diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c index 5b3b177c0fc9..e386c9b0b4ab 100644 --- a/fs/xfs/scrub/inode.c +++ b/fs/xfs/scrub/inode.c @@ -126,6 +126,7 @@ xchk_inode_flags( { struct xfs_mount *mp = sc->mp; + /* di_flags are all taken, last bit cannot be used */ if (flags & ~XFS_DIFLAG_ANY) goto bad; @@ -172,8 +173,9 @@ xchk_inode_flags2( { struct xfs_mount *mp = sc->mp; + /* Unknown di_flags2 could be from a future kernel */ if (flags2 & ~XFS_DIFLAG2_ANY) - goto bad; + xchk_ino_set_warning(sc, ino); /* reflink flag requires reflink feature */ if ((flags2 & XFS_DIFLAG2_REFLINK) && diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index 9f08dd9bf1d5..4fc0a5ea7673 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -29,6 +29,8 @@ #include "xfs_ag_resv.h" #include "xfs_trans_space.h" #include "xfs_quota.h" +#include "xfs_attr.h" +#include "xfs_reflink.h" #include "scrub/xfs_scrub.h" #include "scrub/scrub.h" #include "scrub/common.h" @@ -692,13 +694,14 @@ xrep_findroot_block( struct xrep_find_ag_btree *fab, uint64_t owner, xfs_agblock_t agbno, - bool *found_it) + bool *done_with_block) { struct xfs_mount *mp = ri->sc->mp; struct xfs_buf *bp; struct xfs_btree_block *btblock; xfs_daddr_t daddr; - int error; + int block_level; + int error = 0; daddr = XFS_AGB_TO_DADDR(mp, ri->sc->sa.agno, agbno); @@ -717,36 +720,111 @@ xrep_findroot_block( return error; } + /* + * Read the buffer into memory so that we can see if it's a match for + * our btree type. We have no clue if it is beforehand, and we want to + * avoid xfs_trans_read_buf's behavior of dumping the DONE state (which + * will cause needless disk reads in subsequent calls to this function) + * and logging metadata verifier failures. + * + * Therefore, pass in NULL buffer ops. If the buffer was already in + * memory from some other caller it will already have b_ops assigned. + * If it was in memory from a previous unsuccessful findroot_block + * call, the buffer won't have b_ops but it should be clean and ready + * for us to try to verify if the read call succeeds. The same applies + * if the buffer wasn't in memory at all. + * + * Note: If we never match a btree type with this buffer, it will be + * left in memory with NULL b_ops. This shouldn't be a problem unless + * the buffer gets written. + */ error = xfs_trans_read_buf(mp, ri->sc->tp, mp->m_ddev_targp, daddr, mp->m_bsize, 0, &bp, NULL); if (error) return error; - /* - * Does this look like a block matching our fs and higher than any - * other block we've found so far? If so, reattach buffer verifiers - * so the AIL won't complain if the buffer is also dirty. - */ + /* Ensure the block magic matches the btree type we're looking for. */ btblock = XFS_BUF_TO_BLOCK(bp); if (be32_to_cpu(btblock->bb_magic) != fab->magic) goto out; - if (xfs_sb_version_hascrc(&mp->m_sb) && - !uuid_equal(&btblock->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid)) - goto out; - bp->b_ops = fab->buf_ops; - /* Ignore this block if it's lower in the tree than we've seen. */ - if (fab->root != NULLAGBLOCK && - xfs_btree_get_level(btblock) < fab->height) - goto out; + /* + * If the buffer already has ops applied and they're not the ones for + * this btree type, we know this block doesn't match the btree and we + * can bail out. + * + * If the buffer ops match ours, someone else has already validated + * the block for us, so we can move on to checking if this is a root + * block candidate. + * + * If the buffer does not have ops, nobody has successfully validated + * the contents and the buffer cannot be dirty. If the magic, uuid, + * and structure match this btree type then we'll move on to checking + * if it's a root block candidate. If there is no match, bail out. + */ + if (bp->b_ops) { + if (bp->b_ops != fab->buf_ops) + goto out; + } else { + ASSERT(!xfs_trans_buf_is_dirty(bp)); + if (!uuid_equal(&btblock->bb_u.s.bb_uuid, + &mp->m_sb.sb_meta_uuid)) + goto out; + fab->buf_ops->verify_read(bp); + if (bp->b_error) { + bp->b_error = 0; + goto out; + } - /* Make sure we pass the verifiers. */ - bp->b_ops->verify_read(bp); - if (bp->b_error) + /* + * Some read verifiers will (re)set b_ops, so we must be + * careful not to blow away any such assignment. + */ + if (!bp->b_ops) + bp->b_ops = fab->buf_ops; + } + + /* + * This block passes the magic/uuid and verifier tests for this btree + * type. We don't need the caller to try the other tree types. + */ + *done_with_block = true; + + /* + * Compare this btree block's level to the height of the current + * candidate root block. + * + * If the level matches the root we found previously, throw away both + * blocks because there can't be two candidate roots. + * + * If level is lower in the tree than the root we found previously, + * ignore this block. + */ + block_level = xfs_btree_get_level(btblock); + if (block_level + 1 == fab->height) { + fab->root = NULLAGBLOCK; goto out; - fab->root = agbno; - fab->height = xfs_btree_get_level(btblock) + 1; - *found_it = true; + } else if (block_level < fab->height) { + goto out; + } + + /* + * This is the highest block in the tree that we've found so far. + * Update the btree height to reflect what we've learned from this + * block. + */ + fab->height = block_level + 1; + + /* + * If this block doesn't have sibling pointers, then it's the new root + * block candidate. Otherwise, the root will be found farther up the + * tree. + */ + if (btblock->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) && + btblock->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK)) + fab->root = agbno; + else + fab->root = NULLAGBLOCK; trace_xrep_findroot_block(mp, ri->sc->sa.agno, agbno, be32_to_cpu(btblock->bb_magic), fab->height - 1); @@ -768,7 +846,7 @@ xrep_findroot_rmap( struct xrep_findroot *ri = priv; struct xrep_find_ag_btree *fab; xfs_agblock_t b; - bool found_it; + bool done; int error = 0; /* Ignore anything that isn't AG metadata. */ @@ -777,16 +855,16 @@ xrep_findroot_rmap( /* Otherwise scan each block + btree type. */ for (b = 0; b < rec->rm_blockcount; b++) { - found_it = false; + done = false; for (fab = ri->btree_info; fab->buf_ops; fab++) { if (rec->rm_owner != fab->rmap_owner) continue; error = xrep_findroot_block(ri, fab, rec->rm_owner, rec->rm_startblock + b, - &found_it); + &done); if (error) return error; - if (found_it) + if (done) break; } } diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index 4bfae1e61d30..1b2344d00525 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -412,19 +412,6 @@ xchk_validate_inputs( goto out; } - error = -EOPNOTSUPP; - /* - * We won't scrub any filesystem that doesn't have the ability - * to record unwritten extents. The option was made default in - * 2003, removed from mkfs in 2007, and cannot be disabled in - * v5, so if we find a filesystem without this flag it's either - * really old or totally unsupported. Avoid it either way. - * We also don't support v1-v3 filesystems, which aren't - * mountable. - */ - if (!xfs_sb_version_hasextflgbit(&mp->m_sb)) - goto out; - /* * We only want to repair read-write v5+ filesystems. Defer the check * for ops->repair until after our scrub confirms that we need to diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 49f5f5896a43..338b9d9984e0 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -917,7 +917,7 @@ xfs_vm_writepage( struct writeback_control *wbc) { struct xfs_writepage_ctx wpc = { - .io_type = XFS_IO_INVALID, + .io_type = XFS_IO_HOLE, }; int ret; @@ -933,7 +933,7 @@ xfs_vm_writepages( struct writeback_control *wbc) { struct xfs_writepage_ctx wpc = { - .io_type = XFS_IO_INVALID, + .io_type = XFS_IO_HOLE, }; int ret; diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h index 9af867951a10..494b4338446e 100644 --- a/fs/xfs/xfs_aops.h +++ b/fs/xfs/xfs_aops.h @@ -12,21 +12,19 @@ extern struct bio_set xfs_ioend_bioset; * Types of I/O for bmap clustering and I/O completion tracking. */ enum { - XFS_IO_INVALID, /* initial state */ + XFS_IO_HOLE, /* covers region without any block allocation */ XFS_IO_DELALLOC, /* covers delalloc region */ XFS_IO_UNWRITTEN, /* covers allocated but uninitialized data */ XFS_IO_OVERWRITE, /* covers already allocated extent */ XFS_IO_COW, /* covers copy-on-write extent */ - XFS_IO_HOLE, /* covers region without any block allocation */ }; #define XFS_IO_TYPES \ - { XFS_IO_INVALID, "invalid" }, \ - { XFS_IO_DELALLOC, "delalloc" }, \ - { XFS_IO_UNWRITTEN, "unwritten" }, \ - { XFS_IO_OVERWRITE, "overwrite" }, \ - { XFS_IO_COW, "CoW" }, \ - { XFS_IO_HOLE, "hole" } + { XFS_IO_HOLE, "hole" }, \ + { XFS_IO_DELALLOC, "delalloc" }, \ + { XFS_IO_UNWRITTEN, "unwritten" }, \ + { XFS_IO_OVERWRITE, "overwrite" }, \ + { XFS_IO_COW, "CoW" } /* * Structure for buffered I/O completions. diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index addbd74ecd8e..5d263dfdb3bc 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -406,10 +406,10 @@ xfs_getbmap_report_one( struct xfs_bmbt_irec *got) { struct kgetbmap *p = out + bmv->bmv_entries; - bool shared = false, trimmed = false; + bool shared = false; int error; - error = xfs_reflink_trim_around_shared(ip, got, &shared, &trimmed); + error = xfs_reflink_trim_around_shared(ip, got, &shared); if (error) return error; @@ -702,13 +702,9 @@ xfs_bmap_punch_delalloc_range( struct xfs_iext_cursor icur; int error = 0; - xfs_ilock(ip, XFS_ILOCK_EXCL); - if (!(ifp->if_flags & XFS_IFEXTENTS)) { - error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK); - if (error) - goto out_unlock; - } + ASSERT(ifp->if_flags & XFS_IFEXTENTS); + xfs_ilock(ip, XFS_ILOCK_EXCL); if (!xfs_iext_lookup_extent_before(ip, ifp, &end_fsb, &icur, &got)) goto out_unlock; @@ -1047,44 +1043,6 @@ out_trans_cancel: } static int -xfs_adjust_extent_unmap_boundaries( - struct xfs_inode *ip, - xfs_fileoff_t *startoffset_fsb, - xfs_fileoff_t *endoffset_fsb) -{ - struct xfs_mount *mp = ip->i_mount; - struct xfs_bmbt_irec imap; - int nimap, error; - xfs_extlen_t mod = 0; - - nimap = 1; - error = xfs_bmapi_read(ip, *startoffset_fsb, 1, &imap, &nimap, 0); - if (error) - return error; - - if (nimap && imap.br_startblock != HOLESTARTBLOCK) { - ASSERT(imap.br_startblock != DELAYSTARTBLOCK); - div_u64_rem(imap.br_startblock, mp->m_sb.sb_rextsize, &mod); - if (mod) - *startoffset_fsb += mp->m_sb.sb_rextsize - mod; - } - - nimap = 1; - error = xfs_bmapi_read(ip, *endoffset_fsb - 1, 1, &imap, &nimap, 0); - if (error) - return error; - - if (nimap && imap.br_startblock != HOLESTARTBLOCK) { - ASSERT(imap.br_startblock != DELAYSTARTBLOCK); - mod++; - if (mod && mod != mp->m_sb.sb_rextsize) - *endoffset_fsb -= mod; - } - - return 0; -} - -static int xfs_flush_unmap_range( struct xfs_inode *ip, xfs_off_t offset, @@ -1137,19 +1095,8 @@ xfs_free_file_space( endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len); /* - * Need to zero the stuff we're not freeing, on disk. If it's a RT file - * and we can't use unwritten extents then we actually need to ensure - * to zero the whole extent, otherwise we just need to take of block - * boundaries, and xfs_bunmapi will handle the rest. + * Need to zero the stuff we're not freeing, on disk. */ - if (XFS_IS_REALTIME_INODE(ip) && - !xfs_sb_version_hasextflgbit(&mp->m_sb)) { - error = xfs_adjust_extent_unmap_boundaries(ip, &startoffset_fsb, - &endoffset_fsb); - if (error) - return error; - } - if (endoffset_fsb > startoffset_fsb) { while (!done) { error = xfs_unmap_extent(ip, startoffset_fsb, @@ -1584,7 +1531,7 @@ xfs_swap_extent_rmap( tirec.br_blockcount, &irec, &nimaps, 0); if (error) - goto out_defer; + goto out; ASSERT(nimaps == 1); ASSERT(tirec.br_startoff == irec.br_startoff); trace_xfs_swap_extent_rmap_remap_piece(ip, &irec); @@ -1599,22 +1546,22 @@ xfs_swap_extent_rmap( /* Remove the mapping from the donor file. */ error = xfs_bmap_unmap_extent(tp, tip, &uirec); if (error) - goto out_defer; + goto out; /* Remove the mapping from the source file. */ error = xfs_bmap_unmap_extent(tp, ip, &irec); if (error) - goto out_defer; + goto out; /* Map the donor file's blocks into the source file. */ error = xfs_bmap_map_extent(tp, ip, &uirec); if (error) - goto out_defer; + goto out; /* Map the source file's blocks into the donor file. */ error = xfs_bmap_map_extent(tp, tip, &irec); if (error) - goto out_defer; + goto out; error = xfs_defer_finish(tpp); tp = *tpp; @@ -1636,8 +1583,6 @@ xfs_swap_extent_rmap( tip->i_d.di_flags2 = tip_flags2; return 0; -out_defer: - xfs_defer_cancel(tp); out: trace_xfs_swap_extent_rmap_error(ip, error, _RET_IP_); tip->i_d.di_flags2 = tip_flags2; @@ -1830,6 +1775,12 @@ xfs_swap_extents( if (error) goto out_unlock; + if (xfs_inode_has_cow_data(tip)) { + error = xfs_reflink_cancel_cow_range(tip, 0, NULLFILEOFF, true); + if (error) + return error; + } + /* * Extent "swapping" with rmap requires a permanent reservation and * a block reservation because it's really just a remap operation diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index e839907e8492..b21ea2ba768d 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -37,6 +37,32 @@ static kmem_zone_t *xfs_buf_zone; #define xb_to_gfp(flags) \ ((((flags) & XBF_READ_AHEAD) ? __GFP_NORETRY : GFP_NOFS) | __GFP_NOWARN) +/* + * Locking orders + * + * xfs_buf_ioacct_inc: + * xfs_buf_ioacct_dec: + * b_sema (caller holds) + * b_lock + * + * xfs_buf_stale: + * b_sema (caller holds) + * b_lock + * lru_lock + * + * xfs_buf_rele: + * b_lock + * pag_buf_lock + * lru_lock + * + * xfs_buftarg_wait_rele + * lru_lock + * b_lock (trylock due to inversion) + * + * xfs_buftarg_isolate + * lru_lock + * b_lock (trylock due to inversion) + */ static inline int xfs_buf_is_vmapped( @@ -749,6 +775,30 @@ _xfs_buf_read( return xfs_buf_submit(bp); } +/* + * If the caller passed in an ops structure and the buffer doesn't have ops + * assigned, set the ops and use them to verify the contents. If the contents + * cannot be verified, we'll clear XBF_DONE. We assume the buffer has no + * recorded errors and is already in XBF_DONE state. + */ +int +xfs_buf_ensure_ops( + struct xfs_buf *bp, + const struct xfs_buf_ops *ops) +{ + ASSERT(bp->b_flags & XBF_DONE); + ASSERT(bp->b_error == 0); + + if (!ops || bp->b_ops) + return 0; + + bp->b_ops = ops; + bp->b_ops->verify_read(bp); + if (bp->b_error) + bp->b_flags &= ~XBF_DONE; + return bp->b_error; +} + xfs_buf_t * xfs_buf_read_map( struct xfs_buftarg *target, @@ -762,26 +812,32 @@ xfs_buf_read_map( flags |= XBF_READ; bp = xfs_buf_get_map(target, map, nmaps, flags); - if (bp) { - trace_xfs_buf_read(bp, flags, _RET_IP_); + if (!bp) + return NULL; - if (!(bp->b_flags & XBF_DONE)) { - XFS_STATS_INC(target->bt_mount, xb_get_read); - bp->b_ops = ops; - _xfs_buf_read(bp, flags); - } else if (flags & XBF_ASYNC) { - /* - * Read ahead call which is already satisfied, - * drop the buffer - */ - xfs_buf_relse(bp); - return NULL; - } else { - /* We do not want read in the flags */ - bp->b_flags &= ~XBF_READ; - } + trace_xfs_buf_read(bp, flags, _RET_IP_); + + if (!(bp->b_flags & XBF_DONE)) { + XFS_STATS_INC(target->bt_mount, xb_get_read); + bp->b_ops = ops; + _xfs_buf_read(bp, flags); + return bp; } + xfs_buf_ensure_ops(bp, ops); + + if (flags & XBF_ASYNC) { + /* + * Read ahead call which is already satisfied, + * drop the buffer + */ + xfs_buf_relse(bp); + return NULL; + } + + /* We do not want read in the flags */ + bp->b_flags &= ~XBF_READ; + ASSERT(bp->b_ops != NULL || ops == NULL); return bp; } @@ -1006,8 +1062,18 @@ xfs_buf_rele( ASSERT(atomic_read(&bp->b_hold) > 0); - release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock); + /* + * We grab the b_lock here first to serialise racing xfs_buf_rele() + * calls. The pag_buf_lock being taken on the last reference only + * serialises against racing lookups in xfs_buf_find(). IOWs, the second + * to last reference we drop here is not serialised against the last + * reference until we take bp->b_lock. Hence if we don't grab b_lock + * first, the last "release" reference can win the race to the lock and + * free the buffer before the second-to-last reference is processed, + * leading to a use-after-free scenario. + */ spin_lock(&bp->b_lock); + release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock); if (!release) { /* * Drop the in-flight state if the buffer is already on the LRU @@ -1989,6 +2055,13 @@ xfs_buf_delwri_submit_buffers( * is only safely useable for callers that can track I/O completion by higher * level means, e.g. AIL pushing as the @buffer_list is consumed in this * function. + * + * Note: this function will skip buffers it would block on, and in doing so + * leaves them on @buffer_list so they can be retried on a later pass. As such, + * it is up to the caller to ensure that the buffer list is fully submitted or + * cancelled appropriately when they are finished with the list. Failure to + * cancel or resubmit the list until it is empty will result in leaked buffers + * at unmount time. */ int xfs_buf_delwri_submit_nowait( diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 4e3171acd0f8..b9f5511ea998 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -385,4 +385,6 @@ extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int); #define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev) #define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev) +int xfs_buf_ensure_ops(struct xfs_buf *bp, const struct xfs_buf_ops *ops); + #endif /* __XFS_BUF_H__ */ diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 1c9d1398980b..12d8455bfbb2 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -532,6 +532,49 @@ xfs_buf_item_push( } /* + * Drop the buffer log item refcount and take appropriate action. This helper + * determines whether the bli must be freed or not, since a decrement to zero + * does not necessarily mean the bli is unused. + * + * Return true if the bli is freed, false otherwise. + */ +bool +xfs_buf_item_put( + struct xfs_buf_log_item *bip) +{ + struct xfs_log_item *lip = &bip->bli_item; + bool aborted; + bool dirty; + + /* drop the bli ref and return if it wasn't the last one */ + if (!atomic_dec_and_test(&bip->bli_refcount)) + return false; + + /* + * We dropped the last ref and must free the item if clean or aborted. + * If the bli is dirty and non-aborted, the buffer was clean in the + * transaction but still awaiting writeback from previous changes. In + * that case, the bli is freed on buffer writeback completion. + */ + aborted = test_bit(XFS_LI_ABORTED, &lip->li_flags) || + XFS_FORCED_SHUTDOWN(lip->li_mountp); + dirty = bip->bli_flags & XFS_BLI_DIRTY; + if (dirty && !aborted) + return false; + + /* + * The bli is aborted or clean. An aborted item may be in the AIL + * regardless of dirty state. For example, consider an aborted + * transaction that invalidated a dirty bli and cleared the dirty + * state. + */ + if (aborted) + xfs_trans_ail_remove(lip, SHUTDOWN_LOG_IO_ERROR); + xfs_buf_item_relse(bip->bli_buf); + return true; +} + +/* * Release the buffer associated with the buf log item. If there is no dirty * logged data associated with the buffer recorded in the buf log item, then * free the buf log item and remove the reference to it in the buffer. @@ -556,76 +599,42 @@ xfs_buf_item_unlock( { struct xfs_buf_log_item *bip = BUF_ITEM(lip); struct xfs_buf *bp = bip->bli_buf; - bool aborted; - bool hold = !!(bip->bli_flags & XFS_BLI_HOLD); - bool dirty = !!(bip->bli_flags & XFS_BLI_DIRTY); + bool released; + bool hold = bip->bli_flags & XFS_BLI_HOLD; + bool stale = bip->bli_flags & XFS_BLI_STALE; #if defined(DEBUG) || defined(XFS_WARN) - bool ordered = !!(bip->bli_flags & XFS_BLI_ORDERED); + bool ordered = bip->bli_flags & XFS_BLI_ORDERED; + bool dirty = bip->bli_flags & XFS_BLI_DIRTY; #endif - aborted = test_bit(XFS_LI_ABORTED, &lip->li_flags); - - /* Clear the buffer's association with this transaction. */ - bp->b_transp = NULL; - - /* - * The per-transaction state has been copied above so clear it from the - * bli. - */ - bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD | XFS_BLI_ORDERED); - - /* - * If the buf item is marked stale, then don't do anything. We'll - * unlock the buffer and free the buf item when the buffer is unpinned - * for the last time. - */ - if (bip->bli_flags & XFS_BLI_STALE) { - trace_xfs_buf_item_unlock_stale(bip); - ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); - if (!aborted) { - atomic_dec(&bip->bli_refcount); - return; - } - } - trace_xfs_buf_item_unlock(bip); /* - * If the buf item isn't tracking any data, free it, otherwise drop the - * reference we hold to it. If we are aborting the transaction, this may - * be the only reference to the buf item, so we free it anyway - * regardless of whether it is dirty or not. A dirty abort implies a - * shutdown, anyway. - * * The bli dirty state should match whether the blf has logged segments * except for ordered buffers, where only the bli should be dirty. */ ASSERT((!ordered && dirty == xfs_buf_item_dirty_format(bip)) || (ordered && dirty && !xfs_buf_item_dirty_format(bip))); + ASSERT(!stale || (bip->__bli_format.blf_flags & XFS_BLF_CANCEL)); /* - * Clean buffers, by definition, cannot be in the AIL. However, aborted - * buffers may be in the AIL regardless of dirty state. An aborted - * transaction that invalidates a buffer already in the AIL may have - * marked it stale and cleared the dirty state, for example. - * - * Therefore if we are aborting a buffer and we've just taken the last - * reference away, we have to check if it is in the AIL before freeing - * it. We need to free it in this case, because an aborted transaction - * has already shut the filesystem down and this is the last chance we - * will have to do so. + * Clear the buffer's association with this transaction and + * per-transaction state from the bli, which has been copied above. */ - if (atomic_dec_and_test(&bip->bli_refcount)) { - if (aborted) { - ASSERT(XFS_FORCED_SHUTDOWN(lip->li_mountp)); - xfs_trans_ail_remove(lip, SHUTDOWN_LOG_IO_ERROR); - xfs_buf_item_relse(bp); - } else if (!dirty) - xfs_buf_item_relse(bp); - } + bp->b_transp = NULL; + bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD | XFS_BLI_ORDERED); - if (!hold) - xfs_buf_relse(bp); + /* + * Unref the item and unlock the buffer unless held or stale. Stale + * buffers remain locked until final unpin unless the bli is freed by + * the unref call. The latter implies shutdown because buffer + * invalidation dirties the bli and transaction. + */ + released = xfs_buf_item_put(bip); + if (hold || (stale && !released)) + return; + ASSERT(!stale || test_bit(XFS_LI_ABORTED, &lip->li_flags)); + xfs_buf_relse(bp); } /* diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h index 3f7d7b72e7e6..90f65f891fab 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h @@ -51,6 +51,7 @@ struct xfs_buf_log_item { int xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *); void xfs_buf_item_relse(struct xfs_buf *); +bool xfs_buf_item_put(struct xfs_buf_log_item *); void xfs_buf_item_log(struct xfs_buf_log_item *, uint, uint); bool xfs_buf_item_dirty_format(struct xfs_buf_log_item *); void xfs_buf_attach_iodone(struct xfs_buf *, diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 61a5ad2600e8..53c9ab8fb777 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -919,28 +919,67 @@ out_unlock: return error; } -STATIC int -xfs_file_clone_range( - struct file *file_in, - loff_t pos_in, - struct file *file_out, - loff_t pos_out, - u64 len) -{ - return xfs_reflink_remap_range(file_in, pos_in, file_out, pos_out, - len, false); -} -STATIC int -xfs_file_dedupe_range( - struct file *file_in, - loff_t pos_in, - struct file *file_out, - loff_t pos_out, - u64 len) +loff_t +xfs_file_remap_range( + struct file *file_in, + loff_t pos_in, + struct file *file_out, + loff_t pos_out, + loff_t len, + unsigned int remap_flags) { - return xfs_reflink_remap_range(file_in, pos_in, file_out, pos_out, - len, true); + struct inode *inode_in = file_inode(file_in); + struct xfs_inode *src = XFS_I(inode_in); + struct inode *inode_out = file_inode(file_out); + struct xfs_inode *dest = XFS_I(inode_out); + struct xfs_mount *mp = src->i_mount; + loff_t remapped = 0; + xfs_extlen_t cowextsize; + int ret; + + if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY)) + return -EINVAL; + + if (!xfs_sb_version_hasreflink(&mp->m_sb)) + return -EOPNOTSUPP; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + /* Prepare and then clone file data. */ + ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out, + &len, remap_flags); + if (ret < 0 || len == 0) + return ret; + + trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out); + + ret = xfs_reflink_remap_blocks(src, pos_in, dest, pos_out, len, + &remapped); + if (ret) + goto out_unlock; + + /* + * Carry the cowextsize hint from src to dest if we're sharing the + * entire source file to the entire destination file, the source file + * has a cowextsize hint, and the destination file does not. + */ + cowextsize = 0; + if (pos_in == 0 && len == i_size_read(inode_in) && + (src->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) && + pos_out == 0 && len >= i_size_read(inode_out) && + !(dest->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE)) + cowextsize = src->i_d.di_cowextsize; + + ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize, + remap_flags); + +out_unlock: + xfs_reflink_remap_unlock(file_in, file_out); + if (ret) + trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_); + return remapped > 0 ? remapped : ret; } STATIC int @@ -1175,8 +1214,7 @@ const struct file_operations xfs_file_operations = { .fsync = xfs_file_fsync, .get_unmapped_area = thp_get_unmapped_area, .fallocate = xfs_file_fallocate, - .clone_file_range = xfs_file_clone_range, - .dedupe_file_range = xfs_file_dedupe_range, + .remap_file_range = xfs_file_remap_range, }; const struct file_operations xfs_dir_file_operations = { diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 7c00b8bedfe3..093c2b8d7e20 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -470,20 +470,13 @@ xfs_fs_goingdown( */ void xfs_do_force_shutdown( - xfs_mount_t *mp, + struct xfs_mount *mp, int flags, char *fname, int lnnum) { - int logerror; - - logerror = flags & SHUTDOWN_LOG_IO_ERROR; + bool logerror = flags & SHUTDOWN_LOG_IO_ERROR; - if (!(flags & SHUTDOWN_FORCE_UMOUNT)) { - xfs_notice(mp, - "%s(0x%x) called from line %d of file %s. Return address = "PTR_FMT, - __func__, flags, lnnum, fname, __return_address); - } /* * No need to duplicate efforts. */ @@ -499,27 +492,34 @@ xfs_do_force_shutdown( if (xfs_log_force_umount(mp, logerror)) return; + if (flags & SHUTDOWN_FORCE_UMOUNT) { + xfs_alert(mp, +"User initiated shutdown received. Shutting down filesystem"); + return; + } + + xfs_notice(mp, +"%s(0x%x) called from line %d of file %s. Return address = "PTR_FMT, + __func__, flags, lnnum, fname, __return_address); + if (flags & SHUTDOWN_CORRUPT_INCORE) { xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_CORRUPT, - "Corruption of in-memory data detected. Shutting down filesystem"); +"Corruption of in-memory data detected. Shutting down filesystem"); if (XFS_ERRLEVEL_HIGH <= xfs_error_level) xfs_stack_trace(); - } else if (!(flags & SHUTDOWN_FORCE_UMOUNT)) { - if (logerror) { - xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_LOGERROR, - "Log I/O Error Detected. Shutting down filesystem"); - } else if (flags & SHUTDOWN_DEVICE_REQ) { - xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR, - "All device paths lost. Shutting down filesystem"); - } else if (!(flags & SHUTDOWN_REMOTE_REQ)) { - xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR, - "I/O Error Detected. Shutting down filesystem"); - } - } - if (!(flags & SHUTDOWN_FORCE_UMOUNT)) { - xfs_alert(mp, - "Please umount the filesystem and rectify the problem(s)"); + } else if (logerror) { + xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_LOGERROR, + "Log I/O Error Detected. Shutting down filesystem"); + } else if (flags & SHUTDOWN_DEVICE_REQ) { + xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR, + "All device paths lost. Shutting down filesystem"); + } else if (!(flags & SHUTDOWN_REMOTE_REQ)) { + xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR, + "I/O Error Detected. Shutting down filesystem"); } + + xfs_alert(mp, + "Please unmount the filesystem and rectify the problem(s)"); } /* diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index d957a46dc1cb..05db9540e459 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1563,7 +1563,7 @@ xfs_itruncate_extents_flags( error = xfs_bunmapi(tp, ip, first_unmap_block, unmap_len, flags, XFS_ITRUNC_MAX_EXTENTS, &done); if (error) - goto out_bmap_cancel; + goto out; /* * Duplicate the transaction that has the permanent @@ -1599,14 +1599,6 @@ xfs_itruncate_extents_flags( out: *tpp = tp; return error; -out_bmap_cancel: - /* - * If the bunmapi call encounters an error, return to the caller where - * the transaction can be properly aborted. We just need to make sure - * we're not holding any resources that we were not when we came in. - */ - xfs_defer_cancel(tp); - goto out; } int diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 0ef5ece5634c..6e2c08f30f60 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -604,14 +604,6 @@ xfs_ioc_space( uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; int error; - /* - * Only allow the sys admin to reserve space unless - * unwritten extents are enabled. - */ - if (!xfs_sb_version_hasextflgbit(&ip->i_mount->m_sb) && - !capable(CAP_SYS_ADMIN)) - return -EPERM; - if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) return -EPERM; diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 6320aca39f39..27c93b5f029d 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -62,6 +62,21 @@ xfs_bmbt_to_iomap( iomap->dax_dev = xfs_find_daxdev_for_inode(VFS_I(ip)); } +static void +xfs_hole_to_iomap( + struct xfs_inode *ip, + struct iomap *iomap, + xfs_fileoff_t offset_fsb, + xfs_fileoff_t end_fsb) +{ + iomap->addr = IOMAP_NULL_ADDR; + iomap->type = IOMAP_HOLE; + iomap->offset = XFS_FSB_TO_B(ip->i_mount, offset_fsb); + iomap->length = XFS_FSB_TO_B(ip->i_mount, end_fsb - offset_fsb); + iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip)); + iomap->dax_dev = xfs_find_daxdev_for_inode(VFS_I(ip)); +} + xfs_extlen_t xfs_eof_alignment( struct xfs_inode *ip, @@ -502,6 +517,7 @@ xfs_file_iomap_begin_delay( struct inode *inode, loff_t offset, loff_t count, + unsigned flags, struct iomap *iomap) { struct xfs_inode *ip = XFS_I(inode); @@ -538,15 +554,23 @@ xfs_file_iomap_begin_delay( goto out_unlock; } + end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb); + eof = !xfs_iext_lookup_extent(ip, ifp, offset_fsb, &icur, &got); - if (!eof && got.br_startoff <= offset_fsb) { - if (xfs_is_reflink_inode(ip)) { - bool shared; + if (eof) + got.br_startoff = end_fsb; /* fake hole until the end */ - end_fsb = min(XFS_B_TO_FSB(mp, offset + count), - maxbytes_fsb); + if (got.br_startoff <= offset_fsb) { + /* + * For reflink files we may need a delalloc reservation when + * overwriting shared extents. This includes zeroing of + * existing extents that contain data. + */ + if (xfs_is_reflink_inode(ip) && + ((flags & IOMAP_WRITE) || + got.br_state != XFS_EXT_UNWRITTEN)) { xfs_trim_extent(&got, offset_fsb, end_fsb - offset_fsb); - error = xfs_reflink_reserve_cow(ip, &got, &shared); + error = xfs_reflink_reserve_cow(ip, &got); if (error) goto out_unlock; } @@ -555,6 +579,11 @@ xfs_file_iomap_begin_delay( goto done; } + if (flags & IOMAP_ZERO) { + xfs_hole_to_iomap(ip, iomap, offset_fsb, got.br_startoff); + goto out_unlock; + } + error = xfs_qm_dqattach_locked(ip, false); if (error) goto out_unlock; @@ -1003,16 +1032,17 @@ xfs_file_iomap_begin( struct xfs_bmbt_irec imap; xfs_fileoff_t offset_fsb, end_fsb; int nimaps = 1, error = 0; - bool shared = false, trimmed = false; + bool shared = false; unsigned lockmode; if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; - if (((flags & (IOMAP_WRITE | IOMAP_DIRECT)) == IOMAP_WRITE) && + if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) && !(flags & IOMAP_DIRECT) && !IS_DAX(inode) && !xfs_get_extsz_hint(ip)) { /* Reserve delalloc blocks for regular writeback. */ - return xfs_file_iomap_begin_delay(inode, offset, length, iomap); + return xfs_file_iomap_begin_delay(inode, offset, length, flags, + iomap); } /* @@ -1038,8 +1068,7 @@ xfs_file_iomap_begin( if (flags & IOMAP_REPORT) { /* Trim the mapping to the nearest shared extent boundary. */ - error = xfs_reflink_trim_around_shared(ip, &imap, &shared, - &trimmed); + error = xfs_reflink_trim_around_shared(ip, &imap, &shared); if (error) goto out_unlock; } @@ -1065,7 +1094,7 @@ xfs_file_iomap_begin( if (error) goto out_unlock; } else { - error = xfs_reflink_reserve_cow(ip, &imap, &shared); + error = xfs_reflink_reserve_cow(ip, &imap); if (error) goto out_unlock; } diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index c3e74f9128e8..f48ffd7a8d3e 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -471,8 +471,18 @@ xfs_vn_get_link_inline( struct inode *inode, struct delayed_call *done) { + char *link; + ASSERT(XFS_I(inode)->i_df.if_flags & XFS_IFINLINE); - return XFS_I(inode)->i_df.if_u1.if_data; + + /* + * The VFS crashes on a NULL pointer, so return -EFSCORRUPTED if + * if_data is junk. + */ + link = XFS_I(inode)->i_df.if_u1.if_data; + if (!link) + return ERR_PTR(-EFSCORRUPTED); + return link; } STATIC int diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index a21dc61ec09e..1fc9e9042e0e 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -1570,16 +1570,6 @@ xlog_find_zeroed( if (last_cycle != 0) { /* log completely written to */ xlog_put_bp(bp); return 0; - } else if (first_cycle != 1) { - /* - * If the cycle of the last block is zero, the cycle of - * the first block must be 1. If it's not, maybe we're - * not looking at a log... Bail out. - */ - xfs_warn(log->l_mp, - "Log inconsistent or not a log (last==0, first!=1)"); - error = -EINVAL; - goto bp_err; } /* we have a partially zeroed log */ diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 38f405415b88..ecdb086bc23e 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -182,8 +182,7 @@ int xfs_reflink_trim_around_shared( struct xfs_inode *ip, struct xfs_bmbt_irec *irec, - bool *shared, - bool *trimmed) + bool *shared) { xfs_agnumber_t agno; xfs_agblock_t agbno; @@ -209,7 +208,7 @@ xfs_reflink_trim_around_shared( if (error) return error; - *shared = *trimmed = false; + *shared = false; if (fbno == NULLAGBLOCK) { /* No shared blocks at all. */ return 0; @@ -222,8 +221,6 @@ xfs_reflink_trim_around_shared( */ irec->br_blockcount = flen; *shared = true; - if (flen != aglen) - *trimmed = true; return 0; } else { /* @@ -233,7 +230,6 @@ xfs_reflink_trim_around_shared( * start of the shared region. */ irec->br_blockcount = fbno - agbno; - *trimmed = true; return 0; } } @@ -241,7 +237,7 @@ xfs_reflink_trim_around_shared( /* * Trim the passed in imap to the next shared/unshared extent boundary, and * if imap->br_startoff points to a shared extent reserve space for it in the - * COW fork. In this case *shared is set to true, else to false. + * COW fork. * * Note that imap will always contain the block numbers for the existing blocks * in the data fork, as the upper layers need them for read-modify-write @@ -250,14 +246,14 @@ xfs_reflink_trim_around_shared( int xfs_reflink_reserve_cow( struct xfs_inode *ip, - struct xfs_bmbt_irec *imap, - bool *shared) + struct xfs_bmbt_irec *imap) { struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); struct xfs_bmbt_irec got; int error = 0; - bool eof = false, trimmed; + bool eof = false; struct xfs_iext_cursor icur; + bool shared; /* * Search the COW fork extent list first. This serves two purposes: @@ -273,18 +269,16 @@ xfs_reflink_reserve_cow( if (!eof && got.br_startoff <= imap->br_startoff) { trace_xfs_reflink_cow_found(ip, imap); xfs_trim_extent(imap, got.br_startoff, got.br_blockcount); - - *shared = true; return 0; } /* Trim the mapping to the nearest shared extent boundary. */ - error = xfs_reflink_trim_around_shared(ip, imap, shared, &trimmed); + error = xfs_reflink_trim_around_shared(ip, imap, &shared); if (error) return error; /* Not shared? Just report the (potentially capped) extent. */ - if (!*shared) + if (!shared) return 0; /* @@ -352,6 +346,50 @@ xfs_reflink_convert_cow( return error; } +/* + * Find the extent that maps the given range in the COW fork. Even if the extent + * is not shared we might have a preallocation for it in the COW fork. If so we + * use it that rather than trigger a new allocation. + */ +static int +xfs_find_trim_cow_extent( + struct xfs_inode *ip, + struct xfs_bmbt_irec *imap, + bool *shared, + bool *found) +{ + xfs_fileoff_t offset_fsb = imap->br_startoff; + xfs_filblks_t count_fsb = imap->br_blockcount; + struct xfs_iext_cursor icur; + struct xfs_bmbt_irec got; + + *found = false; + + /* + * If we don't find an overlapping extent, trim the range we need to + * allocate to fit the hole we found. + */ + if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got)) + got.br_startoff = offset_fsb + count_fsb; + if (got.br_startoff > offset_fsb) { + xfs_trim_extent(imap, imap->br_startoff, + got.br_startoff - imap->br_startoff); + return xfs_reflink_trim_around_shared(ip, imap, shared); + } + + *shared = true; + if (isnullstartblock(got.br_startblock)) { + xfs_trim_extent(imap, got.br_startoff, got.br_blockcount); + return 0; + } + + /* real extent found - no need to allocate */ + xfs_trim_extent(&got, offset_fsb, count_fsb); + *imap = got; + *found = true; + return 0; +} + /* Allocate all CoW reservations covering a range of blocks in a file. */ int xfs_reflink_allocate_cow( @@ -363,78 +401,64 @@ xfs_reflink_allocate_cow( struct xfs_mount *mp = ip->i_mount; xfs_fileoff_t offset_fsb = imap->br_startoff; xfs_filblks_t count_fsb = imap->br_blockcount; - struct xfs_bmbt_irec got; - struct xfs_trans *tp = NULL; + struct xfs_trans *tp; int nimaps, error = 0; - bool trimmed; + bool found; xfs_filblks_t resaligned; xfs_extlen_t resblks = 0; - struct xfs_iext_cursor icur; -retry: - ASSERT(xfs_is_reflink_inode(ip)); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + ASSERT(xfs_is_reflink_inode(ip)); - /* - * Even if the extent is not shared we might have a preallocation for - * it in the COW fork. If so use it. - */ - if (xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got) && - got.br_startoff <= offset_fsb) { - *shared = true; - - /* If we have a real allocation in the COW fork we're done. */ - if (!isnullstartblock(got.br_startblock)) { - xfs_trim_extent(&got, offset_fsb, count_fsb); - *imap = got; - goto convert; - } + error = xfs_find_trim_cow_extent(ip, imap, shared, &found); + if (error || !*shared) + return error; + if (found) + goto convert; - xfs_trim_extent(imap, got.br_startoff, got.br_blockcount); - } else { - error = xfs_reflink_trim_around_shared(ip, imap, shared, &trimmed); - if (error || !*shared) - goto out; - } + resaligned = xfs_aligned_fsb_count(imap->br_startoff, + imap->br_blockcount, xfs_get_cowextsz_hint(ip)); + resblks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned); - if (!tp) { - resaligned = xfs_aligned_fsb_count(imap->br_startoff, - imap->br_blockcount, xfs_get_cowextsz_hint(ip)); - resblks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned); + xfs_iunlock(ip, *lockmode); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp); + *lockmode = XFS_ILOCK_EXCL; + xfs_ilock(ip, *lockmode); - xfs_iunlock(ip, *lockmode); - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp); - *lockmode = XFS_ILOCK_EXCL; - xfs_ilock(ip, *lockmode); + if (error) + return error; - if (error) - return error; + error = xfs_qm_dqattach_locked(ip, false); + if (error) + goto out_trans_cancel; - error = xfs_qm_dqattach_locked(ip, false); - if (error) - goto out; - goto retry; + /* + * Check for an overlapping extent again now that we dropped the ilock. + */ + error = xfs_find_trim_cow_extent(ip, imap, shared, &found); + if (error || !*shared) + goto out_trans_cancel; + if (found) { + xfs_trans_cancel(tp); + goto convert; } error = xfs_trans_reserve_quota_nblks(tp, ip, resblks, 0, XFS_QMOPT_RES_REGBLKS); if (error) - goto out; + goto out_trans_cancel; xfs_trans_ijoin(tp, ip, 0); - nimaps = 1; - /* Allocate the entire reservation as unwritten blocks. */ + nimaps = 1; error = xfs_bmapi_write(tp, ip, imap->br_startoff, imap->br_blockcount, XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC, resblks, imap, &nimaps); if (error) - goto out_trans_cancel; + goto out_unreserve; xfs_inode_set_cowblocks_tag(ip); - - /* Finish up. */ error = xfs_trans_commit(tp); if (error) return error; @@ -447,12 +471,12 @@ retry: return -ENOSPC; convert: return xfs_reflink_convert_cow_extent(ip, imap, offset_fsb, count_fsb); -out_trans_cancel: + +out_unreserve: xfs_trans_unreserve_quota_nblks(tp, ip, (long)resblks, 0, XFS_QMOPT_RES_REGBLKS); -out: - if (tp) - xfs_trans_cancel(tp); +out_trans_cancel: + xfs_trans_cancel(tp); return error; } @@ -666,14 +690,12 @@ xfs_reflink_end_cow( if (!del.br_blockcount) goto prev_extent; - ASSERT(!isnullstartblock(got.br_startblock)); - /* - * Don't remap unwritten extents; these are - * speculatively preallocated CoW extents that have been - * allocated but have not yet been involved in a write. + * Only remap real extent that contain data. With AIO + * speculatively preallocations can leak into the range we + * are called upon, and we need to skip them. */ - if (got.br_state == XFS_EXT_UNWRITTEN) + if (!xfs_bmap_is_real_extent(&got)) goto prev_extent; /* Unmap the old blocks in the data fork. */ @@ -891,18 +913,18 @@ out_error: /* * Update destination inode size & cowextsize hint, if necessary. */ -STATIC int +int xfs_reflink_update_dest( struct xfs_inode *dest, xfs_off_t newlen, xfs_extlen_t cowextsize, - bool is_dedupe) + unsigned int remap_flags) { struct xfs_mount *mp = dest->i_mount; struct xfs_trans *tp; int error; - if (is_dedupe && newlen <= i_size_read(VFS_I(dest)) && cowextsize == 0) + if (newlen <= i_size_read(VFS_I(dest)) && cowextsize == 0) return 0; error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp); @@ -923,10 +945,6 @@ xfs_reflink_update_dest( dest->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; } - if (!is_dedupe) { - xfs_trans_ichgtime(tp, dest, - XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); - } xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE); error = xfs_trans_commit(tp); @@ -1090,19 +1108,28 @@ out: /* * Iteratively remap one file's extents (and holes) to another's. */ -STATIC int +int xfs_reflink_remap_blocks( struct xfs_inode *src, - xfs_fileoff_t srcoff, + loff_t pos_in, struct xfs_inode *dest, - xfs_fileoff_t destoff, - xfs_filblks_t len, - xfs_off_t new_isize) + loff_t pos_out, + loff_t remap_len, + loff_t *remapped) { struct xfs_bmbt_irec imap; + xfs_fileoff_t srcoff; + xfs_fileoff_t destoff; + xfs_filblks_t len; + xfs_filblks_t range_len; + xfs_filblks_t remapped_len = 0; + xfs_off_t new_isize = pos_out + remap_len; int nimaps; int error = 0; - xfs_filblks_t range_len; + + destoff = XFS_B_TO_FSBT(src->i_mount, pos_out); + srcoff = XFS_B_TO_FSBT(src->i_mount, pos_in); + len = XFS_B_TO_FSB(src->i_mount, remap_len); /* drange = (destoff, destoff + len); srange = (srcoff, srcoff + len) */ while (len) { @@ -1117,7 +1144,7 @@ xfs_reflink_remap_blocks( error = xfs_bmapi_read(src, srcoff, len, &imap, &nimaps, 0); xfs_iunlock(src, lock_mode); if (error) - goto err; + break; ASSERT(nimaps == 1); trace_xfs_reflink_remap_imap(src, srcoff, len, XFS_IO_OVERWRITE, @@ -1131,23 +1158,24 @@ xfs_reflink_remap_blocks( error = xfs_reflink_remap_extent(dest, &imap, destoff, new_isize); if (error) - goto err; + break; if (fatal_signal_pending(current)) { error = -EINTR; - goto err; + break; } /* Advance drange/srange */ srcoff += range_len; destoff += range_len; len -= range_len; + remapped_len += range_len; } - return 0; - -err: - trace_xfs_reflink_remap_blocks_error(dest, error, _RET_IP_); + if (error) + trace_xfs_reflink_remap_blocks_error(dest, error, _RET_IP_); + *remapped = min_t(loff_t, remap_len, + XFS_FSB_TO_B(src->i_mount, remapped_len)); return error; } @@ -1195,35 +1223,91 @@ retry: return 0; } +/* Unlock both inodes after they've been prepped for a range clone. */ +void +xfs_reflink_remap_unlock( + struct file *file_in, + struct file *file_out) +{ + struct inode *inode_in = file_inode(file_in); + struct xfs_inode *src = XFS_I(inode_in); + struct inode *inode_out = file_inode(file_out); + struct xfs_inode *dest = XFS_I(inode_out); + bool same_inode = (inode_in == inode_out); + + xfs_iunlock(dest, XFS_MMAPLOCK_EXCL); + if (!same_inode) + xfs_iunlock(src, XFS_MMAPLOCK_SHARED); + inode_unlock(inode_out); + if (!same_inode) + inode_unlock_shared(inode_in); +} + /* - * Link a range of blocks from one file to another. + * If we're reflinking to a point past the destination file's EOF, we must + * zero any speculative post-EOF preallocations that sit between the old EOF + * and the destination file offset. + */ +static int +xfs_reflink_zero_posteof( + struct xfs_inode *ip, + loff_t pos) +{ + loff_t isize = i_size_read(VFS_I(ip)); + + if (pos <= isize) + return 0; + + trace_xfs_zero_eof(ip, isize, pos - isize); + return iomap_zero_range(VFS_I(ip), isize, pos - isize, NULL, + &xfs_iomap_ops); +} + +/* + * Prepare two files for range cloning. Upon a successful return both inodes + * will have the iolock and mmaplock held, the page cache of the out file will + * be truncated, and any leases on the out file will have been broken. This + * function borrows heavily from xfs_file_aio_write_checks. + * + * The VFS allows partial EOF blocks to "match" for dedupe even though it hasn't + * checked that the bytes beyond EOF physically match. Hence we cannot use the + * EOF block in the source dedupe range because it's not a complete block match, + * hence can introduce a corruption into the file that has it's block replaced. + * + * In similar fashion, the VFS file cloning also allows partial EOF blocks to be + * "block aligned" for the purposes of cloning entire files. However, if the + * source file range includes the EOF block and it lands within the existing EOF + * of the destination file, then we can expose stale data from beyond the source + * file EOF in the destination file. + * + * XFS doesn't support partial block sharing, so in both cases we have check + * these cases ourselves. For dedupe, we can simply round the length to dedupe + * down to the previous whole block and ignore the partial EOF block. While this + * means we can't dedupe the last block of a file, this is an acceptible + * tradeoff for simplicity on implementation. + * + * For cloning, we want to share the partial EOF block if it is also the new EOF + * block of the destination file. If the partial EOF block lies inside the + * existing destination EOF, then we have to abort the clone to avoid exposing + * stale data in the destination file. Hence we reject these clone attempts with + * -EINVAL in this case. */ int -xfs_reflink_remap_range( +xfs_reflink_remap_prep( struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, - u64 len, - bool is_dedupe) + loff_t *len, + unsigned int remap_flags) { struct inode *inode_in = file_inode(file_in); struct xfs_inode *src = XFS_I(inode_in); struct inode *inode_out = file_inode(file_out); struct xfs_inode *dest = XFS_I(inode_out); - struct xfs_mount *mp = src->i_mount; bool same_inode = (inode_in == inode_out); - xfs_fileoff_t sfsbno, dfsbno; - xfs_filblks_t fsblen; - xfs_extlen_t cowextsize; ssize_t ret; - if (!xfs_sb_version_hasreflink(&mp->m_sb)) - return -EOPNOTSUPP; - - if (XFS_FORCED_SHUTDOWN(mp)) - return -EIO; - /* Lock both files against IO */ ret = xfs_iolock_two_inodes_and_break_layout(inode_in, inode_out); if (ret) @@ -1244,9 +1328,9 @@ xfs_reflink_remap_range( if (IS_DAX(inode_in) || IS_DAX(inode_out)) goto out_unlock; - ret = vfs_clone_file_prep_inodes(inode_in, pos_in, inode_out, pos_out, - &len, is_dedupe); - if (ret <= 0) + ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out, + len, remap_flags); + if (ret < 0 || *len == 0) goto out_unlock; /* Attach dquots to dest inode before changing block map */ @@ -1254,60 +1338,27 @@ xfs_reflink_remap_range( if (ret) goto out_unlock; - trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out); - /* - * Clear out post-eof preallocations because we don't have page cache - * backing the delayed allocations and they'll never get freed on - * their own. + * Zero existing post-eof speculative preallocations in the destination + * file. */ - if (xfs_can_free_eofblocks(dest, true)) { - ret = xfs_free_eofblocks(dest); - if (ret) - goto out_unlock; - } - - /* Set flags and remap blocks. */ - ret = xfs_reflink_set_inode_flag(src, dest); + ret = xfs_reflink_zero_posteof(dest, pos_out); if (ret) goto out_unlock; - dfsbno = XFS_B_TO_FSBT(mp, pos_out); - sfsbno = XFS_B_TO_FSBT(mp, pos_in); - fsblen = XFS_B_TO_FSB(mp, len); - ret = xfs_reflink_remap_blocks(src, sfsbno, dest, dfsbno, fsblen, - pos_out + len); + /* Set flags and remap blocks. */ + ret = xfs_reflink_set_inode_flag(src, dest); if (ret) goto out_unlock; /* Zap any page cache for the destination file's range. */ - truncate_inode_pages_range(&inode_out->i_data, pos_out, - PAGE_ALIGN(pos_out + len) - 1); - - /* - * Carry the cowextsize hint from src to dest if we're sharing the - * entire source file to the entire destination file, the source file - * has a cowextsize hint, and the destination file does not. - */ - cowextsize = 0; - if (pos_in == 0 && len == i_size_read(inode_in) && - (src->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) && - pos_out == 0 && len >= i_size_read(inode_out) && - !(dest->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE)) - cowextsize = src->i_d.di_cowextsize; - - ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize, - is_dedupe); + truncate_inode_pages_range(&inode_out->i_data, + round_down(pos_out, PAGE_SIZE), + round_up(pos_out + *len, PAGE_SIZE) - 1); + return 1; out_unlock: - xfs_iunlock(dest, XFS_MMAPLOCK_EXCL); - if (!same_inode) - xfs_iunlock(src, XFS_MMAPLOCK_SHARED); - inode_unlock(inode_out); - if (!same_inode) - inode_unlock_shared(inode_in); - if (ret) - trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_); + xfs_reflink_remap_unlock(file_in, file_out); return ret; } diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h index c585ad9552b2..6d73daef1f13 100644 --- a/fs/xfs/xfs_reflink.h +++ b/fs/xfs/xfs_reflink.h @@ -10,10 +10,10 @@ extern int xfs_reflink_find_shared(struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, xfs_agblock_t agbno, xfs_extlen_t aglen, xfs_agblock_t *fbno, xfs_extlen_t *flen, bool find_maximal); extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip, - struct xfs_bmbt_irec *irec, bool *shared, bool *trimmed); + struct xfs_bmbt_irec *irec, bool *shared); extern int xfs_reflink_reserve_cow(struct xfs_inode *ip, - struct xfs_bmbt_irec *imap, bool *shared); + struct xfs_bmbt_irec *imap); extern int xfs_reflink_allocate_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap, bool *shared, uint *lockmode); extern int xfs_reflink_convert_cow(struct xfs_inode *ip, xfs_off_t offset, @@ -27,13 +27,24 @@ extern int xfs_reflink_cancel_cow_range(struct xfs_inode *ip, xfs_off_t offset, extern int xfs_reflink_end_cow(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t count); extern int xfs_reflink_recover_cow(struct xfs_mount *mp); -extern int xfs_reflink_remap_range(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, u64 len, bool is_dedupe); +extern loff_t xfs_reflink_remap_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, loff_t len, + unsigned int remap_flags); extern int xfs_reflink_inode_has_shared_extents(struct xfs_trans *tp, struct xfs_inode *ip, bool *has_shared); extern int xfs_reflink_clear_inode_flag(struct xfs_inode *ip, struct xfs_trans **tpp); extern int xfs_reflink_unshare(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t len); +extern int xfs_reflink_remap_prep(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, loff_t *len, + unsigned int remap_flags); +extern int xfs_reflink_remap_blocks(struct xfs_inode *src, loff_t pos_in, + struct xfs_inode *dest, loff_t pos_out, loff_t remap_len, + loff_t *remapped); +extern int xfs_reflink_update_dest(struct xfs_inode *dest, xfs_off_t newlen, + xfs_extlen_t cowextsize, unsigned int remap_flags); +extern void xfs_reflink_remap_unlock(struct file *file_in, + struct file *file_out); #endif /* __XFS_REFLINK_H */ diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c index 4e4423153071..cc509743facd 100644 --- a/fs/xfs/xfs_stats.c +++ b/fs/xfs/xfs_stats.c @@ -29,30 +29,30 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf) char *desc; int endpoint; } xstats[] = { - { "extent_alloc", XFSSTAT_END_EXTENT_ALLOC }, - { "abt", XFSSTAT_END_ALLOC_BTREE }, - { "blk_map", XFSSTAT_END_BLOCK_MAPPING }, - { "bmbt", XFSSTAT_END_BLOCK_MAP_BTREE }, - { "dir", XFSSTAT_END_DIRECTORY_OPS }, - { "trans", XFSSTAT_END_TRANSACTIONS }, - { "ig", XFSSTAT_END_INODE_OPS }, - { "log", XFSSTAT_END_LOG_OPS }, - { "push_ail", XFSSTAT_END_TAIL_PUSHING }, - { "xstrat", XFSSTAT_END_WRITE_CONVERT }, - { "rw", XFSSTAT_END_READ_WRITE_OPS }, - { "attr", XFSSTAT_END_ATTRIBUTE_OPS }, - { "icluster", XFSSTAT_END_INODE_CLUSTER }, - { "vnodes", XFSSTAT_END_VNODE_OPS }, - { "buf", XFSSTAT_END_BUF }, - { "abtb2", XFSSTAT_END_ABTB_V2 }, - { "abtc2", XFSSTAT_END_ABTC_V2 }, - { "bmbt2", XFSSTAT_END_BMBT_V2 }, - { "ibt2", XFSSTAT_END_IBT_V2 }, - { "fibt2", XFSSTAT_END_FIBT_V2 }, - { "rmapbt", XFSSTAT_END_RMAP_V2 }, - { "refcntbt", XFSSTAT_END_REFCOUNT }, + { "extent_alloc", xfsstats_offset(xs_abt_lookup) }, + { "abt", xfsstats_offset(xs_blk_mapr) }, + { "blk_map", xfsstats_offset(xs_bmbt_lookup) }, + { "bmbt", xfsstats_offset(xs_dir_lookup) }, + { "dir", xfsstats_offset(xs_trans_sync) }, + { "trans", xfsstats_offset(xs_ig_attempts) }, + { "ig", xfsstats_offset(xs_log_writes) }, + { "log", xfsstats_offset(xs_try_logspace)}, + { "push_ail", xfsstats_offset(xs_xstrat_quick)}, + { "xstrat", xfsstats_offset(xs_write_calls) }, + { "rw", xfsstats_offset(xs_attr_get) }, + { "attr", xfsstats_offset(xs_iflush_count)}, + { "icluster", xfsstats_offset(vn_active) }, + { "vnodes", xfsstats_offset(xb_get) }, + { "buf", xfsstats_offset(xs_abtb_2) }, + { "abtb2", xfsstats_offset(xs_abtc_2) }, + { "abtc2", xfsstats_offset(xs_bmbt_2) }, + { "bmbt2", xfsstats_offset(xs_ibt_2) }, + { "ibt2", xfsstats_offset(xs_fibt_2) }, + { "fibt2", xfsstats_offset(xs_rmap_2) }, + { "rmapbt", xfsstats_offset(xs_refcbt_2) }, + { "refcntbt", xfsstats_offset(xs_qm_dqreclaims)}, /* we print both series of quota information together */ - { "qm", XFSSTAT_END_QM }, + { "qm", xfsstats_offset(xs_xstrat_bytes)}, }; /* Loop over all stats groups */ @@ -104,6 +104,10 @@ void xfs_stats_clearall(struct xfsstats __percpu *stats) #ifdef CONFIG_PROC_FS /* legacy quota interfaces */ #ifdef CONFIG_XFS_QUOTA + +#define XFSSTAT_START_XQMSTAT xfsstats_offset(xs_qm_dqreclaims) +#define XFSSTAT_END_XQMSTAT xfsstats_offset(xs_qm_dquot) + static int xqm_proc_show(struct seq_file *m, void *v) { /* maximum; incore; ratio free to inuse; freelist */ @@ -119,7 +123,7 @@ static int xqmstat_proc_show(struct seq_file *m, void *v) int j; seq_printf(m, "qm"); - for (j = XFSSTAT_END_IBT_V2; j < XFSSTAT_END_XQMSTAT; j++) + for (j = XFSSTAT_START_XQMSTAT; j < XFSSTAT_END_XQMSTAT; j++) seq_printf(m, " %u", counter_val(xfsstats.xs_stats, j)); seq_putc(m, '\n'); return 0; diff --git a/fs/xfs/xfs_stats.h b/fs/xfs/xfs_stats.h index 130db070e4d8..34d704f703d2 100644 --- a/fs/xfs/xfs_stats.h +++ b/fs/xfs/xfs_stats.h @@ -41,17 +41,14 @@ enum { * XFS global statistics */ struct __xfsstats { -# define XFSSTAT_END_EXTENT_ALLOC 4 uint32_t xs_allocx; uint32_t xs_allocb; uint32_t xs_freex; uint32_t xs_freeb; -# define XFSSTAT_END_ALLOC_BTREE (XFSSTAT_END_EXTENT_ALLOC+4) uint32_t xs_abt_lookup; uint32_t xs_abt_compare; uint32_t xs_abt_insrec; uint32_t xs_abt_delrec; -# define XFSSTAT_END_BLOCK_MAPPING (XFSSTAT_END_ALLOC_BTREE+7) uint32_t xs_blk_mapr; uint32_t xs_blk_mapw; uint32_t xs_blk_unmap; @@ -59,21 +56,17 @@ struct __xfsstats { uint32_t xs_del_exlist; uint32_t xs_look_exlist; uint32_t xs_cmp_exlist; -# define XFSSTAT_END_BLOCK_MAP_BTREE (XFSSTAT_END_BLOCK_MAPPING+4) uint32_t xs_bmbt_lookup; uint32_t xs_bmbt_compare; uint32_t xs_bmbt_insrec; uint32_t xs_bmbt_delrec; -# define XFSSTAT_END_DIRECTORY_OPS (XFSSTAT_END_BLOCK_MAP_BTREE+4) uint32_t xs_dir_lookup; uint32_t xs_dir_create; uint32_t xs_dir_remove; uint32_t xs_dir_getdents; -# define XFSSTAT_END_TRANSACTIONS (XFSSTAT_END_DIRECTORY_OPS+3) uint32_t xs_trans_sync; uint32_t xs_trans_async; uint32_t xs_trans_empty; -# define XFSSTAT_END_INODE_OPS (XFSSTAT_END_TRANSACTIONS+7) uint32_t xs_ig_attempts; uint32_t xs_ig_found; uint32_t xs_ig_frecycle; @@ -81,13 +74,11 @@ struct __xfsstats { uint32_t xs_ig_dup; uint32_t xs_ig_reclaims; uint32_t xs_ig_attrchg; -# define XFSSTAT_END_LOG_OPS (XFSSTAT_END_INODE_OPS+5) uint32_t xs_log_writes; uint32_t xs_log_blocks; uint32_t xs_log_noiclogs; uint32_t xs_log_force; uint32_t xs_log_force_sleep; -# define XFSSTAT_END_TAIL_PUSHING (XFSSTAT_END_LOG_OPS+10) uint32_t xs_try_logspace; uint32_t xs_sleep_logspace; uint32_t xs_push_ail; @@ -98,22 +89,17 @@ struct __xfsstats { uint32_t xs_push_ail_flushing; uint32_t xs_push_ail_restarts; uint32_t xs_push_ail_flush; -# define XFSSTAT_END_WRITE_CONVERT (XFSSTAT_END_TAIL_PUSHING+2) uint32_t xs_xstrat_quick; uint32_t xs_xstrat_split; -# define XFSSTAT_END_READ_WRITE_OPS (XFSSTAT_END_WRITE_CONVERT+2) uint32_t xs_write_calls; uint32_t xs_read_calls; -# define XFSSTAT_END_ATTRIBUTE_OPS (XFSSTAT_END_READ_WRITE_OPS+4) uint32_t xs_attr_get; uint32_t xs_attr_set; uint32_t xs_attr_remove; uint32_t xs_attr_list; -# define XFSSTAT_END_INODE_CLUSTER (XFSSTAT_END_ATTRIBUTE_OPS+3) uint32_t xs_iflush_count; uint32_t xs_icluster_flushcnt; uint32_t xs_icluster_flushinode; -# define XFSSTAT_END_VNODE_OPS (XFSSTAT_END_INODE_CLUSTER+8) uint32_t vn_active; /* # vnodes not on free lists */ uint32_t vn_alloc; /* # times vn_alloc called */ uint32_t vn_get; /* # times vn_get called */ @@ -122,7 +108,6 @@ struct __xfsstats { uint32_t vn_reclaim; /* # times vn_reclaim called */ uint32_t vn_remove; /* # times vn_remove called */ uint32_t vn_free; /* # times vn_free called */ -#define XFSSTAT_END_BUF (XFSSTAT_END_VNODE_OPS+9) uint32_t xb_get; uint32_t xb_create; uint32_t xb_get_locked; @@ -133,28 +118,19 @@ struct __xfsstats { uint32_t xb_page_found; uint32_t xb_get_read; /* Version 2 btree counters */ -#define XFSSTAT_END_ABTB_V2 (XFSSTAT_END_BUF + __XBTS_MAX) uint32_t xs_abtb_2[__XBTS_MAX]; -#define XFSSTAT_END_ABTC_V2 (XFSSTAT_END_ABTB_V2 + __XBTS_MAX) uint32_t xs_abtc_2[__XBTS_MAX]; -#define XFSSTAT_END_BMBT_V2 (XFSSTAT_END_ABTC_V2 + __XBTS_MAX) uint32_t xs_bmbt_2[__XBTS_MAX]; -#define XFSSTAT_END_IBT_V2 (XFSSTAT_END_BMBT_V2 + __XBTS_MAX) uint32_t xs_ibt_2[__XBTS_MAX]; -#define XFSSTAT_END_FIBT_V2 (XFSSTAT_END_IBT_V2 + __XBTS_MAX) uint32_t xs_fibt_2[__XBTS_MAX]; -#define XFSSTAT_END_RMAP_V2 (XFSSTAT_END_FIBT_V2 + __XBTS_MAX) uint32_t xs_rmap_2[__XBTS_MAX]; -#define XFSSTAT_END_REFCOUNT (XFSSTAT_END_RMAP_V2 + __XBTS_MAX) uint32_t xs_refcbt_2[__XBTS_MAX]; -#define XFSSTAT_END_XQMSTAT (XFSSTAT_END_REFCOUNT + 6) uint32_t xs_qm_dqreclaims; uint32_t xs_qm_dqreclaim_misses; uint32_t xs_qm_dquot_dups; uint32_t xs_qm_dqcachemisses; uint32_t xs_qm_dqcachehits; uint32_t xs_qm_dqwants; -#define XFSSTAT_END_QM (XFSSTAT_END_XQMSTAT+2) uint32_t xs_qm_dquot; uint32_t xs_qm_dquot_unused; /* Extra precision counters */ @@ -163,10 +139,12 @@ struct __xfsstats { uint64_t xs_read_bytes; }; +#define xfsstats_offset(f) (offsetof(struct __xfsstats, f)/sizeof(uint32_t)) + struct xfsstats { union { struct __xfsstats s; - uint32_t a[XFSSTAT_END_XQMSTAT]; + uint32_t a[xfsstats_offset(xs_qm_dquot)]; }; }; diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 207ee302b1bb..d3e6cd063688 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -43,6 +43,7 @@ #include <linux/dax.h> #include <linux/init.h> #include <linux/slab.h> +#include <linux/magic.h> #include <linux/mount.h> #include <linux/mempool.h> #include <linux/writeback.h> @@ -933,6 +934,32 @@ xfs_fs_alloc_inode( return NULL; } +#ifdef DEBUG +static void +xfs_check_delalloc( + struct xfs_inode *ip, + int whichfork) +{ + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_bmbt_irec got; + struct xfs_iext_cursor icur; + + if (!ifp || !xfs_iext_lookup_extent(ip, ifp, 0, &icur, &got)) + return; + do { + if (isnullstartblock(got.br_startblock)) { + xfs_warn(ip->i_mount, + "ino %llx %s fork has delalloc extent at [0x%llx:0x%llx]", + ip->i_ino, + whichfork == XFS_DATA_FORK ? "data" : "cow", + got.br_startoff, got.br_blockcount); + } + } while (xfs_iext_next_extent(ifp, &icur, &got)); +} +#else +#define xfs_check_delalloc(ip, whichfork) do { } while (0) +#endif + /* * Now that the generic code is guaranteed not to be accessing * the linux inode, we can inactivate and reclaim the inode. @@ -951,7 +978,12 @@ xfs_fs_destroy_inode( xfs_inactive(ip); - ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0); + if (!XFS_FORCED_SHUTDOWN(ip->i_mount) && ip->i_delayed_blks) { + xfs_check_delalloc(ip, XFS_DATA_FORK); + xfs_check_delalloc(ip, XFS_COW_FORK); + ASSERT(0); + } + XFS_STATS_INC(ip->i_mount, vn_reclaim); /* @@ -1097,7 +1129,7 @@ xfs_fs_statfs( xfs_extlen_t lsize; int64_t ffree; - statp->f_type = XFS_SB_MAGIC; + statp->f_type = XFS_SUPER_MAGIC; statp->f_namelen = MAXNAMELEN - 1; id = huge_encode_dev(mp->m_ddev_targp->bt_dev); @@ -1650,7 +1682,7 @@ xfs_fs_fill_super( * we must configure the block size in the superblock before we run the * full mount process as the mount process can lookup and cache inodes. */ - sb->s_magic = XFS_SB_MAGIC; + sb->s_magic = XFS_SUPER_MAGIC; sb->s_blocksize = mp->m_sb.sb_blocksize; sb->s_blocksize_bits = ffs(sb->s_blocksize) - 1; sb->s_maxbytes = xfs_max_file_offset(sb->s_blocksize_bits); diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index ad315e83bc02..3043e5ed6495 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -473,7 +473,6 @@ DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pin); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin_stale); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock); -DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock_stale); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_committed); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_push); DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf); diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index bedc5a5133a5..912b42f5fe4a 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -259,6 +259,14 @@ xfs_trans_alloc( struct xfs_trans *tp; int error; + /* + * Allocate the handle before we do our freeze accounting and setting up + * GFP_NOFS allocation context so that we avoid lockdep false positives + * by doing GFP_KERNEL allocations inside sb_start_intwrite(). + */ + tp = kmem_zone_zalloc(xfs_trans_zone, + (flags & XFS_TRANS_NOFS) ? KM_NOFS : KM_SLEEP); + if (!(flags & XFS_TRANS_NO_WRITECOUNT)) sb_start_intwrite(mp->m_super); @@ -270,8 +278,6 @@ xfs_trans_alloc( mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE); atomic_inc(&mp->m_active_trans); - tp = kmem_zone_zalloc(xfs_trans_zone, - (flags & XFS_TRANS_NOFS) ? KM_NOFS : KM_SLEEP); tp->t_magic = XFS_TRANS_HEADER_MAGIC; tp->t_flags = flags; tp->t_mountp = mp; diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index c3d278e96ad1..a0c5dbda18aa 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -220,6 +220,7 @@ void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *, uint); void xfs_trans_log_buf(struct xfs_trans *, struct xfs_buf *, uint, uint); void xfs_trans_dirty_buf(struct xfs_trans *, struct xfs_buf *); +bool xfs_trans_buf_is_dirty(struct xfs_buf *bp); void xfs_trans_log_inode(xfs_trans_t *, struct xfs_inode *, uint); void xfs_extent_free_init_defer_op(void); diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 55326f971cb3..d3a4e89bf4a0 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -531,17 +531,33 @@ xfsaild( set_current_state(TASK_INTERRUPTIBLE); /* - * Check kthread_should_stop() after we set the task state - * to guarantee that we either see the stop bit and exit or - * the task state is reset to runnable such that it's not - * scheduled out indefinitely and detects the stop bit at - * next iteration. - * + * Check kthread_should_stop() after we set the task state to + * guarantee that we either see the stop bit and exit or the + * task state is reset to runnable such that it's not scheduled + * out indefinitely and detects the stop bit at next iteration. * A memory barrier is included in above task state set to * serialize again kthread_stop(). */ if (kthread_should_stop()) { __set_current_state(TASK_RUNNING); + + /* + * The caller forces out the AIL before stopping the + * thread in the common case, which means the delwri + * queue is drained. In the shutdown case, the queue may + * still hold relogged buffers that haven't been + * submitted because they were pinned since added to the + * queue. + * + * Log I/O error processing stales the underlying buffer + * and clears the delwri state, expecting the buf to be + * removed on the next submission attempt. That won't + * happen if we're shutting down, so this is the last + * opportunity to release such buffers from the queue. + */ + ASSERT(list_empty(&ailp->ail_buf_list) || + XFS_FORCED_SHUTDOWN(ailp->ail_mount)); + xfs_buf_delwri_cancel(&ailp->ail_buf_list); break; } diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 15919f67a88f..629f1479c9d2 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -264,11 +264,39 @@ xfs_trans_read_buf_map( return -EIO; } + /* + * Check if the caller is trying to read a buffer that is + * already attached to the transaction yet has no buffer ops + * assigned. Ops are usually attached when the buffer is + * attached to the transaction, or by the read caller if + * special circumstances. That didn't happen, which is not + * how this is supposed to go. + * + * If the buffer passes verification we'll let this go, but if + * not we have to shut down. Let the transaction cleanup code + * release this buffer when it kills the tranaction. + */ + ASSERT(bp->b_ops != NULL); + error = xfs_buf_ensure_ops(bp, ops); + if (error) { + xfs_buf_ioerror_alert(bp, __func__); + + if (tp->t_flags & XFS_TRANS_DIRTY) + xfs_force_shutdown(tp->t_mountp, + SHUTDOWN_META_IO_ERROR); + + /* bad CRC means corrupted metadata */ + if (error == -EFSBADCRC) + error = -EFSCORRUPTED; + return error; + } + bip = bp->b_log_item; bip->bli_recur++; ASSERT(atomic_read(&bip->bli_refcount) > 0); trace_xfs_trans_read_buf_recur(bip); + ASSERT(bp->b_ops != NULL || ops == NULL); *bpp = bp; return 0; } @@ -316,55 +344,58 @@ xfs_trans_read_buf_map( _xfs_trans_bjoin(tp, bp, 1); trace_xfs_trans_read_buf(bp->b_log_item); } + ASSERT(bp->b_ops != NULL || ops == NULL); *bpp = bp; return 0; } +/* Has this buffer been dirtied by anyone? */ +bool +xfs_trans_buf_is_dirty( + struct xfs_buf *bp) +{ + struct xfs_buf_log_item *bip = bp->b_log_item; + + if (!bip) + return false; + ASSERT(bip->bli_item.li_type == XFS_LI_BUF); + return test_bit(XFS_LI_DIRTY, &bip->bli_item.li_flags); +} + /* - * Release the buffer bp which was previously acquired with one of the - * xfs_trans_... buffer allocation routines if the buffer has not - * been modified within this transaction. If the buffer is modified - * within this transaction, do decrement the recursion count but do - * not release the buffer even if the count goes to 0. If the buffer is not - * modified within the transaction, decrement the recursion count and - * release the buffer if the recursion count goes to 0. + * Release a buffer previously joined to the transaction. If the buffer is + * modified within this transaction, decrement the recursion count but do not + * release the buffer even if the count goes to 0. If the buffer is not modified + * within the transaction, decrement the recursion count and release the buffer + * if the recursion count goes to 0. * - * If the buffer is to be released and it was not modified before - * this transaction began, then free the buf_log_item associated with it. + * If the buffer is to be released and it was not already dirty before this + * transaction began, then also free the buf_log_item associated with it. * - * If the transaction pointer is NULL, make this just a normal - * brelse() call. + * If the transaction pointer is NULL, this is a normal xfs_buf_relse() call. */ void xfs_trans_brelse( - xfs_trans_t *tp, - xfs_buf_t *bp) + struct xfs_trans *tp, + struct xfs_buf *bp) { - struct xfs_buf_log_item *bip; - int freed; + struct xfs_buf_log_item *bip = bp->b_log_item; - /* - * Default to a normal brelse() call if the tp is NULL. - */ - if (tp == NULL) { - ASSERT(bp->b_transp == NULL); + ASSERT(bp->b_transp == tp); + + if (!tp) { xfs_buf_relse(bp); return; } - ASSERT(bp->b_transp == tp); - bip = bp->b_log_item; + trace_xfs_trans_brelse(bip); ASSERT(bip->bli_item.li_type == XFS_LI_BUF); - ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); - ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_CANCEL)); ASSERT(atomic_read(&bip->bli_refcount) > 0); - trace_xfs_trans_brelse(bip); - /* - * If the release is just for a recursive lock, - * then decrement the count and return. + * If the release is for a recursive lookup, then decrement the count + * and return. */ if (bip->bli_recur > 0) { bip->bli_recur--; @@ -372,64 +403,24 @@ xfs_trans_brelse( } /* - * If the buffer is dirty within this transaction, we can't + * If the buffer is invalidated or dirty in this transaction, we can't * release it until we commit. */ if (test_bit(XFS_LI_DIRTY, &bip->bli_item.li_flags)) return; - - /* - * If the buffer has been invalidated, then we can't release - * it until the transaction commits to disk unless it is re-dirtied - * as part of this transaction. This prevents us from pulling - * the item from the AIL before we should. - */ if (bip->bli_flags & XFS_BLI_STALE) return; - ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED)); - /* - * Free up the log item descriptor tracking the released item. + * Unlink the log item from the transaction and clear the hold flag, if + * set. We wouldn't want the next user of the buffer to get confused. */ + ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED)); xfs_trans_del_item(&bip->bli_item); + bip->bli_flags &= ~XFS_BLI_HOLD; - /* - * Clear the hold flag in the buf log item if it is set. - * We wouldn't want the next user of the buffer to - * get confused. - */ - if (bip->bli_flags & XFS_BLI_HOLD) { - bip->bli_flags &= ~XFS_BLI_HOLD; - } - - /* - * Drop our reference to the buf log item. - */ - freed = atomic_dec_and_test(&bip->bli_refcount); - - /* - * If the buf item is not tracking data in the log, then we must free it - * before releasing the buffer back to the free pool. - * - * If the fs has shutdown and we dropped the last reference, it may fall - * on us to release a (possibly dirty) bli if it never made it to the - * AIL (e.g., the aborted unpin already happened and didn't release it - * due to our reference). Since we're already shutdown and need - * ail_lock, just force remove from the AIL and release the bli here. - */ - if (XFS_FORCED_SHUTDOWN(tp->t_mountp) && freed) { - xfs_trans_ail_remove(&bip->bli_item, SHUTDOWN_LOG_IO_ERROR); - xfs_buf_item_relse(bp); - } else if (!(bip->bli_flags & XFS_BLI_DIRTY)) { -/*** - ASSERT(bp->b_pincount == 0); -***/ - ASSERT(atomic_read(&bip->bli_refcount) == 0); - ASSERT(!test_bit(XFS_LI_IN_AIL, &bip->bli_item.li_flags)); - ASSERT(!(bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF)); - xfs_buf_item_relse(bp); - } + /* drop the reference to the bli */ + xfs_buf_item_put(bip); bp->b_transp = NULL; xfs_buf_relse(bp); |