summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/fid.c9
-rw-r--r--fs/9p/vfs_addr.c37
-rw-r--r--fs/9p/vfs_inode.c2
-rw-r--r--fs/Kconfig8
-rw-r--r--fs/Kconfig.binfmt13
-rw-r--r--fs/Makefile2
-rw-r--r--fs/adfs/inode.c3
-rw-r--r--fs/adfs/super.c2
-rw-r--r--fs/affs/file.c6
-rw-r--r--fs/affs/super.c2
-rw-r--r--fs/afs/dir.c18
-rw-r--r--fs/afs/file.c28
-rw-r--r--fs/afs/internal.h6
-rw-r--r--fs/afs/super.c2
-rw-r--r--fs/afs/write.c19
-rw-r--r--fs/aio.c3
-rw-r--r--fs/befs/linuxvfs.c2
-rw-r--r--fs/bfs/file.c3
-rw-r--r--fs/bfs/inode.c2
-rw-r--r--fs/binfmt_elf.c180
-rw-r--r--fs/binfmt_elf_fdpic.c20
-rw-r--r--fs/binfmt_elf_test.c64
-rw-r--r--fs/binfmt_flat.c7
-rw-r--r--fs/binfmt_misc.c6
-rw-r--r--fs/btrfs/Makefile1
-rw-r--r--fs/btrfs/backref.c7
-rw-r--r--fs/btrfs/block-group.c75
-rw-r--r--fs/btrfs/block-group.h1
-rw-r--r--fs/btrfs/btrfs_inode.h42
-rw-r--r--fs/btrfs/compression.c63
-rw-r--r--fs/btrfs/compression.h10
-rw-r--r--fs/btrfs/ctree.c108
-rw-r--r--fs/btrfs/ctree.h104
-rw-r--r--fs/btrfs/delalloc-space.c18
-rw-r--r--fs/btrfs/dev-replace.c18
-rw-r--r--fs/btrfs/disk-io.c276
-rw-r--r--fs/btrfs/disk-io.h2
-rw-r--r--fs/btrfs/extent-io-tree.h4
-rw-r--r--fs/btrfs/extent-tree.c158
-rw-r--r--fs/btrfs/extent_io.c97
-rw-r--r--fs/btrfs/extent_map.c6
-rw-r--r--fs/btrfs/extent_map.h8
-rw-r--r--fs/btrfs/file-item.c76
-rw-r--r--fs/btrfs/file.c176
-rw-r--r--fs/btrfs/free-space-tree.c2
-rw-r--r--fs/btrfs/inode.c1301
-rw-r--r--fs/btrfs/ioctl.c577
-rw-r--r--fs/btrfs/lzo.c20
-rw-r--r--fs/btrfs/ordered-data.c132
-rw-r--r--fs/btrfs/ordered-data.h25
-rw-r--r--fs/btrfs/print-tree.c5
-rw-r--r--fs/btrfs/qgroup.c100
-rw-r--r--fs/btrfs/reflink.c43
-rw-r--r--fs/btrfs/relocation.c24
-rw-r--r--fs/btrfs/root-tree.c15
-rw-r--r--fs/btrfs/scrub.c2
-rw-r--r--fs/btrfs/send.c15
-rw-r--r--fs/btrfs/send.h2
-rw-r--r--fs/btrfs/space-info.c5
-rw-r--r--fs/btrfs/subpage.c2
-rw-r--r--fs/btrfs/super.c96
-rw-r--r--fs/btrfs/sysfs.c15
-rw-r--r--fs/btrfs/tests/extent-map-tests.c2
-rw-r--r--fs/btrfs/transaction.c120
-rw-r--r--fs/btrfs/transaction.h5
-rw-r--r--fs/btrfs/tree-checker.c68
-rw-r--r--fs/btrfs/tree-log.c1030
-rw-r--r--fs/btrfs/tree-log.h7
-rw-r--r--fs/btrfs/volumes.c147
-rw-r--r--fs/btrfs/volumes.h7
-rw-r--r--fs/btrfs/zoned.c167
-rw-r--r--fs/buffer.c117
-rw-r--r--fs/cachefiles/interface.c2
-rw-r--r--fs/cachefiles/io.c61
-rw-r--r--fs/cachefiles/xattr.c23
-rw-r--r--fs/ceph/addr.c345
-rw-r--r--fs/ceph/cache.h13
-rw-r--r--fs/ceph/caps.c16
-rw-r--r--fs/ceph/debugfs.c5
-rw-r--r--fs/ceph/dir.c17
-rw-r--r--fs/ceph/file.c83
-rw-r--r--fs/ceph/inode.c67
-rw-r--r--fs/ceph/locks.c8
-rw-r--r--fs/ceph/mds_client.c69
-rw-r--r--fs/ceph/mds_client.h15
-rw-r--r--fs/ceph/metric.c63
-rw-r--r--fs/ceph/metric.h63
-rw-r--r--fs/ceph/snap.c263
-rw-r--r--fs/ceph/strings.c1
-rw-r--r--fs/ceph/super.c8
-rw-r--r--fs/ceph/super.h10
-rw-r--r--fs/ceph/xattr.c13
-rw-r--r--fs/cifs/cifs_swn.c6
-rw-r--r--fs/cifs/cifsacl.c9
-rw-r--r--fs/cifs/cifsfs.c17
-rw-r--r--fs/cifs/cifsproto.h3
-rw-r--r--fs/cifs/connect.c99
-rw-r--r--fs/cifs/dfs_cache.c2
-rw-r--r--fs/cifs/file.c270
-rw-r--r--fs/cifs/fs_context.c4
-rw-r--r--fs/cifs/fscache.c126
-rw-r--r--fs/cifs/fscache.h79
-rw-r--r--fs/cifs/inode.c8
-rw-r--r--fs/cifs/ntlmssp.h2
-rw-r--r--fs/cifs/sess.c17
-rw-r--r--fs/cifs/smb1ops.c4
-rw-r--r--fs/cifs/smb2ops.c18
-rw-r--r--fs/cifs/transport.c5
-rw-r--r--fs/cifs/xattr.c2
-rw-r--r--fs/coda/file.c1
-rw-r--r--fs/coda/inode.c2
-rw-r--r--fs/compat_binfmt_elf.c2
-rw-r--r--fs/configfs/dir.c14
-rw-r--r--fs/coredump.c86
-rw-r--r--fs/crypto/crypto.c8
-rw-r--r--fs/crypto/inline_crypt.c93
-rw-r--r--fs/dax.c2
-rw-r--r--fs/dcache.c3
-rw-r--r--fs/direct-io.c3
-rw-r--r--fs/ecryptfs/mmap.c5
-rw-r--r--fs/ecryptfs/super.c2
-rw-r--r--fs/efs/super.c2
-rw-r--r--fs/erofs/data.c20
-rw-r--r--fs/erofs/dir.c21
-rw-r--r--fs/erofs/erofs_fs.h5
-rw-r--r--fs/erofs/inode.c4
-rw-r--r--fs/erofs/internal.h4
-rw-r--r--fs/erofs/namei.c54
-rw-r--r--fs/erofs/super.c40
-rw-r--r--fs/erofs/sysfs.c8
-rw-r--r--fs/erofs/zdata.c297
-rw-r--r--fs/erofs/zmap.c78
-rw-r--r--fs/exec.c38
-rw-r--r--fs/exfat/inode.c3
-rw-r--r--fs/exfat/super.c2
-rw-r--r--fs/ext2/ialloc.c5
-rw-r--r--fs/ext2/inode.c9
-rw-r--r--fs/ext2/super.c8
-rw-r--r--fs/ext4/acl.c8
-rw-r--r--fs/ext4/balloc.c1
-rw-r--r--fs/ext4/block_validity.c26
-rw-r--r--fs/ext4/ext4.h38
-rw-r--r--fs/ext4/ext4_jbd2.h2
-rw-r--r--fs/ext4/extents.c17
-rw-r--r--fs/ext4/fast_commit.c306
-rw-r--r--fs/ext4/fast_commit.h6
-rw-r--r--fs/ext4/file.c10
-rw-r--r--fs/ext4/hash.c2
-rw-r--r--fs/ext4/indirect.c2
-rw-r--r--fs/ext4/inline.c32
-rw-r--r--fs/ext4/inode.c173
-rw-r--r--fs/ext4/ioctl.c10
-rw-r--r--fs/ext4/mballoc.c371
-rw-r--r--fs/ext4/migrate.c2
-rw-r--r--fs/ext4/namei.c41
-rw-r--r--fs/ext4/orphan.c4
-rw-r--r--fs/ext4/page-io.c10
-rw-r--r--fs/ext4/resize.c7
-rw-r--r--fs/ext4/super.c115
-rw-r--r--fs/ext4/sysfs.c8
-rw-r--r--fs/ext4/xattr.c6
-rw-r--r--fs/f2fs/Kconfig7
-rw-r--r--fs/f2fs/acl.c21
-rw-r--r--fs/f2fs/checkpoint.c89
-rw-r--r--fs/f2fs/compress.c17
-rw-r--r--fs/f2fs/data.c144
-rw-r--r--fs/f2fs/debug.c25
-rw-r--r--fs/f2fs/dir.c22
-rw-r--r--fs/f2fs/f2fs.h173
-rw-r--r--fs/f2fs/file.c181
-rw-r--r--fs/f2fs/gc.c53
-rw-r--r--fs/f2fs/hash.c2
-rw-r--r--fs/f2fs/inline.c4
-rw-r--r--fs/f2fs/inode.c7
-rw-r--r--fs/f2fs/namei.c82
-rw-r--r--fs/f2fs/node.c121
-rw-r--r--fs/f2fs/node.h3
-rw-r--r--fs/f2fs/recovery.c39
-rw-r--r--fs/f2fs/segment.c81
-rw-r--r--fs/f2fs/segment.h5
-rw-r--r--fs/f2fs/super.c115
-rw-r--r--fs/f2fs/sysfs.c50
-rw-r--r--fs/f2fs/verity.c4
-rw-r--r--fs/f2fs/xattr.c12
-rw-r--r--fs/fat/dir.c2
-rw-r--r--fs/fat/inode.c5
-rw-r--r--fs/fcntl.c18
-rw-r--r--fs/file_table.c6
-rw-r--r--fs/freevxfs/vxfs_super.c2
-rw-r--r--fs/fs-writeback.c40
-rw-r--r--fs/fscache/io.c28
-rw-r--r--fs/fuse/control.c17
-rw-r--r--fs/fuse/dax.c3
-rw-r--r--fs/fuse/dev.c20
-rw-r--r--fs/fuse/dir.c2
-rw-r--r--fs/fuse/file.c34
-rw-r--r--fs/fuse/fuse_i.h1
-rw-r--r--fs/fuse/inode.c5
-rw-r--r--fs/fuse/ioctl.c9
-rw-r--r--fs/fuse/virtio_fs.c1
-rw-r--r--fs/gfs2/aops.c43
-rw-r--r--fs/gfs2/file.c7
-rw-r--r--fs/gfs2/glock.c3
-rw-r--r--fs/gfs2/lops.c1
-rw-r--r--fs/gfs2/meta_io.c6
-rw-r--r--fs/gfs2/super.c2
-rw-r--r--fs/hfs/inode.c6
-rw-r--r--fs/hfs/super.c2
-rw-r--r--fs/hfsplus/inode.c6
-rw-r--r--fs/hfsplus/super.c2
-rw-r--r--fs/hostfs/hostfs_kern.c5
-rw-r--r--fs/hpfs/file.c3
-rw-r--r--fs/hpfs/super.c2
-rw-r--r--fs/hugetlbfs/inode.c4
-rw-r--r--fs/inode.c2
-rw-r--r--fs/internal.h9
-rw-r--r--fs/io-wq.c114
-rw-r--r--fs/io_uring.c1302
-rw-r--r--fs/ioctl.c4
-rw-r--r--fs/iomap/buffered-io.c100
-rw-r--r--fs/iomap/direct-io.c7
-rw-r--r--fs/iomap/fiemap.c1
-rw-r--r--fs/iomap/trace.h2
-rw-r--r--fs/isofs/inode.c2
-rw-r--r--fs/jbd2/commit.c21
-rw-r--r--fs/jbd2/journal.c8
-rw-r--r--fs/jbd2/transaction.c126
-rw-r--r--fs/jffs2/super.c2
-rw-r--r--fs/jfs/inode.c3
-rw-r--r--fs/jfs/jfs_metapage.c14
-rw-r--r--fs/jfs/super.c2
-rw-r--r--fs/ksmbd/auth.c27
-rw-r--r--fs/ksmbd/ksmbd_netlink.h2
-rw-r--r--fs/ksmbd/ntlmssp.h6
-rw-r--r--fs/ksmbd/smb2pdu.c45
-rw-r--r--fs/ksmbd/smb2pdu.h8
-rw-r--r--fs/ksmbd/smb_common.c5
-rw-r--r--fs/ksmbd/transport_rdma.c4
-rw-r--r--fs/ksmbd/vfs.h1
-rw-r--r--fs/ksmbd/xattr.h2
-rw-r--r--fs/libfs.c25
-rw-r--r--fs/lockd/svc.c24
-rw-r--r--fs/lockd/svcsubs.c18
-rw-r--r--fs/minix/inode.c8
-rw-r--r--fs/mpage.c3
-rw-r--r--fs/namespace.c200
-rw-r--r--fs/nfs/callback.c66
-rw-r--r--fs/nfs/client.c2
-rw-r--r--fs/nfs/dir.c28
-rw-r--r--fs/nfs/file.c32
-rw-r--r--fs/nfs/inode.c11
-rw-r--r--fs/nfs/nfs4proc.c6
-rw-r--r--fs/nfs/nfs4state.c1
-rw-r--r--fs/nfs/write.c22
-rw-r--r--fs/nfsd/Kconfig12
-rw-r--r--fs/nfsd/Makefile3
-rw-r--r--fs/nfsd/filecache.c7
-rw-r--r--fs/nfsd/flexfilelayout.c2
-rw-r--r--fs/nfsd/nfs3proc.c19
-rw-r--r--fs/nfsd/nfs3xdr.c4
-rw-r--r--fs/nfsd/nfs4layouts.c2
-rw-r--r--fs/nfsd/nfs4proc.c13
-rw-r--r--fs/nfsd/nfs4state.c24
-rw-r--r--fs/nfsd/nfs4xdr.c20
-rw-r--r--fs/nfsd/nfscache.c33
-rw-r--r--fs/nfsd/nfsctl.c10
-rw-r--r--fs/nfsd/nfsd.h2
-rw-r--r--fs/nfsd/nfsfh.c4
-rw-r--r--fs/nfsd/nfsfh.h20
-rw-r--r--fs/nfsd/nfsproc.c2
-rw-r--r--fs/nfsd/nfssvc.c25
-rw-r--r--fs/nfsd/trace.h121
-rw-r--r--fs/nfsd/vfs.c67
-rw-r--r--fs/nfsd/vfs.h6
-rw-r--r--fs/nfsd/xdr.h2
-rw-r--r--fs/nilfs2/inode.c40
-rw-r--r--fs/nilfs2/mdt.c3
-rw-r--r--fs/nilfs2/segbuf.c16
-rw-r--r--fs/nilfs2/super.c2
-rw-r--r--fs/notify/fanotify/fanotify_user.c53
-rw-r--r--fs/notify/fsnotify.c14
-rw-r--r--fs/notify/mark.c4
-rw-r--r--fs/ntfs/aops.c21
-rw-r--r--fs/ntfs/inode.c6
-rw-r--r--fs/ntfs3/inode.c2
-rw-r--r--fs/ntfs3/super.c2
-rw-r--r--fs/ocfs2/alloc.c2
-rw-r--r--fs/ocfs2/aops.c6
-rw-r--r--fs/ocfs2/cluster/nodemanager.c2
-rw-r--r--fs/ocfs2/dir.c4
-rw-r--r--fs/ocfs2/dlmfs/dlmfs.c2
-rw-r--r--fs/ocfs2/file.c13
-rw-r--r--fs/ocfs2/inode.c2
-rw-r--r--fs/ocfs2/localalloc.c6
-rw-r--r--fs/ocfs2/namei.c2
-rw-r--r--fs/ocfs2/ocfs2.h4
-rw-r--r--fs/ocfs2/quota_global.c2
-rw-r--r--fs/ocfs2/stack_user.c18
-rw-r--r--fs/ocfs2/super.c24
-rw-r--r--fs/ocfs2/xattr.c2
-rw-r--r--fs/omfs/file.c3
-rw-r--r--fs/open.c1
-rw-r--r--fs/openpromfs/inode.c2
-rw-r--r--fs/orangefs/inode.c121
-rw-r--r--fs/orangefs/super.c2
-rw-r--r--fs/overlayfs/copy_up.c16
-rw-r--r--fs/overlayfs/super.c2
-rw-r--r--fs/pipe.c24
-rw-r--r--fs/proc/base.c8
-rw-r--r--fs/proc/inode.c2
-rw-r--r--fs/proc/page.c1
-rw-r--r--fs/proc/task_mmu.c49
-rw-r--r--fs/proc/vmcore.c43
-rw-r--r--fs/pstore/platform.c38
-rw-r--r--fs/pstore/ram_core.c4
-rw-r--r--fs/qnx4/inode.c2
-rw-r--r--fs/qnx6/inode.c2
-rw-r--r--fs/quota/dquot.c11
-rw-r--r--fs/read_write.c34
-rw-r--r--fs/reiserfs/Kconfig10
-rw-r--r--fs/reiserfs/inode.c56
-rw-r--r--fs/reiserfs/journal.c4
-rw-r--r--fs/reiserfs/super.c4
-rw-r--r--fs/remap_range.c23
-rw-r--r--fs/romfs/super.c2
-rw-r--r--fs/splice.c24
-rw-r--r--fs/squashfs/super.c2
-rw-r--r--fs/stat.c49
-rw-r--r--fs/super.c19
-rw-r--r--fs/sync.c18
-rw-r--r--fs/sysv/inode.c2
-rw-r--r--fs/sysv/itree.c3
-rw-r--r--fs/tracefs/inode.c5
-rw-r--r--fs/ubifs/file.c34
-rw-r--r--fs/ubifs/super.c2
-rw-r--r--fs/udf/file.c3
-rw-r--r--fs/udf/inode.c3
-rw-r--r--fs/udf/super.c5
-rw-r--r--fs/ufs/inode.c3
-rw-r--r--fs/ufs/super.c2
-rw-r--r--fs/unicode/Kconfig18
-rw-r--r--fs/unicode/Makefile6
-rw-r--r--fs/userfaultfd.c11
-rw-r--r--fs/vboxsf/file.c2
-rw-r--r--fs/vboxsf/super.c2
-rw-r--r--fs/vboxsf/utils.c1
-rw-r--r--fs/xfs/libxfs/xfs_btree.c2
-rw-r--r--fs/xfs/libxfs/xfs_dir2.c36
-rw-r--r--fs/xfs/libxfs/xfs_dir2.h8
-rw-r--r--fs/xfs/libxfs/xfs_dir2_priv.h5
-rw-r--r--fs/xfs/scrub/attr.h2
-rw-r--r--fs/xfs/xfs_aops.c23
-rw-r--r--fs/xfs/xfs_bmap_item.c2
-rw-r--r--fs/xfs/xfs_bmap_util.c9
-rw-r--r--fs/xfs/xfs_buf.c48
-rw-r--r--fs/xfs/xfs_buf_item.c5
-rw-r--r--fs/xfs/xfs_extfree_item.c2
-rw-r--r--fs/xfs/xfs_file.c86
-rw-r--r--fs/xfs/xfs_fsmap.c4
-rw-r--r--fs/xfs/xfs_icache.c12
-rw-r--r--fs/xfs/xfs_inode.c100
-rw-r--r--fs/xfs/xfs_inode.h11
-rw-r--r--fs/xfs/xfs_inode_item.c12
-rw-r--r--fs/xfs/xfs_ioctl.c4
-rw-r--r--fs/xfs/xfs_iops.c118
-rw-r--r--fs/xfs/xfs_log.c5
-rw-r--r--fs/xfs/xfs_log_cil.c24
-rw-r--r--fs/xfs/xfs_pnfs.c45
-rw-r--r--fs/xfs/xfs_qm.c8
-rw-r--r--fs/xfs/xfs_refcount_item.c2
-rw-r--r--fs/xfs/xfs_reflink.c5
-rw-r--r--fs/xfs/xfs_rmap_item.c2
-rw-r--r--fs/xfs/xfs_super.c13
-rw-r--r--fs/xfs/xfs_trace.h8
-rw-r--r--fs/xfs/xfs_trans.c90
-rw-r--r--fs/xfs/xfs_trans.h6
-rw-r--r--fs/xfs/xfs_trans_ail.c47
-rw-r--r--fs/xfs/xfs_trans_priv.h3
-rw-r--r--fs/zonefs/super.c7
379 files changed, 9932 insertions, 5534 deletions
diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index 6aab046c98e2..79df61fe0e59 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -96,12 +96,8 @@ static struct p9_fid *v9fs_fid_find(struct dentry *dentry, kuid_t uid, int any)
dentry, dentry, from_kuid(&init_user_ns, uid),
any);
ret = NULL;
-
- if (d_inode(dentry))
- ret = v9fs_fid_find_inode(d_inode(dentry), uid);
-
/* we'll recheck under lock if there's anything to look in */
- if (!ret && dentry->d_fsdata) {
+ if (dentry->d_fsdata) {
struct hlist_head *h = (struct hlist_head *)&dentry->d_fsdata;
spin_lock(&dentry->d_lock);
@@ -113,6 +109,9 @@ static struct p9_fid *v9fs_fid_find(struct dentry *dentry, kuid_t uid, int any)
}
}
spin_unlock(&dentry->d_lock);
+ } else {
+ if (dentry->d_inode)
+ ret = v9fs_fid_find_inode(dentry->d_inode, uid);
}
return ret;
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index 9a10e68c5f30..76956c9d2af9 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -158,18 +158,9 @@ static int v9fs_release_page(struct page *page, gfp_t gfp)
return 1;
}
-/**
- * v9fs_invalidate_page - Invalidate a page completely or partially
- * @page: The page to be invalidated
- * @offset: offset of the invalidated region
- * @length: length of the invalidated region
- */
-
-static void v9fs_invalidate_page(struct page *page, unsigned int offset,
- unsigned int length)
+static void v9fs_invalidate_folio(struct folio *folio, size_t offset,
+ size_t length)
{
- struct folio *folio = page_folio(page);
-
folio_wait_fscache(folio);
}
@@ -249,16 +240,8 @@ static int v9fs_vfs_writepage(struct page *page, struct writeback_control *wbc)
return retval;
}
-/**
- * v9fs_launder_page - Writeback a dirty page
- * @page: The page to be cleaned up
- *
- * Returns 0 on success.
- */
-
-static int v9fs_launder_page(struct page *page)
+static int v9fs_launder_folio(struct folio *folio)
{
- struct folio *folio = page_folio(page);
int retval;
if (folio_clear_dirty_for_io(folio)) {
@@ -376,25 +359,25 @@ out:
* Mark a page as having been made dirty and thus needing writeback. We also
* need to pin the cache object to write back to.
*/
-static int v9fs_set_page_dirty(struct page *page)
+static bool v9fs_dirty_folio(struct address_space *mapping, struct folio *folio)
{
- struct v9fs_inode *v9inode = V9FS_I(page->mapping->host);
+ struct v9fs_inode *v9inode = V9FS_I(mapping->host);
- return fscache_set_page_dirty(page, v9fs_inode_cookie(v9inode));
+ return fscache_dirty_folio(mapping, folio, v9fs_inode_cookie(v9inode));
}
#else
-#define v9fs_set_page_dirty __set_page_dirty_nobuffers
+#define v9fs_dirty_folio filemap_dirty_folio
#endif
const struct address_space_operations v9fs_addr_operations = {
.readpage = v9fs_vfs_readpage,
.readahead = v9fs_vfs_readahead,
- .set_page_dirty = v9fs_set_page_dirty,
+ .dirty_folio = v9fs_dirty_folio,
.writepage = v9fs_vfs_writepage,
.write_begin = v9fs_write_begin,
.write_end = v9fs_write_end,
.releasepage = v9fs_release_page,
- .invalidatepage = v9fs_invalidate_page,
- .launder_page = v9fs_launder_page,
+ .invalidate_folio = v9fs_invalidate_folio,
+ .launder_folio = v9fs_launder_folio,
.direct_IO = v9fs_direct_IO,
};
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 2a10242c79c7..84c3cf7dffa5 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -228,7 +228,7 @@ struct inode *v9fs_alloc_inode(struct super_block *sb)
{
struct v9fs_inode *v9inode;
- v9inode = kmem_cache_alloc(v9fs_inode_cache, GFP_KERNEL);
+ v9inode = alloc_inode_sb(sb, v9fs_inode_cache, GFP_KERNEL);
if (!v9inode)
return NULL;
#ifdef CONFIG_9P_FSCACHE
diff --git a/fs/Kconfig b/fs/Kconfig
index 7a2b11c0b803..30b751c7f11a 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -48,7 +48,7 @@ config FS_DAX
bool "File system based Direct Access (DAX) support"
depends on MMU
depends on !(ARM || MIPS || SPARC)
- select DEV_PAGEMAP_OPS if (ZONE_DEVICE && !FS_DAX_LIMITED)
+ depends on ZONE_DEVICE || FS_DAX_LIMITED
select FS_IOMAP
select DAX
help
@@ -344,7 +344,7 @@ config LOCKD
config LOCKD_V4
bool
- depends on NFSD_V3 || NFS_V3
+ depends on NFSD || NFS_V3
depends on FILE_LOCKING
default y
@@ -369,8 +369,8 @@ source "fs/ksmbd/Kconfig"
config SMBFS_COMMON
tristate
- default y if CIFS=y
- default m if CIFS=m
+ default y if CIFS=y || SMB_SERVER=y
+ default m if CIFS=m || SMB_SERVER=m
source "fs/coda/Kconfig"
source "fs/afs/Kconfig"
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index 4d5ae61580aa..21c6332fa785 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -28,6 +28,16 @@ config BINFMT_ELF
ld.so (check the file <file:Documentation/Changes> for location and
latest version).
+config BINFMT_ELF_KUNIT_TEST
+ bool "Build KUnit tests for ELF binary support" if !KUNIT_ALL_TESTS
+ depends on KUNIT=y && BINFMT_ELF=y
+ default KUNIT_ALL_TESTS
+ help
+ This builds the ELF loader KUnit tests, which try to gather
+ prior bug fixes into a regression test collection. This is really
+ only needed for debugging. Note that with CONFIG_COMPAT=y, the
+ compat_binfmt_elf KUnit test is also created.
+
config COMPAT_BINFMT_ELF
def_bool y
depends on COMPAT && BINFMT_ELF
@@ -36,6 +46,9 @@ config COMPAT_BINFMT_ELF
config ARCH_BINFMT_ELF_STATE
bool
+config ARCH_BINFMT_ELF_EXTRA_PHDRS
+ bool
+
config ARCH_HAVE_ELF_PROT
bool
diff --git a/fs/Makefile b/fs/Makefile
index dab324aea08f..208a74e0b00e 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -96,7 +96,7 @@ obj-$(CONFIG_EXPORTFS) += exportfs/
obj-$(CONFIG_NFSD) += nfsd/
obj-$(CONFIG_LOCKD) += lockd/
obj-$(CONFIG_NLS) += nls/
-obj-$(CONFIG_UNICODE) += unicode/
+obj-y += unicode/
obj-$(CONFIG_SYSV_FS) += sysv/
obj-$(CONFIG_SMBFS_COMMON) += smbfs_common/
obj-$(CONFIG_CIFS) += cifs/
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index 5156821bfe6a..561bc748c04a 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -73,7 +73,8 @@ static sector_t _adfs_bmap(struct address_space *mapping, sector_t block)
}
static const struct address_space_operations adfs_aops = {
- .set_page_dirty = __set_page_dirty_buffers,
+ .dirty_folio = block_dirty_folio,
+ .invalidate_folio = block_invalidate_folio,
.readpage = adfs_readpage,
.writepage = adfs_writepage,
.write_begin = adfs_write_begin,
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index bdbd26e571ed..e8bfc38239cd 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -220,7 +220,7 @@ static struct kmem_cache *adfs_inode_cachep;
static struct inode *adfs_alloc_inode(struct super_block *sb)
{
struct adfs_inode_info *ei;
- ei = kmem_cache_alloc(adfs_inode_cachep, GFP_KERNEL);
+ ei = alloc_inode_sb(sb, adfs_inode_cachep, GFP_KERNEL);
if (!ei)
return NULL;
return &ei->vfs_inode;
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 75ebd2b576ca..b3f81d84ff4c 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -453,7 +453,8 @@ static sector_t _affs_bmap(struct address_space *mapping, sector_t block)
}
const struct address_space_operations affs_aops = {
- .set_page_dirty = __set_page_dirty_buffers,
+ .dirty_folio = block_dirty_folio,
+ .invalidate_folio = block_invalidate_folio,
.readpage = affs_readpage,
.writepage = affs_writepage,
.write_begin = affs_write_begin,
@@ -834,7 +835,8 @@ err_bh:
}
const struct address_space_operations affs_aops_ofs = {
- .set_page_dirty = __set_page_dirty_buffers,
+ .dirty_folio = block_dirty_folio,
+ .invalidate_folio = block_invalidate_folio,
.readpage = affs_readpage_ofs,
//.writepage = affs_writepage_ofs,
.write_begin = affs_write_begin_ofs,
diff --git a/fs/affs/super.c b/fs/affs/super.c
index c609005a9eaa..4c5f30a83336 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -100,7 +100,7 @@ static struct inode *affs_alloc_inode(struct super_block *sb)
{
struct affs_inode_info *i;
- i = kmem_cache_alloc(affs_inode_cachep, GFP_KERNEL);
+ i = alloc_inode_sb(sb, affs_inode_cachep, GFP_KERNEL);
if (!i)
return NULL;
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index da9b4f8577a1..932e61e28e5d 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -42,10 +42,11 @@ static int afs_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
struct dentry *old_dentry, struct inode *new_dir,
struct dentry *new_dentry, unsigned int flags);
static int afs_dir_releasepage(struct page *page, gfp_t gfp_flags);
-static void afs_dir_invalidatepage(struct page *page, unsigned int offset,
- unsigned int length);
+static void afs_dir_invalidate_folio(struct folio *folio, size_t offset,
+ size_t length);
-static int afs_dir_set_page_dirty(struct page *page)
+static bool afs_dir_dirty_folio(struct address_space *mapping,
+ struct folio *folio)
{
BUG(); /* This should never happen. */
}
@@ -73,9 +74,9 @@ const struct inode_operations afs_dir_inode_operations = {
};
const struct address_space_operations afs_dir_aops = {
- .set_page_dirty = afs_dir_set_page_dirty,
+ .dirty_folio = afs_dir_dirty_folio,
.releasepage = afs_dir_releasepage,
- .invalidatepage = afs_dir_invalidatepage,
+ .invalidate_folio = afs_dir_invalidate_folio,
};
const struct dentry_operations afs_fs_dentry_operations = {
@@ -2019,13 +2020,12 @@ static int afs_dir_releasepage(struct page *subpage, gfp_t gfp_flags)
/*
* Invalidate part or all of a folio.
*/
-static void afs_dir_invalidatepage(struct page *subpage, unsigned int offset,
- unsigned int length)
+static void afs_dir_invalidate_folio(struct folio *folio, size_t offset,
+ size_t length)
{
- struct folio *folio = page_folio(subpage);
struct afs_vnode *dvnode = AFS_FS_I(folio_inode(folio));
- _enter("{%lu},%u,%u", folio_index(folio), offset, length);
+ _enter("{%lu},%zu,%zu", folio->index, offset, length);
BUG_ON(!folio_test_locked(folio));
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 720818a7c166..0f9fdb284a20 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -21,8 +21,8 @@
static int afs_file_mmap(struct file *file, struct vm_area_struct *vma);
static int afs_readpage(struct file *file, struct page *page);
static int afs_symlink_readpage(struct file *file, struct page *page);
-static void afs_invalidatepage(struct page *page, unsigned int offset,
- unsigned int length);
+static void afs_invalidate_folio(struct folio *folio, size_t offset,
+ size_t length);
static int afs_releasepage(struct page *page, gfp_t gfp_flags);
static void afs_readahead(struct readahead_control *ractl);
@@ -54,10 +54,10 @@ const struct inode_operations afs_file_inode_operations = {
const struct address_space_operations afs_file_aops = {
.readpage = afs_readpage,
.readahead = afs_readahead,
- .set_page_dirty = afs_set_page_dirty,
- .launder_page = afs_launder_page,
+ .dirty_folio = afs_dirty_folio,
+ .launder_folio = afs_launder_folio,
.releasepage = afs_releasepage,
- .invalidatepage = afs_invalidatepage,
+ .invalidate_folio = afs_invalidate_folio,
.write_begin = afs_write_begin,
.write_end = afs_write_end,
.writepage = afs_writepage,
@@ -67,7 +67,7 @@ const struct address_space_operations afs_file_aops = {
const struct address_space_operations afs_symlink_aops = {
.readpage = afs_symlink_readpage,
.releasepage = afs_releasepage,
- .invalidatepage = afs_invalidatepage,
+ .invalidate_folio = afs_invalidate_folio,
};
static const struct vm_operations_struct afs_vm_ops = {
@@ -427,8 +427,8 @@ int afs_write_inode(struct inode *inode, struct writeback_control *wbc)
* Adjust the dirty region of the page on truncation or full invalidation,
* getting rid of the markers altogether if the region is entirely invalidated.
*/
-static void afs_invalidate_dirty(struct folio *folio, unsigned int offset,
- unsigned int length)
+static void afs_invalidate_dirty(struct folio *folio, size_t offset,
+ size_t length)
{
struct afs_vnode *vnode = AFS_FS_I(folio_inode(folio));
unsigned long priv;
@@ -485,16 +485,14 @@ full_invalidate:
* - release a page and clean up its private data if offset is 0 (indicating
* the entire page)
*/
-static void afs_invalidatepage(struct page *page, unsigned int offset,
- unsigned int length)
+static void afs_invalidate_folio(struct folio *folio, size_t offset,
+ size_t length)
{
- struct folio *folio = page_folio(page);
-
- _enter("{%lu},%u,%u", folio_index(folio), offset, length);
+ _enter("{%lu},%zu,%zu", folio->index, offset, length);
- BUG_ON(!PageLocked(page));
+ BUG_ON(!folio_test_locked(folio));
- if (PagePrivate(page))
+ if (folio_get_private(folio))
afs_invalidate_dirty(folio, offset, length);
folio_wait_fscache(folio);
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index b6f02321fc09..dc5032e10244 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -1521,9 +1521,9 @@ extern int afs_check_volume_status(struct afs_volume *, struct afs_operation *);
* write.c
*/
#ifdef CONFIG_AFS_FSCACHE
-extern int afs_set_page_dirty(struct page *);
+bool afs_dirty_folio(struct address_space *, struct folio *);
#else
-#define afs_set_page_dirty __set_page_dirty_nobuffers
+#define afs_dirty_folio filemap_dirty_folio
#endif
extern int afs_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
@@ -1537,7 +1537,7 @@ extern ssize_t afs_file_write(struct kiocb *, struct iov_iter *);
extern int afs_fsync(struct file *, loff_t, loff_t, int);
extern vm_fault_t afs_page_mkwrite(struct vm_fault *vmf);
extern void afs_prune_wb_keys(struct afs_vnode *);
-extern int afs_launder_page(struct page *);
+int afs_launder_folio(struct folio *);
/*
* xattr.c
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 5ec9fd97eccc..7592c0f469f1 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -679,7 +679,7 @@ static struct inode *afs_alloc_inode(struct super_block *sb)
{
struct afs_vnode *vnode;
- vnode = kmem_cache_alloc(afs_inode_cachep, GFP_KERNEL);
+ vnode = alloc_inode_sb(sb, afs_inode_cachep, GFP_KERNEL);
if (!vnode)
return NULL;
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 5e9157d0da29..e1c17081d18e 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -22,9 +22,10 @@ static void afs_write_to_cache(struct afs_vnode *vnode, loff_t start, size_t len
* Mark a page as having been made dirty and thus needing writeback. We also
* need to pin the cache object to write back to.
*/
-int afs_set_page_dirty(struct page *page)
+bool afs_dirty_folio(struct address_space *mapping, struct folio *folio)
{
- return fscache_set_page_dirty(page, afs_vnode_cache(AFS_FS_I(page->mapping->host)));
+ return fscache_dirty_folio(mapping, folio,
+ afs_vnode_cache(AFS_FS_I(mapping->host)));
}
static void afs_folio_start_fscache(bool caching, struct folio *folio)
{
@@ -703,7 +704,7 @@ static int afs_writepages_region(struct address_space *mapping,
struct folio *folio;
struct page *head_page;
ssize_t ret;
- int n;
+ int n, skips = 0;
_enter("%llx,%llx,", start, end);
@@ -754,8 +755,15 @@ static int afs_writepages_region(struct address_space *mapping,
#ifdef CONFIG_AFS_FSCACHE
folio_wait_fscache(folio);
#endif
+ } else {
+ start += folio_size(folio);
}
folio_put(folio);
+ if (wbc->sync_mode == WB_SYNC_NONE) {
+ if (skips >= 5 || need_resched())
+ break;
+ skips++;
+ }
continue;
}
@@ -972,9 +980,8 @@ void afs_prune_wb_keys(struct afs_vnode *vnode)
/*
* Clean up a page during invalidation.
*/
-int afs_launder_page(struct page *subpage)
+int afs_launder_folio(struct folio *folio)
{
- struct folio *folio = page_folio(subpage);
struct afs_vnode *vnode = AFS_FS_I(folio_inode(folio));
struct iov_iter iter;
struct bio_vec bv[1];
@@ -982,7 +989,7 @@ int afs_launder_page(struct page *subpage)
unsigned int f, t;
int ret = 0;
- _enter("{%lx}", folio_index(folio));
+ _enter("{%lx}", folio->index);
priv = (unsigned long)folio_get_private(folio);
if (folio_clear_dirty_for_io(folio)) {
diff --git a/fs/aio.c b/fs/aio.c
index 4ceba13a7db0..7b66b93d5bfa 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -478,7 +478,7 @@ out:
#endif
static const struct address_space_operations aio_ctx_aops = {
- .set_page_dirty = __set_page_dirty_no_writeback,
+ .dirty_folio = noop_dirty_folio,
#if IS_ENABLED(CONFIG_MIGRATION)
.migratepage = aio_migratepage,
#endif
@@ -1478,7 +1478,6 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb)
req->ki_flags = iocb_flags(req->ki_filp);
if (iocb->aio_flags & IOCB_FLAG_RESFD)
req->ki_flags |= IOCB_EVENTFD;
- req->ki_hint = ki_hint_validate(file_write_hint(req->ki_filp));
if (iocb->aio_flags & IOCB_FLAG_IOPRIO) {
/*
* If the IOCB_FLAG_IOPRIO flag of aio_flags is set, then
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index c1ba13d19024..b4b3567ac655 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -277,7 +277,7 @@ befs_alloc_inode(struct super_block *sb)
{
struct befs_inode_info *bi;
- bi = kmem_cache_alloc(befs_inode_cachep, GFP_KERNEL);
+ bi = alloc_inode_sb(sb, befs_inode_cachep, GFP_KERNEL);
if (!bi)
return NULL;
return &bi->vfs_inode;
diff --git a/fs/bfs/file.c b/fs/bfs/file.c
index 7f8544abf636..03139344568f 100644
--- a/fs/bfs/file.c
+++ b/fs/bfs/file.c
@@ -188,7 +188,8 @@ static sector_t bfs_bmap(struct address_space *mapping, sector_t block)
}
const struct address_space_operations bfs_aops = {
- .set_page_dirty = __set_page_dirty_buffers,
+ .dirty_folio = block_dirty_folio,
+ .invalidate_folio = block_invalidate_folio,
.readpage = bfs_readpage,
.writepage = bfs_writepage,
.write_begin = bfs_write_begin,
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index fd691e4815c5..1926bec2c850 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -239,7 +239,7 @@ static struct kmem_cache *bfs_inode_cachep;
static struct inode *bfs_alloc_inode(struct super_block *sb)
{
struct bfs_inode_info *bi;
- bi = kmem_cache_alloc(bfs_inode_cachep, GFP_KERNEL);
+ bi = alloc_inode_sb(sb, bfs_inode_cachep, GFP_KERNEL);
if (!bi)
return NULL;
return &bi->vfs_inode;
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 605017eb9349..6556e13ed95f 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -93,7 +93,7 @@ static int elf_core_dump(struct coredump_params *cprm);
#define ELF_CORE_EFLAGS 0
#endif
-#define ELF_PAGESTART(_v) ((_v) & ~(unsigned long)(ELF_MIN_ALIGN-1))
+#define ELF_PAGESTART(_v) ((_v) & ~(int)(ELF_MIN_ALIGN-1))
#define ELF_PAGEOFFSET(_v) ((_v) & (ELF_MIN_ALIGN-1))
#define ELF_PAGEALIGN(_v) (((_v) + ELF_MIN_ALIGN - 1) & ~(ELF_MIN_ALIGN - 1))
@@ -101,8 +101,10 @@ static struct linux_binfmt elf_format = {
.module = THIS_MODULE,
.load_binary = load_elf_binary,
.load_shlib = load_elf_library,
+#ifdef CONFIG_COREDUMP
.core_dump = elf_core_dump,
.min_coredump = ELF_EXEC_PAGESIZE,
+#endif
};
#define BAD_ADDR(x) (unlikely((unsigned long)(x) >= TASK_SIZE))
@@ -170,8 +172,8 @@ static int padzero(unsigned long elf_bss)
static int
create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec,
- unsigned long load_addr, unsigned long interp_load_addr,
- unsigned long e_entry)
+ unsigned long interp_load_addr,
+ unsigned long e_entry, unsigned long phdr_addr)
{
struct mm_struct *mm = current->mm;
unsigned long p = bprm->p;
@@ -257,7 +259,7 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec,
NEW_AUX_ENT(AT_HWCAP, ELF_HWCAP);
NEW_AUX_ENT(AT_PAGESZ, ELF_EXEC_PAGESIZE);
NEW_AUX_ENT(AT_CLKTCK, CLOCKS_PER_SEC);
- NEW_AUX_ENT(AT_PHDR, load_addr + exec->e_phoff);
+ NEW_AUX_ENT(AT_PHDR, phdr_addr);
NEW_AUX_ENT(AT_PHENT, sizeof(struct elf_phdr));
NEW_AUX_ENT(AT_PHNUM, exec->e_phnum);
NEW_AUX_ENT(AT_BASE, interp_load_addr);
@@ -399,22 +401,21 @@ static unsigned long elf_map(struct file *filep, unsigned long addr,
return(map_addr);
}
-static unsigned long total_mapping_size(const struct elf_phdr *cmds, int nr)
+static unsigned long total_mapping_size(const struct elf_phdr *phdr, int nr)
{
- int i, first_idx = -1, last_idx = -1;
+ elf_addr_t min_addr = -1;
+ elf_addr_t max_addr = 0;
+ bool pt_load = false;
+ int i;
for (i = 0; i < nr; i++) {
- if (cmds[i].p_type == PT_LOAD) {
- last_idx = i;
- if (first_idx == -1)
- first_idx = i;
+ if (phdr[i].p_type == PT_LOAD) {
+ min_addr = min(min_addr, ELF_PAGESTART(phdr[i].p_vaddr));
+ max_addr = max(max_addr, phdr[i].p_vaddr + phdr[i].p_memsz);
+ pt_load = true;
}
}
- if (first_idx == -1)
- return 0;
-
- return cmds[last_idx].p_vaddr + cmds[last_idx].p_memsz -
- ELF_PAGESTART(cmds[first_idx].p_vaddr);
+ return pt_load ? (max_addr - min_addr) : 0;
}
static int elf_read(struct file *file, void *buf, size_t len, loff_t pos)
@@ -823,8 +824,8 @@ static int parse_elf_properties(struct file *f, const struct elf_phdr *phdr,
static int load_elf_binary(struct linux_binprm *bprm)
{
struct file *interpreter = NULL; /* to shut gcc up */
- unsigned long load_addr = 0, load_bias = 0;
- int load_addr_set = 0;
+ unsigned long load_bias = 0, phdr_addr = 0;
+ int first_pt_load = 1;
unsigned long error;
struct elf_phdr *elf_ppnt, *elf_phdata, *interp_elf_phdata = NULL;
struct elf_phdr *elf_property_phdata = NULL;
@@ -1074,12 +1075,12 @@ out_free_interp:
vaddr = elf_ppnt->p_vaddr;
/*
- * The first time through the loop, load_addr_set is false:
+ * The first time through the loop, first_pt_load is true:
* layout will be calculated. Once set, use MAP_FIXED since
* we know we've already safely mapped the entire region with
* MAP_FIXED_NOREPLACE in the once-per-binary logic following.
*/
- if (load_addr_set) {
+ if (!first_pt_load) {
elf_flags |= MAP_FIXED;
} else if (elf_ex->e_type == ET_EXEC) {
/*
@@ -1117,7 +1118,7 @@ out_free_interp:
* without MAP_FIXED nor MAP_FIXED_NOREPLACE).
*/
alignment = maximum_alignment(elf_phdata, elf_ex->e_phnum);
- if (alignment > ELF_MIN_ALIGN) {
+ if (interpreter || alignment > ELF_MIN_ALIGN) {
load_bias = ELF_ET_DYN_BASE;
if (current->flags & PF_RANDOMIZE)
load_bias += arch_mmap_rnd();
@@ -1135,14 +1136,25 @@ out_free_interp:
* is then page aligned.
*/
load_bias = ELF_PAGESTART(load_bias - vaddr);
- }
- /*
- * Calculate the entire size of the ELF mapping (total_size).
- * (Note that load_addr_set is set to true later once the
- * initial mapping is performed.)
- */
- if (!load_addr_set) {
+ /*
+ * Calculate the entire size of the ELF mapping
+ * (total_size), used for the initial mapping,
+ * due to load_addr_set which is set to true later
+ * once the initial mapping is performed.
+ *
+ * Note that this is only sensible when the LOAD
+ * segments are contiguous (or overlapping). If
+ * used for LOADs that are far apart, this would
+ * cause the holes between LOADs to be mapped,
+ * running the risk of having the mapping fail,
+ * as it would be larger than the ELF file itself.
+ *
+ * As a result, only ET_DYN does this, since
+ * some ET_EXEC (e.g. ia64) may have large virtual
+ * memory holes between LOADs.
+ *
+ */
total_size = total_mapping_size(elf_phdata,
elf_ex->e_phnum);
if (!total_size) {
@@ -1159,16 +1171,25 @@ out_free_interp:
goto out_free_dentry;
}
- if (!load_addr_set) {
- load_addr_set = 1;
- load_addr = (elf_ppnt->p_vaddr - elf_ppnt->p_offset);
+ if (first_pt_load) {
+ first_pt_load = 0;
if (elf_ex->e_type == ET_DYN) {
load_bias += error -
ELF_PAGESTART(load_bias + vaddr);
- load_addr += load_bias;
reloc_func_desc = load_bias;
}
}
+
+ /*
+ * Figure out which segment in the file contains the Program
+ * Header table, and map to the associated memory address.
+ */
+ if (elf_ppnt->p_offset <= elf_ex->e_phoff &&
+ elf_ex->e_phoff < elf_ppnt->p_offset + elf_ppnt->p_filesz) {
+ phdr_addr = elf_ex->e_phoff - elf_ppnt->p_offset +
+ elf_ppnt->p_vaddr;
+ }
+
k = elf_ppnt->p_vaddr;
if ((elf_ppnt->p_flags & PF_X) && k < start_code)
start_code = k;
@@ -1204,6 +1225,7 @@ out_free_interp:
}
e_entry = elf_ex->e_entry + load_bias;
+ phdr_addr += load_bias;
elf_bss += load_bias;
elf_brk += load_bias;
start_code += load_bias;
@@ -1267,8 +1289,8 @@ out_free_interp:
goto out;
#endif /* ARCH_HAS_SETUP_ADDITIONAL_PAGES */
- retval = create_elf_tables(bprm, elf_ex,
- load_addr, interp_load_addr, e_entry);
+ retval = create_elf_tables(bprm, elf_ex, interp_load_addr,
+ e_entry, phdr_addr);
if (retval < 0)
goto out;
@@ -1619,17 +1641,16 @@ static void fill_siginfo_note(struct memelfnote *note, user_siginfo_t *csigdata,
* long file_ofs
* followed by COUNT filenames in ASCII: "FILE1" NUL "FILE2" NUL...
*/
-static int fill_files_note(struct memelfnote *note)
+static int fill_files_note(struct memelfnote *note, struct coredump_params *cprm)
{
- struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma;
unsigned count, size, names_ofs, remaining, n;
user_long_t *data;
user_long_t *start_end_ofs;
char *name_base, *name_curpos;
+ int i;
/* *Estimated* file count and total data size needed */
- count = mm->map_count;
+ count = cprm->vma_count;
if (count > UINT_MAX / 64)
return -EINVAL;
size = count * 64;
@@ -1651,11 +1672,12 @@ static int fill_files_note(struct memelfnote *note)
name_base = name_curpos = ((char *)data) + names_ofs;
remaining = size - names_ofs;
count = 0;
- for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) {
+ for (i = 0; i < cprm->vma_count; i++) {
+ struct core_vma_metadata *m = &cprm->vma_meta[i];
struct file *file;
const char *filename;
- file = vma->vm_file;
+ file = m->file;
if (!file)
continue;
filename = file_path(file, name_curpos, remaining);
@@ -1675,9 +1697,9 @@ static int fill_files_note(struct memelfnote *note)
memmove(name_curpos, filename, n);
name_curpos += n;
- *start_end_ofs++ = vma->vm_start;
- *start_end_ofs++ = vma->vm_end;
- *start_end_ofs++ = vma->vm_pgoff;
+ *start_end_ofs++ = m->start;
+ *start_end_ofs++ = m->end;
+ *start_end_ofs++ = m->pgoff;
count++;
}
@@ -1688,7 +1710,7 @@ static int fill_files_note(struct memelfnote *note)
* Count usually is less than mm->map_count,
* we need to move filenames down.
*/
- n = mm->map_count - count;
+ n = cprm->vma_count - count;
if (n != 0) {
unsigned shift_bytes = n * 3 * sizeof(data[0]);
memmove(name_base - shift_bytes, name_base,
@@ -1744,9 +1766,9 @@ static void do_thread_regset_writeback(struct task_struct *task,
static int fill_thread_core_info(struct elf_thread_core_info *t,
const struct user_regset_view *view,
- long signr, size_t *total)
+ long signr, struct elf_note_info *info)
{
- unsigned int i;
+ unsigned int note_iter, view_iter;
/*
* NT_PRSTATUS is the one special case, because the regset data
@@ -1760,17 +1782,17 @@ static int fill_thread_core_info(struct elf_thread_core_info *t,
fill_note(&t->notes[0], "CORE", NT_PRSTATUS,
PRSTATUS_SIZE, &t->prstatus);
- *total += notesize(&t->notes[0]);
+ info->size += notesize(&t->notes[0]);
do_thread_regset_writeback(t->task, &view->regsets[0]);
/*
* Each other regset might generate a note too. For each regset
- * that has no core_note_type or is inactive, we leave t->notes[i]
- * all zero and we'll know to skip writing it later.
+ * that has no core_note_type or is inactive, skip it.
*/
- for (i = 1; i < view->n; ++i) {
- const struct user_regset *regset = &view->regsets[i];
+ note_iter = 1;
+ for (view_iter = 1; view_iter < view->n; ++view_iter) {
+ const struct user_regset *regset = &view->regsets[view_iter];
int note_type = regset->core_note_type;
bool is_fpreg = note_type == NT_PRFPREG;
void *data;
@@ -1786,13 +1808,17 @@ static int fill_thread_core_info(struct elf_thread_core_info *t,
if (ret < 0)
continue;
+ if (WARN_ON_ONCE(note_iter >= info->thread_notes))
+ break;
+
if (is_fpreg)
SET_PR_FPVALID(&t->prstatus);
- fill_note(&t->notes[i], is_fpreg ? "CORE" : "LINUX",
+ fill_note(&t->notes[note_iter], is_fpreg ? "CORE" : "LINUX",
note_type, ret, data);
- *total += notesize(&t->notes[i]);
+ info->size += notesize(&t->notes[note_iter]);
+ note_iter++;
}
return 1;
@@ -1800,7 +1826,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t,
static int fill_note_info(struct elfhdr *elf, int phdrs,
struct elf_note_info *info,
- const kernel_siginfo_t *siginfo, struct pt_regs *regs)
+ struct coredump_params *cprm)
{
struct task_struct *dump_task = current;
const struct user_regset_view *view = task_user_regset_view(dump_task);
@@ -1872,7 +1898,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
* Now fill in each thread's information.
*/
for (t = info->thread; t != NULL; t = t->next)
- if (!fill_thread_core_info(t, view, siginfo->si_signo, &info->size))
+ if (!fill_thread_core_info(t, view, cprm->siginfo->si_signo, info))
return 0;
/*
@@ -1881,13 +1907,13 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
fill_psinfo(psinfo, dump_task->group_leader, dump_task->mm);
info->size += notesize(&info->psinfo);
- fill_siginfo_note(&info->signote, &info->csigdata, siginfo);
+ fill_siginfo_note(&info->signote, &info->csigdata, cprm->siginfo);
info->size += notesize(&info->signote);
fill_auxv_note(&info->auxv, current->mm);
info->size += notesize(&info->auxv);
- if (fill_files_note(&info->files) == 0)
+ if (fill_files_note(&info->files, cprm) == 0)
info->size += notesize(&info->files);
return 1;
@@ -2029,7 +2055,7 @@ static int elf_note_info_init(struct elf_note_info *info)
static int fill_note_info(struct elfhdr *elf, int phdrs,
struct elf_note_info *info,
- const kernel_siginfo_t *siginfo, struct pt_regs *regs)
+ struct coredump_params *cprm)
{
struct core_thread *ct;
struct elf_thread_status *ets;
@@ -2050,13 +2076,13 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
list_for_each_entry(ets, &info->thread_list, list) {
int sz;
- sz = elf_dump_thread_status(siginfo->si_signo, ets);
+ sz = elf_dump_thread_status(cprm->siginfo->si_signo, ets);
info->thread_status_size += sz;
}
/* now collect the dump for the current */
memset(info->prstatus, 0, sizeof(*info->prstatus));
- fill_prstatus(&info->prstatus->common, current, siginfo->si_signo);
- elf_core_copy_regs(&info->prstatus->pr_reg, regs);
+ fill_prstatus(&info->prstatus->common, current, cprm->siginfo->si_signo);
+ elf_core_copy_regs(&info->prstatus->pr_reg, cprm->regs);
/* Set up header */
fill_elf_header(elf, phdrs, ELF_ARCH, ELF_CORE_EFLAGS);
@@ -2072,18 +2098,18 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
fill_note(info->notes + 1, "CORE", NT_PRPSINFO,
sizeof(*info->psinfo), info->psinfo);
- fill_siginfo_note(info->notes + 2, &info->csigdata, siginfo);
+ fill_siginfo_note(info->notes + 2, &info->csigdata, cprm->siginfo);
fill_auxv_note(info->notes + 3, current->mm);
info->numnote = 4;
- if (fill_files_note(info->notes + info->numnote) == 0) {
+ if (fill_files_note(info->notes + info->numnote, cprm) == 0) {
info->notes_files = info->notes + info->numnote;
info->numnote++;
}
/* Try to dump the FPU. */
- info->prstatus->pr_fpvalid = elf_core_copy_task_fpregs(current, regs,
- info->fpu);
+ info->prstatus->pr_fpvalid =
+ elf_core_copy_task_fpregs(current, cprm->regs, info->fpu);
if (info->prstatus->pr_fpvalid)
fill_note(info->notes + info->numnote++,
"CORE", NT_PRFPREG, sizeof(*info->fpu), info->fpu);
@@ -2169,8 +2195,7 @@ static void fill_extnum_info(struct elfhdr *elf, struct elf_shdr *shdr4extnum,
static int elf_core_dump(struct coredump_params *cprm)
{
int has_dumped = 0;
- int vma_count, segs, i;
- size_t vma_data_size;
+ int segs, i;
struct elfhdr elf;
loff_t offset = 0, dataoff;
struct elf_note_info info = { };
@@ -2178,16 +2203,12 @@ static int elf_core_dump(struct coredump_params *cprm)
struct elf_shdr *shdr4extnum = NULL;
Elf_Half e_phnum;
elf_addr_t e_shoff;
- struct core_vma_metadata *vma_meta;
-
- if (dump_vma_snapshot(cprm, &vma_count, &vma_meta, &vma_data_size))
- return 0;
/*
* The number of segs are recored into ELF header as 16bit value.
* Please check DEFAULT_MAX_MAP_COUNT definition when you modify here.
*/
- segs = vma_count + elf_core_extra_phdrs();
+ segs = cprm->vma_count + elf_core_extra_phdrs();
/* for notes section */
segs++;
@@ -2201,7 +2222,7 @@ static int elf_core_dump(struct coredump_params *cprm)
* Collect all the non-memory information about the process for the
* notes. This also sets up the file header.
*/
- if (!fill_note_info(&elf, e_phnum, &info, cprm->siginfo, cprm->regs))
+ if (!fill_note_info(&elf, e_phnum, &info, cprm))
goto end_coredump;
has_dumped = 1;
@@ -2226,7 +2247,7 @@ static int elf_core_dump(struct coredump_params *cprm)
dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE);
- offset += vma_data_size;
+ offset += cprm->vma_data_size;
offset += elf_core_extra_data_size();
e_shoff = offset;
@@ -2246,8 +2267,8 @@ static int elf_core_dump(struct coredump_params *cprm)
goto end_coredump;
/* Write program headers for segments dump */
- for (i = 0; i < vma_count; i++) {
- struct core_vma_metadata *meta = vma_meta + i;
+ for (i = 0; i < cprm->vma_count; i++) {
+ struct core_vma_metadata *meta = cprm->vma_meta + i;
struct elf_phdr phdr;
phdr.p_type = PT_LOAD;
@@ -2284,8 +2305,8 @@ static int elf_core_dump(struct coredump_params *cprm)
/* Align to page */
dump_skip_to(cprm, dataoff);
- for (i = 0; i < vma_count; i++) {
- struct core_vma_metadata *meta = vma_meta + i;
+ for (i = 0; i < cprm->vma_count; i++) {
+ struct core_vma_metadata *meta = cprm->vma_meta + i;
if (!dump_user_range(cprm, meta->start, meta->dump_size))
goto end_coredump;
@@ -2302,7 +2323,6 @@ static int elf_core_dump(struct coredump_params *cprm)
end_coredump:
free_note_info(&info);
kfree(shdr4extnum);
- kvfree(vma_meta);
kfree(phdr4note);
return has_dumped;
}
@@ -2324,3 +2344,7 @@ static void __exit exit_elf_binfmt(void)
core_initcall(init_elf_binfmt);
module_exit(exit_elf_binfmt);
MODULE_LICENSE("GPL");
+
+#ifdef CONFIG_BINFMT_ELF_KUNIT_TEST
+#include "binfmt_elf_test.c"
+#endif
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index c6f588dc4a9d..08d0c8797828 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -83,8 +83,8 @@ static struct linux_binfmt elf_fdpic_format = {
.load_binary = load_elf_fdpic_binary,
#ifdef CONFIG_ELF_CORE
.core_dump = elf_fdpic_core_dump,
-#endif
.min_coredump = ELF_EXEC_PAGESIZE,
+#endif
};
static int __init init_elf_fdpic_binfmt(void)
@@ -1465,7 +1465,7 @@ static bool elf_fdpic_dump_segments(struct coredump_params *cprm,
static int elf_fdpic_core_dump(struct coredump_params *cprm)
{
int has_dumped = 0;
- int vma_count, segs;
+ int segs;
int i;
struct elfhdr *elf = NULL;
loff_t offset = 0, dataoff;
@@ -1480,8 +1480,6 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
elf_addr_t e_shoff;
struct core_thread *ct;
struct elf_thread_status *tmp;
- struct core_vma_metadata *vma_meta = NULL;
- size_t vma_data_size;
/* alloc memory for large data structures: too large to be on stack */
elf = kmalloc(sizeof(*elf), GFP_KERNEL);
@@ -1491,9 +1489,6 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
if (!psinfo)
goto end_coredump;
- if (dump_vma_snapshot(cprm, &vma_count, &vma_meta, &vma_data_size))
- goto end_coredump;
-
for (ct = current->signal->core_state->dumper.next;
ct; ct = ct->next) {
tmp = elf_dump_thread_status(cprm->siginfo->si_signo,
@@ -1513,7 +1508,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
tmp->next = thread_list;
thread_list = tmp;
- segs = vma_count + elf_core_extra_phdrs();
+ segs = cprm->vma_count + elf_core_extra_phdrs();
/* for notes section */
segs++;
@@ -1558,7 +1553,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
/* Page-align dumped data */
dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE);
- offset += vma_data_size;
+ offset += cprm->vma_data_size;
offset += elf_core_extra_data_size();
e_shoff = offset;
@@ -1578,8 +1573,8 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
goto end_coredump;
/* write program headers for segments dump */
- for (i = 0; i < vma_count; i++) {
- struct core_vma_metadata *meta = vma_meta + i;
+ for (i = 0; i < cprm->vma_count; i++) {
+ struct core_vma_metadata *meta = cprm->vma_meta + i;
struct elf_phdr phdr;
size_t sz;
@@ -1628,7 +1623,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
dump_skip_to(cprm, dataoff);
- if (!elf_fdpic_dump_segments(cprm, vma_meta, vma_count))
+ if (!elf_fdpic_dump_segments(cprm, cprm->vma_meta, cprm->vma_count))
goto end_coredump;
if (!elf_core_write_extra_data(cprm))
@@ -1652,7 +1647,6 @@ end_coredump:
thread_list = thread_list->next;
kfree(tmp);
}
- kvfree(vma_meta);
kfree(phdr4note);
kfree(elf);
kfree(psinfo);
diff --git a/fs/binfmt_elf_test.c b/fs/binfmt_elf_test.c
new file mode 100644
index 000000000000..11d734fec366
--- /dev/null
+++ b/fs/binfmt_elf_test.c
@@ -0,0 +1,64 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <kunit/test.h>
+
+static void total_mapping_size_test(struct kunit *test)
+{
+ struct elf_phdr empty[] = {
+ { .p_type = PT_LOAD, .p_vaddr = 0, .p_memsz = 0, },
+ { .p_type = PT_INTERP, .p_vaddr = 10, .p_memsz = 999999, },
+ };
+ /*
+ * readelf -lW /bin/mount | grep '^ .*0x0' | awk '{print "\t\t{ .p_type = PT_" \
+ * $1 ", .p_vaddr = " $3 ", .p_memsz = " $6 ", },"}'
+ */
+ struct elf_phdr mount[] = {
+ { .p_type = PT_PHDR, .p_vaddr = 0x00000040, .p_memsz = 0x0002d8, },
+ { .p_type = PT_INTERP, .p_vaddr = 0x00000318, .p_memsz = 0x00001c, },
+ { .p_type = PT_LOAD, .p_vaddr = 0x00000000, .p_memsz = 0x0033a8, },
+ { .p_type = PT_LOAD, .p_vaddr = 0x00004000, .p_memsz = 0x005c91, },
+ { .p_type = PT_LOAD, .p_vaddr = 0x0000a000, .p_memsz = 0x0022f8, },
+ { .p_type = PT_LOAD, .p_vaddr = 0x0000d330, .p_memsz = 0x000d40, },
+ { .p_type = PT_DYNAMIC, .p_vaddr = 0x0000d928, .p_memsz = 0x000200, },
+ { .p_type = PT_NOTE, .p_vaddr = 0x00000338, .p_memsz = 0x000030, },
+ { .p_type = PT_NOTE, .p_vaddr = 0x00000368, .p_memsz = 0x000044, },
+ { .p_type = PT_GNU_PROPERTY, .p_vaddr = 0x00000338, .p_memsz = 0x000030, },
+ { .p_type = PT_GNU_EH_FRAME, .p_vaddr = 0x0000b490, .p_memsz = 0x0001ec, },
+ { .p_type = PT_GNU_STACK, .p_vaddr = 0x00000000, .p_memsz = 0x000000, },
+ { .p_type = PT_GNU_RELRO, .p_vaddr = 0x0000d330, .p_memsz = 0x000cd0, },
+ };
+ size_t mount_size = 0xE070;
+ /* https://lore.kernel.org/linux-fsdevel/YfF18Dy85mCntXrx@fractal.localdomain */
+ struct elf_phdr unordered[] = {
+ { .p_type = PT_LOAD, .p_vaddr = 0x00000000, .p_memsz = 0x0033a8, },
+ { .p_type = PT_LOAD, .p_vaddr = 0x0000d330, .p_memsz = 0x000d40, },
+ { .p_type = PT_LOAD, .p_vaddr = 0x00004000, .p_memsz = 0x005c91, },
+ { .p_type = PT_LOAD, .p_vaddr = 0x0000a000, .p_memsz = 0x0022f8, },
+ };
+
+ /* No headers, no size. */
+ KUNIT_EXPECT_EQ(test, total_mapping_size(NULL, 0), 0);
+ KUNIT_EXPECT_EQ(test, total_mapping_size(empty, 0), 0);
+ /* Empty headers, no size. */
+ KUNIT_EXPECT_EQ(test, total_mapping_size(empty, 1), 0);
+ /* No PT_LOAD headers, no size. */
+ KUNIT_EXPECT_EQ(test, total_mapping_size(&empty[1], 1), 0);
+ /* Empty PT_LOAD and non-PT_LOAD headers, no size. */
+ KUNIT_EXPECT_EQ(test, total_mapping_size(empty, 2), 0);
+
+ /* Normal set of PT_LOADS, and expected size. */
+ KUNIT_EXPECT_EQ(test, total_mapping_size(mount, ARRAY_SIZE(mount)), mount_size);
+ /* Unordered PT_LOADs result in same size. */
+ KUNIT_EXPECT_EQ(test, total_mapping_size(unordered, ARRAY_SIZE(unordered)), mount_size);
+}
+
+static struct kunit_case binfmt_elf_test_cases[] = {
+ KUNIT_CASE(total_mapping_size_test),
+ {},
+};
+
+static struct kunit_suite binfmt_elf_test_suite = {
+ .name = KBUILD_MODNAME,
+ .test_cases = binfmt_elf_test_cases,
+};
+
+kunit_test_suite(binfmt_elf_test_suite);
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 5d776f80ee50..626898150011 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -37,6 +37,7 @@
#include <linux/flat.h>
#include <linux/uaccess.h>
#include <linux/vmalloc.h>
+#include <linux/coredump.h>
#include <asm/byteorder.h>
#include <asm/unaligned.h>
@@ -97,13 +98,17 @@ static int load_flat_shared_library(int id, struct lib_info *p);
#endif
static int load_flat_binary(struct linux_binprm *);
+#ifdef CONFIG_COREDUMP
static int flat_core_dump(struct coredump_params *cprm);
+#endif
static struct linux_binfmt flat_format = {
.module = THIS_MODULE,
.load_binary = load_flat_binary,
+#ifdef CONFIG_COREDUMP
.core_dump = flat_core_dump,
.min_coredump = PAGE_SIZE
+#endif
};
/****************************************************************************/
@@ -112,12 +117,14 @@ static struct linux_binfmt flat_format = {
* Currently only a stub-function.
*/
+#ifdef CONFIG_COREDUMP
static int flat_core_dump(struct coredump_params *cprm)
{
pr_warn("Process %s:%d received signr %d and should have core dumped\n",
current->comm, current->pid, cprm->siginfo->si_signo);
return 1;
}
+#endif
/****************************************************************************/
/*
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index c07f35719ee3..e1eae7ea823a 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -817,20 +817,16 @@ static struct file_system_type bm_fs_type = {
};
MODULE_ALIAS_FS("binfmt_misc");
-static struct ctl_table_header *binfmt_misc_header;
-
static int __init init_misc_binfmt(void)
{
int err = register_filesystem(&bm_fs_type);
if (!err)
insert_binfmt(&misc_format);
- binfmt_misc_header = register_sysctl_mount_point("fs/binfmt_misc");
- return 0;
+ return err;
}
static void __exit exit_misc_binfmt(void)
{
- unregister_sysctl_table(binfmt_misc_header);
unregister_binfmt(&misc_format);
unregister_filesystem(&bm_fs_type);
}
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 4188ba3fd8c3..99f9995670ea 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -17,6 +17,7 @@ subdir-ccflags-y += $(condflags)
subdir-ccflags-y += -Wno-missing-field-initializers
subdir-ccflags-y += -Wno-sign-compare
subdir-ccflags-y += -Wno-type-limits
+subdir-ccflags-y += -Wno-shift-negative-value
obj-$(CONFIG_BTRFS_FS) := btrfs.o
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index c9ee579bc5a6..ebc392ea1d74 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -789,11 +789,13 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info,
if (IS_ERR(eb)) {
free_pref(ref);
return PTR_ERR(eb);
- } else if (!extent_buffer_uptodate(eb)) {
+ }
+ if (!extent_buffer_uptodate(eb)) {
free_pref(ref);
free_extent_buffer(eb);
return -EIO;
}
+
if (lock)
btrfs_tree_read_lock(eb);
if (btrfs_header_level(eb) == 0)
@@ -1335,7 +1337,8 @@ again:
if (IS_ERR(eb)) {
ret = PTR_ERR(eb);
goto out;
- } else if (!extent_buffer_uptodate(eb)) {
+ }
+ if (!extent_buffer_uptodate(eb)) {
free_extent_buffer(eb);
ret = -EIO;
goto out;
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 1db24e6d6d90..c22d287e020b 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -124,7 +124,16 @@ void btrfs_put_block_group(struct btrfs_block_group *cache)
{
if (refcount_dec_and_test(&cache->refs)) {
WARN_ON(cache->pinned > 0);
- WARN_ON(cache->reserved > 0);
+ /*
+ * If there was a failure to cleanup a log tree, very likely due
+ * to an IO failure on a writeback attempt of one or more of its
+ * extent buffers, we could not do proper (and cheap) unaccounting
+ * of their reserved space, so don't warn on reserved > 0 in that
+ * case.
+ */
+ if (!(cache->flags & BTRFS_BLOCK_GROUP_METADATA) ||
+ !BTRFS_FS_LOG_CLEANUP_ERROR(cache->fs_info))
+ WARN_ON(cache->reserved > 0);
/*
* A block_group shouldn't be on the discard_list anymore.
@@ -1513,8 +1522,12 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
return;
- if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE))
+ sb_start_write(fs_info->sb);
+
+ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
+ sb_end_write(fs_info->sb);
return;
+ }
/*
* Long running balances can keep us blocked here for eternity, so
@@ -1522,6 +1535,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
*/
if (!mutex_trylock(&fs_info->reclaim_bgs_lock)) {
btrfs_exclop_finish(fs_info);
+ sb_end_write(fs_info->sb);
return;
}
@@ -1596,6 +1610,7 @@ next:
spin_unlock(&fs_info->unused_bgs_lock);
mutex_unlock(&fs_info->reclaim_bgs_lock);
btrfs_exclop_finish(fs_info);
+ sb_end_write(fs_info->sb);
}
void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info)
@@ -1997,6 +2012,7 @@ static int read_one_block_group(struct btrfs_fs_info *info,
cache->length = key->offset;
cache->used = btrfs_stack_block_group_used(bgi);
cache->flags = btrfs_stack_block_group_flags(bgi);
+ cache->global_root_id = btrfs_stack_block_group_chunk_objectid(bgi);
set_free_space_tree_thresholds(cache);
@@ -2279,7 +2295,7 @@ static int insert_block_group_item(struct btrfs_trans_handle *trans,
spin_lock(&block_group->lock);
btrfs_set_stack_block_group_used(&bgi, block_group->used);
btrfs_set_stack_block_group_chunk_objectid(&bgi,
- BTRFS_FIRST_CHUNK_TREE_OBJECTID);
+ block_group->global_root_id);
btrfs_set_stack_block_group_flags(&bgi, block_group->flags);
key.objectid = block_group->start;
key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
@@ -2435,6 +2451,27 @@ next:
btrfs_trans_release_chunk_metadata(trans);
}
+/*
+ * For extent tree v2 we use the block_group_item->chunk_offset to point at our
+ * global root id. For v1 it's always set to BTRFS_FIRST_CHUNK_TREE_OBJECTID.
+ */
+static u64 calculate_global_root_id(struct btrfs_fs_info *fs_info, u64 offset)
+{
+ u64 div = SZ_1G;
+ u64 index;
+
+ if (!btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))
+ return BTRFS_FIRST_CHUNK_TREE_OBJECTID;
+
+ /* If we have a smaller fs index based on 128MiB. */
+ if (btrfs_super_total_bytes(fs_info->super_copy) <= (SZ_1G * 10ULL))
+ div = SZ_128M;
+
+ offset = div64_u64(offset, div);
+ div64_u64_rem(offset, fs_info->nr_global_roots, &index);
+ return index;
+}
+
struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans,
u64 bytes_used, u64 type,
u64 chunk_offset, u64 size)
@@ -2455,6 +2492,8 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran
cache->flags = type;
cache->last_byte_to_unpin = (u64)-1;
cache->cached = BTRFS_CACHE_FINISHED;
+ cache->global_root_id = calculate_global_root_id(fs_info, cache->start);
+
if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
cache->needs_free_space = 1;
@@ -2544,6 +2583,19 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
int ret;
bool dirty_bg_running;
+ /*
+ * This can only happen when we are doing read-only scrub on read-only
+ * mount.
+ * In that case we should not start a new transaction on read-only fs.
+ * Thus here we skip all chunk allocations.
+ */
+ if (sb_rdonly(fs_info->sb)) {
+ mutex_lock(&fs_info->ro_block_group_mutex);
+ ret = inc_block_group_ro(cache, 0);
+ mutex_unlock(&fs_info->ro_block_group_mutex);
+ return ret;
+ }
+
do {
trans = btrfs_join_transaction(root);
if (IS_ERR(trans))
@@ -2671,7 +2723,7 @@ static int update_block_group_item(struct btrfs_trans_handle *trans,
bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
btrfs_set_stack_block_group_used(&bgi, cache->used);
btrfs_set_stack_block_group_chunk_objectid(&bgi,
- BTRFS_FIRST_CHUNK_TREE_OBJECTID);
+ cache->global_root_id);
btrfs_set_stack_block_group_flags(&bgi, cache->flags);
write_extent_buffer(leaf, &bgi, bi, sizeof(bgi));
btrfs_mark_buffer_dirty(leaf);
@@ -3974,9 +4026,22 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
* important and indicates a real bug if this happens.
*/
if (WARN_ON(space_info->bytes_pinned > 0 ||
- space_info->bytes_reserved > 0 ||
space_info->bytes_may_use > 0))
btrfs_dump_space_info(info, space_info, 0, 0);
+
+ /*
+ * If there was a failure to cleanup a log tree, very likely due
+ * to an IO failure on a writeback attempt of one or more of its
+ * extent buffers, we could not do proper (and cheap) unaccounting
+ * of their reserved space, so don't warn on bytes_reserved > 0 in
+ * that case.
+ */
+ if (!(space_info->flags & BTRFS_BLOCK_GROUP_METADATA) ||
+ !BTRFS_FS_LOG_CLEANUP_ERROR(info)) {
+ if (WARN_ON(space_info->bytes_reserved > 0))
+ btrfs_dump_space_info(info, space_info, 0, 0);
+ }
+
WARN_ON(space_info->reclaim_size > 0);
list_del(&space_info->list);
btrfs_sysfs_remove_space_info(space_info);
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index 5878b7ce3b78..93aabc68bb6a 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -68,6 +68,7 @@ struct btrfs_block_group {
u64 bytes_super;
u64 flags;
u64 cache_generation;
+ u64 global_root_id;
/*
* If the free space extent count exceeds this number, convert the block
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index b3e46aabc3d8..47e72d72f7d0 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -14,6 +14,13 @@
#include "delayed-inode.h"
/*
+ * Since we search a directory based on f_pos (struct dir_context::pos) we have
+ * to start at 2 since '.' and '..' have f_pos of 0 and 1 respectively, so
+ * everybody else has to start at 2 (see btrfs_real_readdir() and dir_emit_dots()).
+ */
+#define BTRFS_DIR_START_INDEX 2
+
+/*
* ordered_data_close is set by truncate when a file that used
* to have good data has been truncated to zero. When it is set
* the btrfs file release call will add this inode to the
@@ -173,8 +180,9 @@ struct btrfs_inode {
u64 disk_i_size;
/*
- * if this is a directory then index_cnt is the counter for the index
- * number for new files that are created
+ * If this is a directory then index_cnt is the counter for the index
+ * number for new files that are created. For an empty directory, this
+ * must be initialized to BTRFS_DIR_START_INDEX.
*/
u64 index_cnt;
@@ -333,6 +341,36 @@ static inline void btrfs_set_inode_last_sub_trans(struct btrfs_inode *inode)
spin_unlock(&inode->lock);
}
+/*
+ * Should be called while holding the inode's VFS lock in exclusive mode or in a
+ * context where no one else can access the inode concurrently (during inode
+ * creation or when loading an inode from disk).
+ */
+static inline void btrfs_set_inode_full_sync(struct btrfs_inode *inode)
+{
+ set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
+ /*
+ * The inode may have been part of a reflink operation in the last
+ * transaction that modified it, and then a fsync has reset the
+ * last_reflink_trans to avoid subsequent fsyncs in the same
+ * transaction to do unnecessary work. So update last_reflink_trans
+ * to the last_trans value (we have to be pessimistic and assume a
+ * reflink happened).
+ *
+ * The ->last_trans is protected by the inode's spinlock and we can
+ * have a concurrent ordered extent completion update it. Also set
+ * last_reflink_trans to ->last_trans only if the former is less than
+ * the later, because we can be called in a context where
+ * last_reflink_trans was set to the current transaction generation
+ * while ->last_trans was not yet updated in the current transaction,
+ * and therefore has a lower value.
+ */
+ spin_lock(&inode->lock);
+ if (inode->last_reflink_trans < inode->last_trans)
+ inode->last_reflink_trans = inode->last_trans;
+ spin_unlock(&inode->lock);
+}
+
static inline bool btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation)
{
bool ret = false;
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 71e5b2e9a1ba..be476f094300 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -219,7 +219,7 @@ static bool dec_and_test_compressed_bio(struct compressed_bio *cb, struct bio *b
bi_size += bvec->bv_len;
if (bio->bi_status)
- cb->errors = 1;
+ cb->status = bio->bi_status;
ASSERT(bi_size && bi_size <= cb->compressed_len);
last_io = refcount_sub_and_test(bi_size >> fs_info->sectorsize_bits,
@@ -234,7 +234,7 @@ static bool dec_and_test_compressed_bio(struct compressed_bio *cb, struct bio *b
return last_io;
}
-static void finish_compressed_bio_read(struct compressed_bio *cb, struct bio *bio)
+static void finish_compressed_bio_read(struct compressed_bio *cb)
{
unsigned int index;
struct page *page;
@@ -247,19 +247,18 @@ static void finish_compressed_bio_read(struct compressed_bio *cb, struct bio *bi
}
/* Do io completion on the original bio */
- if (cb->errors) {
- bio_io_error(cb->orig_bio);
+ if (cb->status != BLK_STS_OK) {
+ cb->orig_bio->bi_status = cb->status;
+ bio_endio(cb->orig_bio);
} else {
struct bio_vec *bvec;
struct bvec_iter_all iter_all;
- ASSERT(bio);
- ASSERT(!bio->bi_status);
/*
* We have verified the checksum already, set page checked so
* the end_io handlers know about it
*/
- ASSERT(!bio_flagged(bio, BIO_CLONED));
+ ASSERT(!bio_flagged(cb->orig_bio, BIO_CLONED));
bio_for_each_segment_all(bvec, cb->orig_bio, iter_all) {
u64 bvec_start = page_offset(bvec->bv_page) +
bvec->bv_offset;
@@ -308,7 +307,7 @@ static void end_compressed_bio_read(struct bio *bio)
* Some IO in this cb have failed, just skip checksum as there
* is no way it could be correct.
*/
- if (cb->errors == 1)
+ if (cb->status != BLK_STS_OK)
goto csum_failed;
inode = cb->inode;
@@ -324,8 +323,8 @@ static void end_compressed_bio_read(struct bio *bio)
csum_failed:
if (ret)
- cb->errors = 1;
- finish_compressed_bio_read(cb, bio);
+ cb->status = errno_to_blk_status(ret);
+ finish_compressed_bio_read(cb);
out:
bio_put(bio);
}
@@ -342,11 +341,12 @@ static noinline void end_compressed_writeback(struct inode *inode,
unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT;
struct page *pages[16];
unsigned long nr_pages = end_index - index + 1;
+ const int errno = blk_status_to_errno(cb->status);
int i;
int ret;
- if (cb->errors)
- mapping_set_error(inode->i_mapping, -EIO);
+ if (errno)
+ mapping_set_error(inode->i_mapping, errno);
while (nr_pages > 0) {
ret = find_get_pages_contig(inode->i_mapping, index,
@@ -358,7 +358,7 @@ static noinline void end_compressed_writeback(struct inode *inode,
continue;
}
for (i = 0; i < ret; i++) {
- if (cb->errors)
+ if (errno)
SetPageError(pages[i]);
btrfs_page_clamp_clear_writeback(fs_info, pages[i],
cb->start, cb->len);
@@ -381,9 +381,10 @@ static void finish_compressed_bio_write(struct compressed_bio *cb)
*/
btrfs_writepage_endio_finish_ordered(BTRFS_I(inode), NULL,
cb->start, cb->start + cb->len - 1,
- !cb->errors);
+ cb->status == BLK_STS_OK);
- end_compressed_writeback(inode, cb);
+ if (cb->writeback)
+ end_compressed_writeback(inode, cb);
/* Note, our inode could be gone now */
/*
@@ -506,7 +507,8 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
struct page **compressed_pages,
unsigned int nr_pages,
unsigned int write_flags,
- struct cgroup_subsys_state *blkcg_css)
+ struct cgroup_subsys_state *blkcg_css,
+ bool writeback)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct bio *bio = NULL;
@@ -524,13 +526,14 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
if (!cb)
return BLK_STS_RESOURCE;
refcount_set(&cb->pending_sectors, compressed_len >> fs_info->sectorsize_bits);
- cb->errors = 0;
+ cb->status = BLK_STS_OK;
cb->inode = &inode->vfs_inode;
cb->start = start;
cb->len = len;
cb->mirror_num = 0;
cb->compressed_pages = compressed_pages;
cb->compressed_len = compressed_len;
+ cb->writeback = writeback;
cb->orig_bio = NULL;
cb->nr_pages = nr_pages;
@@ -591,7 +594,7 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
if (submit) {
if (!skip_sum) {
- ret = btrfs_csum_one_bio(inode, bio, start, 1);
+ ret = btrfs_csum_one_bio(inode, bio, start, true);
if (ret)
goto finish_cb;
}
@@ -808,7 +811,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
u64 em_len;
u64 em_start;
struct extent_map *em;
- blk_status_t ret = BLK_STS_RESOURCE;
+ blk_status_t ret;
int faili = 0;
u8 *sums;
@@ -821,17 +824,21 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, file_offset, fs_info->sectorsize);
read_unlock(&em_tree->lock);
- if (!em)
- return BLK_STS_IOERR;
+ if (!em) {
+ ret = BLK_STS_IOERR;
+ goto out;
+ }
ASSERT(em->compress_type != BTRFS_COMPRESS_NONE);
compressed_len = em->block_len;
cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS);
- if (!cb)
+ if (!cb) {
+ ret = BLK_STS_RESOURCE;
goto out;
+ }
refcount_set(&cb->pending_sectors, compressed_len >> fs_info->sectorsize_bits);
- cb->errors = 0;
+ cb->status = BLK_STS_OK;
cb->inode = inode;
cb->mirror_num = mirror_num;
sums = cb->sums;
@@ -851,8 +858,10 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
nr_pages = DIV_ROUND_UP(compressed_len, PAGE_SIZE);
cb->compressed_pages = kcalloc(nr_pages, sizeof(struct page *),
GFP_NOFS);
- if (!cb->compressed_pages)
+ if (!cb->compressed_pages) {
+ ret = BLK_STS_RESOURCE;
goto fail1;
+ }
for (pg_index = 0; pg_index < nr_pages; pg_index++) {
cb->compressed_pages[pg_index] = alloc_page(GFP_NOFS);
@@ -938,7 +947,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
comp_bio = NULL;
}
}
- return 0;
+ return BLK_STS_OK;
fail2:
while (faili >= 0) {
@@ -951,6 +960,8 @@ fail1:
kfree(cb);
out:
free_extent_map(em);
+ bio->bi_status = ret;
+ bio_endio(bio);
return ret;
finish_cb:
if (comp_bio) {
@@ -970,7 +981,7 @@ finish_cb:
*/
ASSERT(refcount_read(&cb->pending_sectors));
/* Now we are the only one referring @cb, can finish it safely. */
- finish_compressed_bio_read(cb, NULL);
+ finish_compressed_bio_read(cb);
return ret;
}
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 56eef0821e3e..ac5b20731d2a 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -22,6 +22,8 @@ struct btrfs_inode;
/* Maximum length of compressed data stored on disk */
#define BTRFS_MAX_COMPRESSED (SZ_128K)
+static_assert((BTRFS_MAX_COMPRESSED % PAGE_SIZE) == 0);
+
/* Maximum size of data before compression */
#define BTRFS_MAX_UNCOMPRESSED (SZ_128K)
@@ -52,8 +54,11 @@ struct compressed_bio {
/* The compression algorithm for this bio */
u8 compress_type;
+ /* Whether this is a write for writeback. */
+ bool writeback;
+
/* IO errors */
- u8 errors;
+ blk_status_t status;
int mirror_num;
/* for reads, this is the bio we are copying the data into */
@@ -95,7 +100,8 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
struct page **compressed_pages,
unsigned int nr_pages,
unsigned int write_flags,
- struct cgroup_subsys_state *blkcg_css);
+ struct cgroup_subsys_state *blkcg_css,
+ bool writeback);
blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
int mirror_num, unsigned long bio_flags);
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index a7db3f6f1b7b..0eecf98d0abb 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -846,9 +846,11 @@ struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,
btrfs_header_owner(parent),
btrfs_node_ptr_generation(parent, slot),
level - 1, &first_key);
- if (!IS_ERR(eb) && !extent_buffer_uptodate(eb)) {
+ if (IS_ERR(eb))
+ return eb;
+ if (!extent_buffer_uptodate(eb)) {
free_extent_buffer(eb);
- eb = ERR_PTR(-EIO);
+ return ERR_PTR(-EIO);
}
return eb;
@@ -1436,13 +1438,13 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
/* now we're allowed to do a blocking uptodate check */
ret = btrfs_read_buffer(tmp, gen, parent_level - 1, &first_key);
- if (!ret) {
- *eb_ret = tmp;
- return 0;
+ if (ret) {
+ free_extent_buffer(tmp);
+ btrfs_release_path(p);
+ return -EIO;
}
- free_extent_buffer(tmp);
- btrfs_release_path(p);
- return -EIO;
+ *eb_ret = tmp;
+ return 0;
}
/*
@@ -1460,19 +1462,19 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
ret = -EAGAIN;
tmp = read_tree_block(fs_info, blocknr, root->root_key.objectid,
gen, parent_level - 1, &first_key);
- if (!IS_ERR(tmp)) {
- /*
- * If the read above didn't mark this buffer up to date,
- * it will never end up being up to date. Set ret to EIO now
- * and give up so that our caller doesn't loop forever
- * on our EAGAINs.
- */
- if (!extent_buffer_uptodate(tmp))
- ret = -EIO;
- free_extent_buffer(tmp);
- } else {
- ret = PTR_ERR(tmp);
+ if (IS_ERR(tmp)) {
+ btrfs_release_path(p);
+ return PTR_ERR(tmp);
}
+ /*
+ * If the read above didn't mark this buffer up to date,
+ * it will never end up being up to date. Set ret to EIO now
+ * and give up so that our caller doesn't loop forever
+ * on our EAGAINs.
+ */
+ if (!extent_buffer_uptodate(tmp))
+ ret = -EIO;
+ free_extent_buffer(tmp);
btrfs_release_path(p);
return ret;
@@ -2990,16 +2992,11 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
if (free_space < data_size)
goto out_unlock;
- /* cow and double check */
ret = btrfs_cow_block(trans, root, right, upper,
slot + 1, &right, BTRFS_NESTING_RIGHT_COW);
if (ret)
goto out_unlock;
- free_space = btrfs_leaf_free_space(right);
- if (free_space < data_size)
- goto out_unlock;
-
left_nritems = btrfs_header_nritems(left);
if (left_nritems == 0)
goto out_unlock;
@@ -3224,7 +3221,6 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
goto out;
}
- /* cow and double check */
ret = btrfs_cow_block(trans, root, left,
path->nodes[1], slot - 1, &left,
BTRFS_NESTING_LEFT_COW);
@@ -3235,12 +3231,6 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
goto out;
}
- free_space = btrfs_leaf_free_space(left);
- if (free_space < data_size) {
- ret = 1;
- goto out;
- }
-
if (check_sibling_keys(left, right)) {
ret = -EUCLEAN;
goto out;
@@ -4170,24 +4160,22 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *leaf;
- u32 last_off;
- u32 dsize = 0;
int ret = 0;
int wret;
- int i;
u32 nritems;
leaf = path->nodes[0];
- last_off = btrfs_item_offset(leaf, slot + nr - 1);
-
- for (i = 0; i < nr; i++)
- dsize += btrfs_item_size(leaf, slot + i);
-
nritems = btrfs_header_nritems(leaf);
if (slot + nr != nritems) {
- int data_end = leaf_data_end(leaf);
+ const u32 last_off = btrfs_item_offset(leaf, slot + nr - 1);
+ const int data_end = leaf_data_end(leaf);
struct btrfs_map_token token;
+ u32 dsize = 0;
+ int i;
+
+ for (i = 0; i < nr; i++)
+ dsize += btrfs_item_size(leaf, slot + i);
memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
data_end + dsize,
@@ -4227,24 +4215,50 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
fixup_low_keys(path, &disk_key, 1);
}
- /* delete the leaf if it is mostly empty */
+ /*
+ * Try to delete the leaf if it is mostly empty. We do this by
+ * trying to move all its items into its left and right neighbours.
+ * If we can't move all the items, then we don't delete it - it's
+ * not ideal, but future insertions might fill the leaf with more
+ * items, or items from other leaves might be moved later into our
+ * leaf due to deletions on those leaves.
+ */
if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) {
+ u32 min_push_space;
+
/* push_leaf_left fixes the path.
* make sure the path still points to our leaf
* for possible call to del_ptr below
*/
slot = path->slots[1];
atomic_inc(&leaf->refs);
-
- wret = push_leaf_left(trans, root, path, 1, 1,
- 1, (u32)-1);
+ /*
+ * We want to be able to at least push one item to the
+ * left neighbour leaf, and that's the first item.
+ */
+ min_push_space = sizeof(struct btrfs_item) +
+ btrfs_item_size(leaf, 0);
+ wret = push_leaf_left(trans, root, path, 0,
+ min_push_space, 1, (u32)-1);
if (wret < 0 && wret != -ENOSPC)
ret = wret;
if (path->nodes[0] == leaf &&
btrfs_header_nritems(leaf)) {
- wret = push_leaf_right(trans, root, path, 1,
- 1, 1, 0);
+ /*
+ * If we were not able to push all items from our
+ * leaf to its left neighbour, then attempt to
+ * either push all the remaining items to the
+ * right neighbour or none. There's no advantage
+ * in pushing only some items, instead of all, as
+ * it's pointless to end up with a leaf having
+ * too few items while the neighbours can be full
+ * or nearly full.
+ */
+ nritems = btrfs_header_nritems(leaf);
+ min_push_space = leaf_space_used(leaf, 0, nritems);
+ wret = push_leaf_right(trans, root, path, 0,
+ min_push_space, 1, 0);
if (wret < 0 && wret != -ENOSPC)
ret = wret;
}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index b4a9b1c58d22..b7631b88426e 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -49,6 +49,7 @@ extern struct kmem_cache *btrfs_free_space_bitmap_cachep;
struct btrfs_ordered_sum;
struct btrfs_ref;
struct btrfs_bio;
+struct btrfs_ioctl_encoded_io_args;
#define BTRFS_MAGIC 0x4D5F53665248425FULL /* ascii _BHRfS_M, no null */
@@ -145,6 +146,11 @@ enum {
BTRFS_FS_STATE_DUMMY_FS_INFO,
BTRFS_FS_STATE_NO_CSUMS,
+
+ /* Indicates there was an error cleaning up a log tree. */
+ BTRFS_FS_STATE_LOG_CLEANUP_ERROR,
+
+ BTRFS_FS_STATE_COUNT
};
#define BTRFS_BACKREF_REV_MAX 256
@@ -271,8 +277,14 @@ struct btrfs_super_block {
/* the UUID written into btree blocks */
u8 metadata_uuid[BTRFS_FSID_SIZE];
+ /* Extent tree v2 */
+ __le64 block_group_root;
+ __le64 block_group_root_generation;
+ u8 block_group_root_level;
+
/* future expansion */
- __le64 reserved[28];
+ u8 reserved8[7];
+ __le64 reserved[25];
u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS];
@@ -297,6 +309,26 @@ static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE);
#define BTRFS_FEATURE_COMPAT_RO_SAFE_SET 0ULL
#define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR 0ULL
+#ifdef CONFIG_BTRFS_DEBUG
+/*
+ * Extent tree v2 supported only with CONFIG_BTRFS_DEBUG
+ */
+#define BTRFS_FEATURE_INCOMPAT_SUPP \
+ (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \
+ BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \
+ BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \
+ BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \
+ BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \
+ BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD | \
+ BTRFS_FEATURE_INCOMPAT_RAID56 | \
+ BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF | \
+ BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \
+ BTRFS_FEATURE_INCOMPAT_NO_HOLES | \
+ BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \
+ BTRFS_FEATURE_INCOMPAT_RAID1C34 | \
+ BTRFS_FEATURE_INCOMPAT_ZONED | \
+ BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2)
+#else
#define BTRFS_FEATURE_INCOMPAT_SUPP \
(BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \
BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \
@@ -311,6 +343,7 @@ static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE);
BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \
BTRFS_FEATURE_INCOMPAT_RAID1C34 | \
BTRFS_FEATURE_INCOMPAT_ZONED)
+#endif
#define BTRFS_FEATURE_INCOMPAT_SAFE_SET \
(BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF)
@@ -599,6 +632,9 @@ enum {
/* Indicate that we want the transaction kthread to commit right now. */
BTRFS_FS_COMMIT_TRANS,
+ /* Indicate we have half completed snapshot deletions pending. */
+ BTRFS_FS_UNFINISHED_DROPS,
+
#if BITS_PER_LONG == 32
/* Indicate if we have error/warn message printed on 32bit systems */
BTRFS_FS_32BIT_ERROR,
@@ -630,6 +666,7 @@ struct btrfs_fs_info {
struct btrfs_root *quota_root;
struct btrfs_root *uuid_root;
struct btrfs_root *data_reloc_root;
+ struct btrfs_root *block_group_root;
/* the log root tree is a directory of all the other log roots */
struct btrfs_root *log_root_tree;
@@ -1024,6 +1061,8 @@ struct btrfs_fs_info {
spinlock_t relocation_bg_lock;
u64 data_reloc_bg;
+ u64 nr_global_roots;
+
spinlock_t zone_active_bgs_lock;
struct list_head zone_active_bgs;
@@ -1103,8 +1142,15 @@ enum {
BTRFS_ROOT_QGROUP_FLUSHING,
/* We started the orphan cleanup for this root. */
BTRFS_ROOT_ORPHAN_CLEANUP,
+ /* This root has a drop operation that was started previously. */
+ BTRFS_ROOT_UNFINISHED_DROP,
};
+static inline void btrfs_wake_unfinished_drop(struct btrfs_fs_info *fs_info)
+{
+ clear_and_wake_up_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags);
+}
+
/*
* Record swapped tree blocks of a subvolume tree for delayed subtree trace
* code. For detail check comment in fs/btrfs/qgroup.c.
@@ -1596,25 +1642,25 @@ DECLARE_BTRFS_SETGET_BITS(64)
static inline u##bits btrfs_##name(const struct extent_buffer *eb, \
const type *s) \
{ \
- BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \
+ static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \
return btrfs_get_##bits(eb, s, offsetof(type, member)); \
} \
static inline void btrfs_set_##name(const struct extent_buffer *eb, type *s, \
u##bits val) \
{ \
- BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \
+ static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \
btrfs_set_##bits(eb, s, offsetof(type, member), val); \
} \
static inline u##bits btrfs_token_##name(struct btrfs_map_token *token, \
const type *s) \
{ \
- BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \
+ static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \
return btrfs_get_token_##bits(token, s, offsetof(type, member));\
} \
static inline void btrfs_set_token_##name(struct btrfs_map_token *token,\
type *s, u##bits val) \
{ \
- BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \
+ static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \
btrfs_set_token_##bits(token, s, offsetof(type, member), val); \
}
@@ -1645,8 +1691,8 @@ static inline void btrfs_set_##name(type *s, u##bits val) \
static inline u64 btrfs_device_total_bytes(const struct extent_buffer *eb,
struct btrfs_dev_item *s)
{
- BUILD_BUG_ON(sizeof(u64) !=
- sizeof(((struct btrfs_dev_item *)0))->total_bytes);
+ static_assert(sizeof(u64) ==
+ sizeof(((struct btrfs_dev_item *)0))->total_bytes);
return btrfs_get_64(eb, s, offsetof(struct btrfs_dev_item,
total_bytes));
}
@@ -1654,8 +1700,8 @@ static inline void btrfs_set_device_total_bytes(const struct extent_buffer *eb,
struct btrfs_dev_item *s,
u64 val)
{
- BUILD_BUG_ON(sizeof(u64) !=
- sizeof(((struct btrfs_dev_item *)0))->total_bytes);
+ static_assert(sizeof(u64) ==
+ sizeof(((struct btrfs_dev_item *)0))->total_bytes);
WARN_ON(!IS_ALIGNED(val, eb->fs_info->sectorsize));
btrfs_set_64(eb, s, offsetof(struct btrfs_dev_item, total_bytes), val);
}
@@ -2315,6 +2361,17 @@ BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup,
BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup,
num_devices, 64);
+/*
+ * For extent tree v2 we overload the extent root with the block group root, as
+ * we will have multiple extent roots.
+ */
+BTRFS_SETGET_STACK_FUNCS(backup_block_group_root, struct btrfs_root_backup,
+ extent_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_block_group_root_gen, struct btrfs_root_backup,
+ extent_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_block_group_root_level,
+ struct btrfs_root_backup, extent_root_level, 8);
+
/* struct btrfs_balance_item */
BTRFS_SETGET_FUNCS(balance_flags, struct btrfs_balance_item, flags, 64);
@@ -2449,6 +2506,13 @@ BTRFS_SETGET_STACK_FUNCS(super_cache_generation, struct btrfs_super_block,
BTRFS_SETGET_STACK_FUNCS(super_magic, struct btrfs_super_block, magic, 64);
BTRFS_SETGET_STACK_FUNCS(super_uuid_tree_generation, struct btrfs_super_block,
uuid_tree_generation, 64);
+BTRFS_SETGET_STACK_FUNCS(super_block_group_root, struct btrfs_super_block,
+ block_group_root, 64);
+BTRFS_SETGET_STACK_FUNCS(super_block_group_root_generation,
+ struct btrfs_super_block,
+ block_group_root_generation, 64);
+BTRFS_SETGET_STACK_FUNCS(super_block_group_root_level, struct btrfs_super_block,
+ block_group_root_level, 8);
int btrfs_super_csum_size(const struct btrfs_super_block *s);
const char *btrfs_super_csum_name(u16 csum_type);
@@ -2826,7 +2890,8 @@ void btrfs_subvolume_release_metadata(struct btrfs_root *root,
struct btrfs_block_rsv *rsv);
void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes);
-int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes);
+int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
+ u64 disk_num_bytes);
u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo);
int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
u64 start, u64 end);
@@ -3142,7 +3207,7 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_ordered_sum *sums);
blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio,
- u64 file_start, int contig);
+ u64 offset, bool one_ordered);
int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
struct list_head *list, int search_commit);
void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
@@ -3243,6 +3308,11 @@ int btrfs_writepage_cow_fixup(struct page *page);
void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode,
struct page *page, u64 start,
u64 end, bool uptodate);
+ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
+ struct btrfs_ioctl_encoded_io_args *encoded);
+ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
+ const struct btrfs_ioctl_encoded_io_args *encoded);
+
extern const struct dentry_operations btrfs_dentry_operations;
extern const struct iomap_ops btrfs_dio_iomap_ops;
extern const struct iomap_dio_ops btrfs_dio_ops;
@@ -3288,7 +3358,7 @@ void btrfs_exclop_balance(struct btrfs_fs_info *fs_info,
int __init btrfs_auto_defrag_init(void);
void __cold btrfs_auto_defrag_exit(void);
int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
- struct btrfs_inode *inode);
+ struct btrfs_inode *inode, u32 extent_thresh);
int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info);
void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info);
int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync);
@@ -3305,6 +3375,8 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
struct btrfs_trans_handle **trans_out);
int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode, u64 start, u64 end);
+ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from,
+ const struct btrfs_ioctl_encoded_io_args *encoded);
int btrfs_release_file(struct inode *inode, struct file *file);
int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
size_t num_pages, loff_t pos, size_t write_bytes,
@@ -3593,6 +3665,9 @@ do { \
#define BTRFS_FS_ERROR(fs_info) (unlikely(test_bit(BTRFS_FS_STATE_ERROR, \
&(fs_info)->fs_state)))
+#define BTRFS_FS_LOG_CLEANUP_ERROR(fs_info) \
+ (unlikely(test_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR, \
+ &(fs_info)->fs_state)))
__printf(5, 6)
__cold
@@ -3758,7 +3833,7 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
-int btrfs_recover_relocation(struct btrfs_root *root);
+int btrfs_recover_relocation(struct btrfs_fs_info *fs_info);
int btrfs_reloc_clone_csums(struct btrfs_inode *inode, u64 file_pos, u64 len);
int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *buf,
@@ -3870,5 +3945,8 @@ static inline bool btrfs_is_data_reloc_root(const struct btrfs_root *root)
#define PageOrdered(page) PagePrivate2(page)
#define SetPageOrdered(page) SetPagePrivate2(page)
#define ClearPageOrdered(page) ClearPagePrivate2(page)
+#define folio_test_ordered(folio) folio_test_private_2(folio)
+#define folio_set_ordered(folio) folio_set_private_2(folio)
+#define folio_clear_ordered(folio) folio_clear_private_2(folio)
#endif
diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c
index fb46a28f5065..bd8267c4687d 100644
--- a/fs/btrfs/delalloc-space.c
+++ b/fs/btrfs/delalloc-space.c
@@ -270,11 +270,11 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
}
static void calc_inode_reservations(struct btrfs_fs_info *fs_info,
- u64 num_bytes, u64 *meta_reserve,
- u64 *qgroup_reserve)
+ u64 num_bytes, u64 disk_num_bytes,
+ u64 *meta_reserve, u64 *qgroup_reserve)
{
u64 nr_extents = count_max_extents(num_bytes);
- u64 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, num_bytes);
+ u64 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, disk_num_bytes);
u64 inode_update = btrfs_calc_metadata_size(fs_info, 1);
*meta_reserve = btrfs_calc_insert_metadata_size(fs_info,
@@ -288,7 +288,8 @@ static void calc_inode_reservations(struct btrfs_fs_info *fs_info,
*qgroup_reserve = nr_extents * fs_info->nodesize;
}
-int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
+int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
+ u64 disk_num_bytes)
{
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -318,6 +319,7 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
}
num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
+ disk_num_bytes = ALIGN(disk_num_bytes, fs_info->sectorsize);
/*
* We always want to do it this way, every other way is wrong and ends
@@ -329,8 +331,8 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
* everything out and try again, which is bad. This way we just
* over-reserve slightly, and clean up the mess when we are done.
*/
- calc_inode_reservations(fs_info, num_bytes, &meta_reserve,
- &qgroup_reserve);
+ calc_inode_reservations(fs_info, num_bytes, disk_num_bytes,
+ &meta_reserve, &qgroup_reserve);
ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true);
if (ret)
return ret;
@@ -349,7 +351,7 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
spin_lock(&inode->lock);
nr_extents = count_max_extents(num_bytes);
btrfs_mod_outstanding_extents(inode, nr_extents);
- inode->csum_bytes += num_bytes;
+ inode->csum_bytes += disk_num_bytes;
btrfs_calculate_inode_block_rsv_size(fs_info, inode);
spin_unlock(&inode->lock);
@@ -454,7 +456,7 @@ int btrfs_delalloc_reserve_space(struct btrfs_inode *inode,
ret = btrfs_check_data_free_space(inode, reserved, start, len);
if (ret < 0)
return ret;
- ret = btrfs_delalloc_reserve_metadata(inode, len);
+ ret = btrfs_delalloc_reserve_metadata(inode, len, len);
if (ret < 0) {
btrfs_free_reserved_data_space(inode, *reserved, start, len);
extent_changeset_free(*reserved);
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 62b9651ea662..71fd99b48283 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -243,6 +243,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
struct btrfs_device *srcdev,
struct btrfs_device **device_out)
{
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_device *device;
struct block_device *bdev;
struct rcu_string *name;
@@ -271,7 +272,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
sync_blockdev(bdev);
- list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) {
+ list_for_each_entry(device, &fs_devices->devices, dev_list) {
if (device->bdev == bdev) {
btrfs_err(fs_info,
"target device is in the filesystem!");
@@ -302,6 +303,9 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
goto error;
}
rcu_assign_pointer(device->name, name);
+ ret = lookup_bdev(device_path, &device->devt);
+ if (ret)
+ goto error;
set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
device->generation = 0;
@@ -320,17 +324,17 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
device->mode = FMODE_EXCL;
device->dev_stats_valid = 1;
set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
- device->fs_devices = fs_info->fs_devices;
+ device->fs_devices = fs_devices;
ret = btrfs_get_dev_zone_info(device, false);
if (ret)
goto error;
- mutex_lock(&fs_info->fs_devices->device_list_mutex);
- list_add(&device->dev_list, &fs_info->fs_devices->devices);
- fs_info->fs_devices->num_devices++;
- fs_info->fs_devices->open_devices++;
- mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+ mutex_lock(&fs_devices->device_list_mutex);
+ list_add(&device->dev_list, &fs_devices->devices);
+ fs_devices->num_devices++;
+ fs_devices->open_devices++;
+ mutex_unlock(&fs_devices->device_list_mutex);
*device_out = device;
return 0;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 505ba21230b1..b30309f187cf 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -441,17 +441,31 @@ static int csum_one_extent_buffer(struct extent_buffer *eb)
else
ret = btrfs_check_leaf_full(eb);
- if (ret < 0) {
- btrfs_print_tree(eb, 0);
+ if (ret < 0)
+ goto error;
+
+ /*
+ * Also check the generation, the eb reached here must be newer than
+ * last committed. Or something seriously wrong happened.
+ */
+ if (unlikely(btrfs_header_generation(eb) <= fs_info->last_trans_committed)) {
+ ret = -EUCLEAN;
btrfs_err(fs_info,
- "block=%llu write time tree block corruption detected",
- eb->start);
- WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
- return ret;
+ "block=%llu bad generation, have %llu expect > %llu",
+ eb->start, btrfs_header_generation(eb),
+ fs_info->last_trans_committed);
+ goto error;
}
write_extent_buffer(eb, result, 0, fs_info->csum_size);
return 0;
+
+error:
+ btrfs_print_tree(eb, 0);
+ btrfs_err(fs_info, "block=%llu write time tree block corruption detected",
+ eb->start);
+ WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
+ return ret;
}
/* Checksum all dirty extent buffers in one bio_vec */
@@ -999,41 +1013,40 @@ static int btree_releasepage(struct page *page, gfp_t gfp_flags)
return try_release_extent_buffer(page);
}
-static void btree_invalidatepage(struct page *page, unsigned int offset,
- unsigned int length)
+static void btree_invalidate_folio(struct folio *folio, size_t offset,
+ size_t length)
{
struct extent_io_tree *tree;
- tree = &BTRFS_I(page->mapping->host)->io_tree;
- extent_invalidatepage(tree, page, offset);
- btree_releasepage(page, GFP_NOFS);
- if (PagePrivate(page)) {
- btrfs_warn(BTRFS_I(page->mapping->host)->root->fs_info,
- "page private not zero on page %llu",
- (unsigned long long)page_offset(page));
- detach_page_private(page);
+ tree = &BTRFS_I(folio->mapping->host)->io_tree;
+ extent_invalidate_folio(tree, folio, offset);
+ btree_releasepage(&folio->page, GFP_NOFS);
+ if (folio_get_private(folio)) {
+ btrfs_warn(BTRFS_I(folio->mapping->host)->root->fs_info,
+ "folio private not zero on folio %llu",
+ (unsigned long long)folio_pos(folio));
+ folio_detach_private(folio);
}
}
-static int btree_set_page_dirty(struct page *page)
-{
#ifdef DEBUG
- struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
+static bool btree_dirty_folio(struct address_space *mapping,
+ struct folio *folio)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(mapping->host->i_sb);
struct btrfs_subpage *subpage;
struct extent_buffer *eb;
int cur_bit = 0;
- u64 page_start = page_offset(page);
+ u64 page_start = folio_pos(folio);
if (fs_info->sectorsize == PAGE_SIZE) {
- BUG_ON(!PagePrivate(page));
- eb = (struct extent_buffer *)page->private;
+ eb = folio_get_private(folio);
BUG_ON(!eb);
BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
BUG_ON(!atomic_read(&eb->refs));
btrfs_assert_tree_write_locked(eb);
- return __set_page_dirty_nobuffers(page);
+ return filemap_dirty_folio(mapping, folio);
}
- ASSERT(PagePrivate(page) && page->private);
- subpage = (struct btrfs_subpage *)page->private;
+ subpage = folio_get_private(folio);
ASSERT(subpage->dirty_bitmap);
while (cur_bit < BTRFS_SUBPAGE_BITMAP_SIZE) {
@@ -1059,18 +1072,20 @@ static int btree_set_page_dirty(struct page *page)
cur_bit += (fs_info->nodesize >> fs_info->sectorsize_bits);
}
-#endif
- return __set_page_dirty_nobuffers(page);
+ return filemap_dirty_folio(mapping, folio);
}
+#else
+#define btree_dirty_folio filemap_dirty_folio
+#endif
static const struct address_space_operations btree_aops = {
.writepages = btree_writepages,
.releasepage = btree_releasepage,
- .invalidatepage = btree_invalidatepage,
+ .invalidate_folio = btree_invalidate_folio,
#ifdef CONFIG_MIGRATION
.migratepage = btree_migratepage,
#endif
- .set_page_dirty = btree_set_page_dirty,
+ .dirty_folio = btree_dirty_folio,
};
struct extent_buffer *btrfs_find_create_tree_block(
@@ -1289,12 +1304,33 @@ struct btrfs_root *btrfs_global_root(struct btrfs_fs_info *fs_info,
return root;
}
+static u64 btrfs_global_root_id(struct btrfs_fs_info *fs_info, u64 bytenr)
+{
+ struct btrfs_block_group *block_group;
+ u64 ret;
+
+ if (!btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))
+ return 0;
+
+ if (bytenr)
+ block_group = btrfs_lookup_block_group(fs_info, bytenr);
+ else
+ block_group = btrfs_lookup_first_block_group(fs_info, bytenr);
+ ASSERT(block_group);
+ if (!block_group)
+ return 0;
+ ret = block_group->global_root_id;
+ btrfs_put_block_group(block_group);
+
+ return ret;
+}
+
struct btrfs_root *btrfs_csum_root(struct btrfs_fs_info *fs_info, u64 bytenr)
{
struct btrfs_key key = {
.objectid = BTRFS_CSUM_TREE_OBJECTID,
.type = BTRFS_ROOT_ITEM_KEY,
- .offset = 0,
+ .offset = btrfs_global_root_id(fs_info, bytenr),
};
return btrfs_global_root(fs_info, &key);
@@ -1305,7 +1341,7 @@ struct btrfs_root *btrfs_extent_root(struct btrfs_fs_info *fs_info, u64 bytenr)
struct btrfs_key key = {
.objectid = BTRFS_EXTENT_TREE_OBJECTID,
.type = BTRFS_ROOT_ITEM_KEY,
- .offset = 0,
+ .offset = btrfs_global_root_id(fs_info, bytenr),
};
return btrfs_global_root(fs_info, &key);
@@ -1522,7 +1558,8 @@ static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root,
ret = PTR_ERR(root->node);
root->node = NULL;
goto fail;
- } else if (!btrfs_buffer_uptodate(root->node, generation, 0)) {
+ }
+ if (!btrfs_buffer_uptodate(root->node, generation, 0)) {
ret = -EIO;
goto fail;
}
@@ -1727,6 +1764,7 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
btrfs_put_root(fs_info->uuid_root);
btrfs_put_root(fs_info->fs_root);
btrfs_put_root(fs_info->data_reloc_root);
+ btrfs_put_root(fs_info->block_group_root);
btrfs_check_leaked_roots(fs_info);
btrfs_extent_buffer_leak_debug_check(fs_info);
kfree(fs_info->super_copy);
@@ -1925,8 +1963,7 @@ static void end_workqueue_fn(struct btrfs_work *work)
static int cleaner_kthread(void *arg)
{
- struct btrfs_root *root = arg;
- struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)arg;
int again;
while (1) {
@@ -1959,7 +1996,7 @@ static int cleaner_kthread(void *arg)
btrfs_run_delayed_iputs(fs_info);
- again = btrfs_clean_one_deleted_snapshot(root);
+ again = btrfs_clean_one_deleted_snapshot(fs_info);
mutex_unlock(&fs_info->cleaner_mutex);
/*
@@ -2095,8 +2132,6 @@ static void backup_super_roots(struct btrfs_fs_info *info)
{
const int next_backup = info->backup_root_index;
struct btrfs_root_backup *root_backup;
- struct btrfs_root *extent_root = btrfs_extent_root(info, 0);
- struct btrfs_root *csum_root = btrfs_csum_root(info, 0);
root_backup = info->super_for_commit->super_roots + next_backup;
@@ -2121,11 +2156,30 @@ static void backup_super_roots(struct btrfs_fs_info *info)
btrfs_set_backup_chunk_root_level(root_backup,
btrfs_header_level(info->chunk_root->node));
- btrfs_set_backup_extent_root(root_backup, extent_root->node->start);
- btrfs_set_backup_extent_root_gen(root_backup,
- btrfs_header_generation(extent_root->node));
- btrfs_set_backup_extent_root_level(root_backup,
- btrfs_header_level(extent_root->node));
+ if (btrfs_fs_incompat(info, EXTENT_TREE_V2)) {
+ btrfs_set_backup_block_group_root(root_backup,
+ info->block_group_root->node->start);
+ btrfs_set_backup_block_group_root_gen(root_backup,
+ btrfs_header_generation(info->block_group_root->node));
+ btrfs_set_backup_block_group_root_level(root_backup,
+ btrfs_header_level(info->block_group_root->node));
+ } else {
+ struct btrfs_root *extent_root = btrfs_extent_root(info, 0);
+ struct btrfs_root *csum_root = btrfs_csum_root(info, 0);
+
+ btrfs_set_backup_extent_root(root_backup,
+ extent_root->node->start);
+ btrfs_set_backup_extent_root_gen(root_backup,
+ btrfs_header_generation(extent_root->node));
+ btrfs_set_backup_extent_root_level(root_backup,
+ btrfs_header_level(extent_root->node));
+
+ btrfs_set_backup_csum_root(root_backup, csum_root->node->start);
+ btrfs_set_backup_csum_root_gen(root_backup,
+ btrfs_header_generation(csum_root->node));
+ btrfs_set_backup_csum_root_level(root_backup,
+ btrfs_header_level(csum_root->node));
+ }
/*
* we might commit during log recovery, which happens before we set
@@ -2146,12 +2200,6 @@ static void backup_super_roots(struct btrfs_fs_info *info)
btrfs_set_backup_dev_root_level(root_backup,
btrfs_header_level(info->dev_root->node));
- btrfs_set_backup_csum_root(root_backup, csum_root->node->start);
- btrfs_set_backup_csum_root_gen(root_backup,
- btrfs_header_generation(csum_root->node));
- btrfs_set_backup_csum_root_level(root_backup,
- btrfs_header_level(csum_root->node));
-
btrfs_set_backup_total_bytes(root_backup,
btrfs_super_total_bytes(info->super_copy));
btrfs_set_backup_bytes_used(root_backup,
@@ -2269,6 +2317,7 @@ static void free_root_pointers(struct btrfs_fs_info *info, bool free_chunk_root)
free_root_extent_buffers(info->uuid_root);
free_root_extent_buffers(info->fs_root);
free_root_extent_buffers(info->data_reloc_root);
+ free_root_extent_buffers(info->block_group_root);
if (free_chunk_root)
free_root_extent_buffers(info->chunk_root);
}
@@ -2504,11 +2553,13 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
log_tree_root->node = NULL;
btrfs_put_root(log_tree_root);
return ret;
- } else if (!extent_buffer_uptodate(log_tree_root->node)) {
+ }
+ if (!extent_buffer_uptodate(log_tree_root->node)) {
btrfs_err(fs_info, "failed to read log tree");
btrfs_put_root(log_tree_root);
return -EIO;
}
+
/* returns with log_tree_root freed on success */
ret = btrfs_recover_log_trees(log_tree_root);
if (ret) {
@@ -2533,6 +2584,7 @@ static int load_global_roots_objectid(struct btrfs_root *tree_root,
{
struct btrfs_fs_info *fs_info = tree_root->fs_info;
struct btrfs_root *root;
+ u64 max_global_id = 0;
int ret;
struct btrfs_key key = {
.objectid = objectid,
@@ -2568,6 +2620,13 @@ static int load_global_roots_objectid(struct btrfs_root *tree_root,
break;
btrfs_release_path(path);
+ /*
+ * Just worry about this for extent tree, it'll be the same for
+ * everybody.
+ */
+ if (objectid == BTRFS_EXTENT_TREE_OBJECTID)
+ max_global_id = max(max_global_id, key.offset);
+
found = true;
root = read_tree_root_path(tree_root, path, &key);
if (IS_ERR(root)) {
@@ -2585,6 +2644,9 @@ static int load_global_roots_objectid(struct btrfs_root *tree_root,
}
btrfs_release_path(path);
+ if (objectid == BTRFS_EXTENT_TREE_OBJECTID)
+ fs_info->nr_global_roots = max_global_id + 1;
+
if (!found || ret) {
if (objectid == BTRFS_CSUM_TREE_OBJECTID)
set_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state);
@@ -2930,6 +2992,56 @@ out:
return ret;
}
+static int load_super_root(struct btrfs_root *root, u64 bytenr, u64 gen, int level)
+{
+ int ret = 0;
+
+ root->node = read_tree_block(root->fs_info, bytenr,
+ root->root_key.objectid, gen, level, NULL);
+ if (IS_ERR(root->node)) {
+ ret = PTR_ERR(root->node);
+ root->node = NULL;
+ return ret;
+ }
+ if (!extent_buffer_uptodate(root->node)) {
+ free_extent_buffer(root->node);
+ root->node = NULL;
+ return -EIO;
+ }
+
+ btrfs_set_root_node(&root->root_item, root->node);
+ root->commit_root = btrfs_root_node(root);
+ btrfs_set_root_refs(&root->root_item, 1);
+ return ret;
+}
+
+static int load_important_roots(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_super_block *sb = fs_info->super_copy;
+ u64 gen, bytenr;
+ int level, ret;
+
+ bytenr = btrfs_super_root(sb);
+ gen = btrfs_super_generation(sb);
+ level = btrfs_super_root_level(sb);
+ ret = load_super_root(fs_info->tree_root, bytenr, gen, level);
+ if (ret) {
+ btrfs_warn(fs_info, "couldn't read tree root");
+ return ret;
+ }
+
+ if (!btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))
+ return 0;
+
+ bytenr = btrfs_super_block_group_root(sb);
+ gen = btrfs_super_block_group_root_generation(sb);
+ level = btrfs_super_block_group_root_level(sb);
+ ret = load_super_root(fs_info->block_group_root, bytenr, gen, level);
+ if (ret)
+ btrfs_warn(fs_info, "couldn't read block group root");
+ return ret;
+}
+
static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
{
int backup_index = find_newest_super_backup(fs_info);
@@ -2939,10 +3051,17 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
int ret = 0;
int i;
- for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) {
- u64 generation;
- int level;
+ if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
+ struct btrfs_root *root;
+
+ root = btrfs_alloc_root(fs_info, BTRFS_BLOCK_GROUP_TREE_OBJECTID,
+ GFP_KERNEL);
+ if (!root)
+ return -ENOMEM;
+ fs_info->block_group_root = root;
+ }
+ for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) {
if (handle_error) {
if (!IS_ERR(tree_root->node))
free_extent_buffer(tree_root->node);
@@ -2967,29 +3086,13 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
if (ret < 0)
return ret;
}
- generation = btrfs_super_generation(sb);
- level = btrfs_super_root_level(sb);
- tree_root->node = read_tree_block(fs_info, btrfs_super_root(sb),
- BTRFS_ROOT_TREE_OBJECTID,
- generation, level, NULL);
- if (IS_ERR(tree_root->node)) {
- handle_error = true;
- ret = PTR_ERR(tree_root->node);
- tree_root->node = NULL;
- btrfs_warn(fs_info, "couldn't read tree root");
- continue;
- } else if (!extent_buffer_uptodate(tree_root->node)) {
+ ret = load_important_roots(fs_info);
+ if (ret) {
handle_error = true;
- ret = -EIO;
- btrfs_warn(fs_info, "error while reading tree root");
continue;
}
- btrfs_set_root_node(&tree_root->root_item, tree_root->node);
- tree_root->commit_root = btrfs_root_node(tree_root);
- btrfs_set_root_refs(&tree_root->root_item, 1);
-
/*
* No need to hold btrfs_root::objectid_mutex since the fs
* hasn't been fully initialised and we are the only user
@@ -3009,8 +3112,8 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
}
/* All successful */
- fs_info->generation = generation;
- fs_info->last_trans_committed = generation;
+ fs_info->generation = btrfs_header_generation(tree_root->node);
+ fs_info->last_trans_committed = fs_info->generation;
fs_info->last_reloc_trans = 0;
/* Always begin writing backup roots after the one being used */
@@ -3293,7 +3396,7 @@ int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info)
up_read(&fs_info->cleanup_work_sem);
mutex_lock(&fs_info->cleaner_mutex);
- ret = btrfs_recover_relocation(fs_info->tree_root);
+ ret = btrfs_recover_relocation(fs_info);
mutex_unlock(&fs_info->cleaner_mutex);
if (ret < 0) {
btrfs_warn(fs_info, "failed to recover relocation: %d", ret);
@@ -3594,21 +3697,12 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
generation = btrfs_super_chunk_root_generation(disk_super);
level = btrfs_super_chunk_root_level(disk_super);
-
- chunk_root->node = read_tree_block(fs_info,
- btrfs_super_chunk_root(disk_super),
- BTRFS_CHUNK_TREE_OBJECTID,
- generation, level, NULL);
- if (IS_ERR(chunk_root->node) ||
- !extent_buffer_uptodate(chunk_root->node)) {
+ ret = load_super_root(chunk_root, btrfs_super_chunk_root(disk_super),
+ generation, level);
+ if (ret) {
btrfs_err(fs_info, "failed to read chunk root");
- if (!IS_ERR(chunk_root->node))
- free_extent_buffer(chunk_root->node);
- chunk_root->node = NULL;
goto fail_tree_roots;
}
- btrfs_set_root_node(&chunk_root->root_item, chunk_root->node);
- chunk_root->commit_root = btrfs_root_node(chunk_root);
read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid,
offsetof(struct btrfs_header, chunk_tree_uuid),
@@ -3728,7 +3822,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
goto fail_sysfs;
}
- fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
+ fs_info->cleaner_kthread = kthread_run(cleaner_kthread, fs_info,
"btrfs-cleaner");
if (IS_ERR(fs_info->cleaner_kthread))
goto fail_sysfs;
@@ -3813,6 +3907,10 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
set_bit(BTRFS_FS_OPEN, &fs_info->flags);
+ /* Kick the cleaner thread so it'll start deleting snapshots. */
+ if (test_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags))
+ wake_up_process(fs_info->cleaner_kthread);
+
clear_oneshot:
btrfs_clear_oneshot_options(fs_info);
return 0;
@@ -4536,6 +4634,12 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
*/
kthread_park(fs_info->cleaner_kthread);
+ /*
+ * If we had UNFINISHED_DROPS we could still be processing them, so
+ * clear that bit and wake up relocation so it can stop.
+ */
+ btrfs_wake_unfinished_drop(fs_info);
+
/* wait for the qgroup rescan worker to stop */
btrfs_qgroup_wait_for_completion(fs_info, false);
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 5e8bef4b7563..2e10514ecda8 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -111,6 +111,8 @@ static inline struct btrfs_root *btrfs_grab_root(struct btrfs_root *root)
static inline struct btrfs_root *btrfs_block_group_root(struct btrfs_fs_info *fs_info)
{
+ if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))
+ return fs_info->block_group_root;
return btrfs_extent_root(fs_info, 0);
}
diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h
index 04083ee5ae6e..c3eb52dbe61c 100644
--- a/fs/btrfs/extent-io-tree.h
+++ b/fs/btrfs/extent-io-tree.h
@@ -244,8 +244,8 @@ void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
u64 *start_ret, u64 *end_ret, u32 bits);
int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
u64 *start_ret, u64 *end_ret, u32 bits);
-int extent_invalidatepage(struct extent_io_tree *tree,
- struct page *page, unsigned long offset);
+int extent_invalidate_folio(struct extent_io_tree *tree,
+ struct folio *folio, size_t offset);
bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start,
u64 *end, u64 max_bytes,
struct extent_state **cached_state);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index d89273c4b6b8..f477035a2ac2 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -598,7 +598,7 @@ fail:
static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
- int refs_to_drop, int *last_ref)
+ int refs_to_drop)
{
struct btrfs_key key;
struct btrfs_extent_data_ref *ref1 = NULL;
@@ -631,7 +631,6 @@ static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
if (num_refs == 0) {
ret = btrfs_del_item(trans, root, path);
- *last_ref = 1;
} else {
if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
@@ -1072,8 +1071,7 @@ static noinline_for_stack
void update_inline_extent_backref(struct btrfs_path *path,
struct btrfs_extent_inline_ref *iref,
int refs_to_mod,
- struct btrfs_delayed_extent_op *extent_op,
- int *last_ref)
+ struct btrfs_delayed_extent_op *extent_op)
{
struct extent_buffer *leaf = path->nodes[0];
struct btrfs_extent_item *ei;
@@ -1121,7 +1119,6 @@ void update_inline_extent_backref(struct btrfs_path *path,
else
btrfs_set_shared_data_ref_count(leaf, sref, refs);
} else {
- *last_ref = 1;
size = btrfs_extent_inline_ref_size(type);
item_size = btrfs_item_size(leaf, path->slots[0]);
ptr = (unsigned long)iref;
@@ -1166,8 +1163,7 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
}
return -EUCLEAN;
}
- update_inline_extent_backref(path, iref, refs_to_add,
- extent_op, NULL);
+ update_inline_extent_backref(path, iref, refs_to_add, extent_op);
} else if (ret == -ENOENT) {
setup_inline_extent_backref(trans->fs_info, path, iref, parent,
root_objectid, owner, offset,
@@ -1181,21 +1177,17 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
struct btrfs_extent_inline_ref *iref,
- int refs_to_drop, int is_data, int *last_ref)
+ int refs_to_drop, int is_data)
{
int ret = 0;
BUG_ON(!is_data && refs_to_drop != 1);
- if (iref) {
- update_inline_extent_backref(path, iref, -refs_to_drop, NULL,
- last_ref);
- } else if (is_data) {
- ret = remove_extent_data_ref(trans, root, path, refs_to_drop,
- last_ref);
- } else {
- *last_ref = 1;
+ if (iref)
+ update_inline_extent_backref(path, iref, -refs_to_drop, NULL);
+ else if (is_data)
+ ret = remove_extent_data_ref(trans, root, path, refs_to_drop);
+ else
ret = btrfs_del_item(trans, root, path);
- }
return ret;
}
@@ -2766,12 +2758,11 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
spin_unlock(&cache->lock);
if (!readonly && return_free_space &&
global_rsv->space_info == space_info) {
- u64 to_add = len;
-
spin_lock(&global_rsv->lock);
if (!global_rsv->full) {
- to_add = min(len, global_rsv->size -
- global_rsv->reserved);
+ u64 to_add = min(len, global_rsv->size -
+ global_rsv->reserved);
+
global_rsv->reserved += to_add;
btrfs_space_info_update_bytes_may_use(fs_info,
space_info, to_add);
@@ -2862,6 +2853,35 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
return 0;
}
+static int do_free_extent_accounting(struct btrfs_trans_handle *trans,
+ u64 bytenr, u64 num_bytes, bool is_data)
+{
+ int ret;
+
+ if (is_data) {
+ struct btrfs_root *csum_root;
+
+ csum_root = btrfs_csum_root(trans->fs_info, bytenr);
+ ret = btrfs_del_csums(trans, csum_root, bytenr, num_bytes);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
+ }
+
+ ret = add_to_free_space_tree(trans, bytenr, num_bytes);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
+
+ ret = btrfs_update_block_group(trans, bytenr, num_bytes, false);
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
+
+ return ret;
+}
+
/*
* Drop one or more refs of @node.
*
@@ -2943,7 +2963,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
u64 refs;
u64 bytenr = node->bytenr;
u64 num_bytes = node->num_bytes;
- int last_ref = 0;
bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA);
extent_root = btrfs_extent_root(info, bytenr);
@@ -3010,8 +3029,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
}
/* Must be SHARED_* item, remove the backref first */
ret = remove_extent_backref(trans, extent_root, path,
- NULL, refs_to_drop, is_data,
- &last_ref);
+ NULL, refs_to_drop, is_data);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
@@ -3136,8 +3154,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
}
if (found_extent) {
ret = remove_extent_backref(trans, extent_root, path,
- iref, refs_to_drop, is_data,
- &last_ref);
+ iref, refs_to_drop, is_data);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
@@ -3182,7 +3199,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
}
}
- last_ref = 1;
ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
num_to_del);
if (ret) {
@@ -3191,28 +3207,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
}
btrfs_release_path(path);
- if (is_data) {
- struct btrfs_root *csum_root;
- csum_root = btrfs_csum_root(info, bytenr);
- ret = btrfs_del_csums(trans, csum_root, bytenr,
- num_bytes);
- if (ret) {
- btrfs_abort_transaction(trans, ret);
- goto out;
- }
- }
-
- ret = add_to_free_space_tree(trans, bytenr, num_bytes);
- if (ret) {
- btrfs_abort_transaction(trans, ret);
- goto out;
- }
-
- ret = btrfs_update_block_group(trans, bytenr, num_bytes, false);
- if (ret) {
- btrfs_abort_transaction(trans, ret);
- goto out;
- }
+ ret = do_free_extent_accounting(trans, bytenr, num_bytes, is_data);
}
btrfs_release_path(path);
@@ -4605,6 +4600,28 @@ int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start,
return ret;
}
+static int alloc_reserved_extent(struct btrfs_trans_handle *trans, u64 bytenr,
+ u64 num_bytes)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ int ret;
+
+ ret = remove_from_free_space_tree(trans, bytenr, num_bytes);
+ if (ret)
+ return ret;
+
+ ret = btrfs_update_block_group(trans, bytenr, num_bytes, true);
+ if (ret) {
+ ASSERT(!ret);
+ btrfs_err(fs_info, "update block group failed for %llu %llu",
+ bytenr, num_bytes);
+ return ret;
+ }
+
+ trace_btrfs_reserved_extent_alloc(fs_info, bytenr, num_bytes);
+ return 0;
+}
+
static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
u64 parent, u64 root_objectid,
u64 flags, u64 owner, u64 offset,
@@ -4665,18 +4682,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(path->nodes[0]);
btrfs_free_path(path);
- ret = remove_from_free_space_tree(trans, ins->objectid, ins->offset);
- if (ret)
- return ret;
-
- ret = btrfs_update_block_group(trans, ins->objectid, ins->offset, true);
- if (ret) { /* -ENOENT, logic error */
- btrfs_err(fs_info, "update block group failed for %llu %llu",
- ins->objectid, ins->offset);
- BUG();
- }
- trace_btrfs_reserved_extent_alloc(fs_info, ins->objectid, ins->offset);
- return ret;
+ return alloc_reserved_extent(trans, ins->objectid, ins->offset);
}
static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
@@ -4694,7 +4700,6 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
struct extent_buffer *leaf;
struct btrfs_delayed_tree_ref *ref;
u32 size = sizeof(*extent_item) + sizeof(*iref);
- u64 num_bytes;
u64 flags = extent_op->flags_to_set;
bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
@@ -4704,12 +4709,10 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
if (skinny_metadata) {
extent_key.offset = ref->level;
extent_key.type = BTRFS_METADATA_ITEM_KEY;
- num_bytes = fs_info->nodesize;
} else {
extent_key.offset = node->num_bytes;
extent_key.type = BTRFS_EXTENT_ITEM_KEY;
size += sizeof(*block_info);
- num_bytes = node->num_bytes;
}
path = btrfs_alloc_path();
@@ -4754,22 +4757,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(leaf);
btrfs_free_path(path);
- ret = remove_from_free_space_tree(trans, extent_key.objectid,
- num_bytes);
- if (ret)
- return ret;
-
- ret = btrfs_update_block_group(trans, extent_key.objectid,
- fs_info->nodesize, true);
- if (ret) { /* -ENOENT, logic error */
- btrfs_err(fs_info, "update block group failed for %llu %llu",
- extent_key.objectid, extent_key.offset);
- BUG();
- }
-
- trace_btrfs_reserved_extent_alloc(fs_info, extent_key.objectid,
- fs_info->nodesize);
- return ret;
+ return alloc_reserved_extent(trans, node->bytenr, fs_info->nodesize);
}
int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
@@ -5622,6 +5610,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
int ret;
int level;
bool root_dropped = false;
+ bool unfinished_drop = false;
btrfs_debug(fs_info, "Drop subvolume %llu", root->root_key.objectid);
@@ -5664,6 +5653,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
* already dropped.
*/
set_bit(BTRFS_ROOT_DELETING, &root->state);
+ unfinished_drop = test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state);
+
if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
level = btrfs_header_level(root->node);
path->nodes[level] = btrfs_lock_root_node(root);
@@ -5839,6 +5830,13 @@ out_free:
btrfs_free_path(path);
out:
/*
+ * We were an unfinished drop root, check to see if there are any
+ * pending, and if not clear and wake up any waiters.
+ */
+ if (!err && unfinished_drop)
+ btrfs_maybe_wake_unfinished_drop(fs_info);
+
+ /*
* So if we need to stop dropping the snapshot for whatever reason we
* need to make sure to add it back to the dead root list so that we
* keep trying to do the work later. This also cleans up roots if we
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index dee86911a4be..724e8fe06aa0 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1507,17 +1507,17 @@ void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end)
void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
{
+ struct address_space *mapping = inode->i_mapping;
unsigned long index = start >> PAGE_SHIFT;
unsigned long end_index = end >> PAGE_SHIFT;
- struct page *page;
+ struct folio *folio;
while (index <= end_index) {
- page = find_get_page(inode->i_mapping, index);
- BUG_ON(!page); /* Pages should be in the extent_io_tree */
- __set_page_dirty_nobuffers(page);
- account_page_redirty(page);
- put_page(page);
- index++;
+ folio = filemap_get_folio(mapping, index);
+ filemap_dirty_folio(mapping, folio);
+ folio_account_redirty(folio);
+ index += folio_nr_pages(folio);
+ folio_put(folio);
}
}
@@ -2610,6 +2610,7 @@ static bool btrfs_check_repairable(struct inode *inode,
* a good copy of the failed sector and if we succeed, we have setup
* everything for repair_io_failure to do the rest for us.
*/
+ ASSERT(failed_mirror);
failrec->failed_mirror = failed_mirror;
failrec->this_mirror++;
if (failrec->this_mirror == failed_mirror)
@@ -2639,7 +2640,6 @@ int btrfs_repair_one_sector(struct inode *inode,
const int icsum = bio_offset >> fs_info->sectorsize_bits;
struct bio *repair_bio;
struct btrfs_bio *repair_bbio;
- blk_status_t status;
btrfs_debug(fs_info,
"repair read error: read error at %llu", start);
@@ -2678,13 +2678,13 @@ int btrfs_repair_one_sector(struct inode *inode,
"repair read error: submitting new read to mirror %d",
failrec->this_mirror);
- status = submit_bio_hook(inode, repair_bio, failrec->this_mirror,
- failrec->bio_flags);
- if (status) {
- free_io_failure(failure_tree, tree, failrec);
- bio_put(repair_bio);
- }
- return blk_status_to_errno(status);
+ /*
+ * At this point we have a bio, so any errors from submit_bio_hook()
+ * will be handled by the endio on the repair_bio, so we can't return an
+ * error here.
+ */
+ submit_bio_hook(inode, repair_bio, failrec->this_mirror, failrec->bio_flags);
+ return BLK_STS_OK;
}
static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len)
@@ -3068,6 +3068,14 @@ static void end_bio_extent_readpage(struct bio *bio)
if (is_data_inode(inode)) {
/*
+ * If we failed to submit the IO at all we'll have a
+ * mirror_num == 0, in which case we need to just mark
+ * the page with an error and unlock it and carry on.
+ */
+ if (mirror == 0)
+ goto readpage_ok;
+
+ /*
* btrfs_submit_read_repair() will handle all the good
* and bad sectors, we just continue to the next bvec.
*/
@@ -3321,7 +3329,6 @@ static int alloc_new_bio(struct btrfs_inode *inode,
bio_ctrl->bio_flags = bio_flags;
bio->bi_end_io = end_io_func;
bio->bi_private = &inode->io_tree;
- bio->bi_write_hint = inode->vfs_inode.i_write_hint;
bio->bi_opf = opf;
ret = calc_bio_boundaries(bio_ctrl, inode, file_offset);
if (ret < 0)
@@ -3534,7 +3541,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
}
em = btrfs_get_extent(BTRFS_I(inode), page, pg_offset, start, len);
- if (em_cached && !IS_ERR_OR_NULL(em)) {
+ if (em_cached && !IS_ERR(em)) {
BUG_ON(*em_cached);
refcount_inc(&em->refs);
*em_cached = em;
@@ -3563,7 +3570,6 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
u64 cur_end;
struct extent_map *em;
int ret = 0;
- int nr = 0;
size_t pg_offset = 0;
size_t iosize;
size_t blocksize = inode->i_sb->s_blocksize;
@@ -3608,9 +3614,10 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
}
em = __get_extent_map(inode, page, pg_offset, cur,
end - cur + 1, em_cached);
- if (IS_ERR_OR_NULL(em)) {
+ if (IS_ERR(em)) {
unlock_extent(tree, cur, end);
end_page_read(page, false, cur, end + 1 - cur);
+ ret = PTR_ERR(em);
break;
}
extent_offset = cur - em->start;
@@ -3721,9 +3728,7 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
end_bio_extent_readpage, 0,
this_bio_flag,
force_bio_submit);
- if (!ret) {
- nr++;
- } else {
+ if (ret) {
unlock_extent(tree, cur, cur + iosize - 1);
end_page_read(page, false, cur, iosize);
goto out;
@@ -3951,7 +3956,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
}
em = btrfs_get_extent(inode, NULL, 0, cur, end - cur + 1);
- if (IS_ERR_OR_NULL(em)) {
+ if (IS_ERR(em)) {
btrfs_page_set_error(fs_info, page, cur, end - cur + 1);
ret = PTR_ERR_OR_ZERO(em);
break;
@@ -4048,6 +4053,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
static int __extent_writepage(struct page *page, struct writeback_control *wbc,
struct extent_page_data *epd)
{
+ struct folio *folio = page_folio(page);
struct inode *inode = page->mapping->host;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
const u64 page_start = page_offset(page);
@@ -4068,8 +4074,8 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
pg_offset = offset_in_page(i_size);
if (page->index > end_index ||
(page->index == end_index && !pg_offset)) {
- page->mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE);
- unlock_page(page);
+ folio_invalidate(folio, 0, folio_size(folio));
+ folio_unlock(folio);
return 0;
}
@@ -4780,11 +4786,12 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc,
return ret;
}
if (cache) {
- /* Impiles write in zoned mode */
- btrfs_put_block_group(cache);
- /* Mark the last eb in a block group */
+ /*
+ * Implies write in zoned mode. Mark the last eb in a block group.
+ */
if (cache->seq_zone && eb->start + eb->len == cache->zone_capacity)
set_bit(EXTENT_BUFFER_ZONE_FINISH, &eb->bflags);
+ btrfs_put_block_group(cache);
}
ret = write_one_eb(eb, wbc, epd);
free_extent_buffer(eb);
@@ -5218,17 +5225,17 @@ void extent_readahead(struct readahead_control *rac)
}
/*
- * basic invalidatepage code, this waits on any locked or writeback
- * ranges corresponding to the page, and then deletes any extent state
+ * basic invalidate_folio code, this waits on any locked or writeback
+ * ranges corresponding to the folio, and then deletes any extent state
* records from the tree
*/
-int extent_invalidatepage(struct extent_io_tree *tree,
- struct page *page, unsigned long offset)
+int extent_invalidate_folio(struct extent_io_tree *tree,
+ struct folio *folio, size_t offset)
{
struct extent_state *cached_state = NULL;
- u64 start = page_offset(page);
- u64 end = start + PAGE_SIZE - 1;
- size_t blocksize = page->mapping->host->i_sb->s_blocksize;
+ u64 start = folio_pos(folio);
+ u64 end = start + folio_size(folio) - 1;
+ size_t blocksize = folio->mapping->host->i_sb->s_blocksize;
/* This function is only called for the btree inode */
ASSERT(tree->owner == IO_TREE_BTREE_INODE_IO);
@@ -5238,7 +5245,7 @@ int extent_invalidatepage(struct extent_io_tree *tree,
return 0;
lock_extent_bits(tree, start, end, &cached_state);
- wait_on_page_writeback(page);
+ folio_wait_writeback(folio);
/*
* Currently for btree io tree, only EXTENT_LOCKED is utilized,
@@ -5390,7 +5397,7 @@ static struct extent_map *get_extent_skip_holes(struct btrfs_inode *inode,
break;
len = ALIGN(len, sectorsize);
em = btrfs_get_extent_fiemap(inode, offset, len);
- if (IS_ERR_OR_NULL(em))
+ if (IS_ERR(em))
return em;
/* if this isn't a hole return it */
@@ -6841,14 +6848,24 @@ static void assert_eb_page_uptodate(const struct extent_buffer *eb,
{
struct btrfs_fs_info *fs_info = eb->fs_info;
+ /*
+ * If we are using the commit root we could potentially clear a page
+ * Uptodate while we're using the extent buffer that we've previously
+ * looked up. We don't want to complain in this case, as the page was
+ * valid before, we just didn't write it out. Instead we want to catch
+ * the case where we didn't actually read the block properly, which
+ * would have !PageUptodate && !PageError, as we clear PageError before
+ * reading.
+ */
if (fs_info->sectorsize < PAGE_SIZE) {
- bool uptodate;
+ bool uptodate, error;
uptodate = btrfs_subpage_test_uptodate(fs_info, page,
eb->start, eb->len);
- WARN_ON(!uptodate);
+ error = btrfs_subpage_test_error(fs_info, page, eb->start, eb->len);
+ WARN_ON(!uptodate && !error);
} else {
- WARN_ON(!PageUptodate(page));
+ WARN_ON(!PageUptodate(page) && !PageError(page));
}
}
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 5a36add21305..6fee14ce2e6b 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -261,6 +261,7 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start;
em->mod_start = merge->mod_start;
em->generation = max(em->generation, merge->generation);
+ set_bit(EXTENT_FLAG_MERGED, &em->flags);
rb_erase_cached(&merge->rb_node, &tree->map);
RB_CLEAR_NODE(&merge->rb_node);
@@ -278,6 +279,7 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
RB_CLEAR_NODE(&merge->rb_node);
em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start;
em->generation = max(em->generation, merge->generation);
+ set_bit(EXTENT_FLAG_MERGED, &em->flags);
free_extent_map(merge);
}
}
@@ -490,6 +492,8 @@ struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
*/
void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
{
+ lockdep_assert_held_write(&tree->lock);
+
WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags));
rb_erase_cached(&em->rb_node, &tree->map);
if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags))
@@ -504,6 +508,8 @@ void replace_extent_mapping(struct extent_map_tree *tree,
struct extent_map *new,
int modified)
{
+ lockdep_assert_held_write(&tree->lock);
+
WARN_ON(test_bit(EXTENT_FLAG_PINNED, &cur->flags));
ASSERT(extent_map_in_tree(cur));
if (!test_bit(EXTENT_FLAG_LOGGING, &cur->flags))
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 8e217337dff9..d2fa32ffe304 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -25,6 +25,8 @@ enum {
EXTENT_FLAG_FILLING,
/* filesystem extent mapping type */
EXTENT_FLAG_FS_MAPPING,
+ /* This em is merged from two or more physically adjacent ems */
+ EXTENT_FLAG_MERGED,
};
struct extent_map {
@@ -40,6 +42,12 @@ struct extent_map {
u64 ram_bytes;
u64 block_start;
u64 block_len;
+
+ /*
+ * Generation of the extent map, for merged em it's the highest
+ * generation of all merged ems.
+ * For non-merged extents, it's from btrfs_file_extent_item::generation.
+ */
u64 generation;
unsigned long flags;
/* Used for chunk mappings, flag EXTENT_FLAG_FS_MAPPING must be set */
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 90c5c38836ab..c828f971a346 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -305,7 +305,7 @@ found:
read_extent_buffer(path->nodes[0], dst, (unsigned long)item,
ret * csum_size);
out:
- if (ret == -ENOENT)
+ if (ret == -ENOENT || ret == -EFBIG)
ret = 0;
return ret;
}
@@ -368,6 +368,7 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ struct btrfs_bio *bbio = NULL;
struct btrfs_path *path;
const u32 sectorsize = fs_info->sectorsize;
const u32 csum_size = fs_info->csum_size;
@@ -377,6 +378,7 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst
u8 *csum;
const unsigned int nblocks = orig_len >> fs_info->sectorsize_bits;
int count = 0;
+ blk_status_t ret = BLK_STS_OK;
if ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) ||
test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state))
@@ -400,7 +402,7 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst
return BLK_STS_RESOURCE;
if (!dst) {
- struct btrfs_bio *bbio = btrfs_bio(bio);
+ bbio = btrfs_bio(bio);
if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) {
bbio->csum = kmalloc_array(nblocks, csum_size, GFP_NOFS);
@@ -456,21 +458,27 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst
count = search_csum_tree(fs_info, path, cur_disk_bytenr,
search_len, csum_dst);
- if (count <= 0) {
- /*
- * Either we hit a critical error or we didn't find
- * the csum.
- * Either way, we put zero into the csums dst, and skip
- * to the next sector.
- */
+ if (count < 0) {
+ ret = errno_to_blk_status(count);
+ if (bbio)
+ btrfs_bio_free_csum(bbio);
+ break;
+ }
+
+ /*
+ * We didn't find a csum for this range. We need to make sure
+ * we complain loudly about this, because we are not NODATASUM.
+ *
+ * However for the DATA_RELOC inode we could potentially be
+ * relocating data extents for a NODATASUM inode, so the inode
+ * itself won't be marked with NODATASUM, but the extent we're
+ * copying is in fact NODATASUM. If we don't find a csum we
+ * assume this is the case.
+ */
+ if (count == 0) {
memset(csum_dst, 0, csum_size);
count = 1;
- /*
- * For data reloc inode, we need to mark the range
- * NODATASUM so that balance won't report false csum
- * error.
- */
if (BTRFS_I(inode)->root->root_key.objectid ==
BTRFS_DATA_RELOC_TREE_OBJECTID) {
u64 file_offset;
@@ -491,7 +499,7 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst
}
btrfs_free_path(path);
- return BLK_STS_OK;
+ return ret;
}
int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
@@ -612,32 +620,33 @@ fail:
return ret;
}
-/*
- * btrfs_csum_one_bio - Calculates checksums of the data contained inside a bio
+/**
+ * Calculate checksums of the data contained inside a bio
+ *
* @inode: Owner of the data inside the bio
* @bio: Contains the data to be checksummed
- * @file_start: offset in file this bio begins to describe
- * @contig: Boolean. If true/1 means all bio vecs in this bio are
- * contiguous and they begin at @file_start in the file. False/0
- * means this bio can contain potentially discontiguous bio vecs
- * so the logical offset of each should be calculated separately.
+ * @offset: If (u64)-1, @bio may contain discontiguous bio vecs, so the
+ * file offsets are determined from the page offsets in the bio.
+ * Otherwise, this is the starting file offset of the bio vecs in
+ * @bio, which must be contiguous.
+ * @one_ordered: If true, @bio only refers to one ordered extent.
*/
blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio,
- u64 file_start, int contig)
+ u64 offset, bool one_ordered)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
struct btrfs_ordered_sum *sums;
struct btrfs_ordered_extent *ordered = NULL;
+ const bool use_page_offsets = (offset == (u64)-1);
char *data;
struct bvec_iter iter;
struct bio_vec bvec;
int index;
- int nr_sectors;
+ unsigned int blockcount;
unsigned long total_bytes = 0;
unsigned long this_sum_bytes = 0;
int i;
- u64 offset;
unsigned nofs_flag;
nofs_flag = memalloc_nofs_save();
@@ -651,18 +660,13 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio,
sums->len = bio->bi_iter.bi_size;
INIT_LIST_HEAD(&sums->list);
- if (contig)
- offset = file_start;
- else
- offset = 0; /* shut up gcc */
-
sums->bytenr = bio->bi_iter.bi_sector << 9;
index = 0;
shash->tfm = fs_info->csum_shash;
bio_for_each_segment(bvec, bio, iter) {
- if (!contig)
+ if (use_page_offsets)
offset = page_offset(bvec.bv_page) + bvec.bv_offset;
if (!ordered) {
@@ -681,13 +685,14 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio,
}
}
- nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info,
+ blockcount = BTRFS_BYTES_TO_BLKS(fs_info,
bvec.bv_len + fs_info->sectorsize
- 1);
- for (i = 0; i < nr_sectors; i++) {
- if (offset >= ordered->file_offset + ordered->num_bytes ||
- offset < ordered->file_offset) {
+ for (i = 0; i < blockcount; i++) {
+ if (!one_ordered &&
+ !in_range(offset, ordered->file_offset,
+ ordered->num_bytes)) {
unsigned long bytes_left;
sums->len = this_sum_bytes;
@@ -1211,6 +1216,7 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
extent_start = key.offset;
extent_end = btrfs_file_extent_end(path);
em->ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
+ em->generation = btrfs_file_extent_generation(leaf, fi);
if (type == BTRFS_FILE_EXTENT_REG ||
type == BTRFS_FILE_EXTENT_PREALLOC) {
em->start = extent_start;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 11204dbbe053..9f455c96c974 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -50,11 +50,14 @@ struct inode_defrag {
/* root objectid */
u64 root;
- /* last offset we were able to defrag */
- u64 last_offset;
-
- /* if we've wrapped around back to zero once already */
- int cycled;
+ /*
+ * The extent size threshold for autodefrag.
+ *
+ * This value is different for compressed/non-compressed extents,
+ * thus needs to be passed from higher layer.
+ * (aka, inode_should_defrag())
+ */
+ u32 extent_thresh;
};
static int __compare_inode_defrag(struct inode_defrag *defrag1,
@@ -107,8 +110,8 @@ static int __btrfs_add_inode_defrag(struct btrfs_inode *inode,
*/
if (defrag->transid < entry->transid)
entry->transid = defrag->transid;
- if (defrag->last_offset > entry->last_offset)
- entry->last_offset = defrag->last_offset;
+ entry->extent_thresh = min(defrag->extent_thresh,
+ entry->extent_thresh);
return -EEXIST;
}
}
@@ -134,7 +137,7 @@ static inline int __need_auto_defrag(struct btrfs_fs_info *fs_info)
* enabled
*/
int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
- struct btrfs_inode *inode)
+ struct btrfs_inode *inode, u32 extent_thresh)
{
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -160,6 +163,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
defrag->ino = btrfs_ino(inode);
defrag->transid = transid;
defrag->root = root->root_key.objectid;
+ defrag->extent_thresh = extent_thresh;
spin_lock(&fs_info->defrag_inodes_lock);
if (!test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) {
@@ -179,34 +183,6 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
}
/*
- * Requeue the defrag object. If there is a defrag object that points to
- * the same inode in the tree, we will merge them together (by
- * __btrfs_add_inode_defrag()) and free the one that we want to requeue.
- */
-static void btrfs_requeue_inode_defrag(struct btrfs_inode *inode,
- struct inode_defrag *defrag)
-{
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
- int ret;
-
- if (!__need_auto_defrag(fs_info))
- goto out;
-
- /*
- * Here we don't check the IN_DEFRAG flag, because we need merge
- * them together.
- */
- spin_lock(&fs_info->defrag_inodes_lock);
- ret = __btrfs_add_inode_defrag(inode, defrag);
- spin_unlock(&fs_info->defrag_inodes_lock);
- if (ret)
- goto out;
- return;
-out:
- kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
-}
-
-/*
* pick the defragable inode that we want, if it doesn't exist, we will get
* the next one.
*/
@@ -278,8 +254,14 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
struct btrfs_root *inode_root;
struct inode *inode;
struct btrfs_ioctl_defrag_range_args range;
- int num_defrag;
- int ret;
+ int ret = 0;
+ u64 cur = 0;
+
+again:
+ if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state))
+ goto cleanup;
+ if (!__need_auto_defrag(fs_info))
+ goto cleanup;
/* get the inode */
inode_root = btrfs_get_fs_root(fs_info, defrag->root, true);
@@ -295,39 +277,30 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
goto cleanup;
}
+ if (cur >= i_size_read(inode)) {
+ iput(inode);
+ goto cleanup;
+ }
+
/* do a chunk of defrag */
clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
memset(&range, 0, sizeof(range));
range.len = (u64)-1;
- range.start = defrag->last_offset;
+ range.start = cur;
+ range.extent_thresh = defrag->extent_thresh;
sb_start_write(fs_info->sb);
- num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
+ ret = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
BTRFS_DEFRAG_BATCH);
sb_end_write(fs_info->sb);
- /*
- * if we filled the whole defrag batch, there
- * must be more work to do. Queue this defrag
- * again
- */
- if (num_defrag == BTRFS_DEFRAG_BATCH) {
- defrag->last_offset = range.start;
- btrfs_requeue_inode_defrag(BTRFS_I(inode), defrag);
- } else if (defrag->last_offset && !defrag->cycled) {
- /*
- * we didn't fill our defrag batch, but
- * we didn't start at zero. Make sure we loop
- * around to the start of the file.
- */
- defrag->last_offset = 0;
- defrag->cycled = 1;
- btrfs_requeue_inode_defrag(BTRFS_I(inode), defrag);
- } else {
- kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
- }
-
iput(inode);
- return 0;
+
+ if (ret < 0)
+ goto cleanup;
+
+ cur = max(cur + fs_info->sectorsize, range.start);
+ goto again;
+
cleanup:
kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
return ret;
@@ -718,7 +691,6 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans,
int modify_tree = -1;
int update_refs;
int found = 0;
- int leafs_visited = 0;
struct btrfs_path *path = args->path;
args->bytes_found = 0;
@@ -756,7 +728,6 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans,
path->slots[0]--;
}
ret = 0;
- leafs_visited++;
next_slot:
leaf = path->nodes[0];
if (path->slots[0] >= btrfs_header_nritems(leaf)) {
@@ -768,7 +739,6 @@ next_slot:
ret = 0;
break;
}
- leafs_visited++;
leaf = path->nodes[0];
recow = 1;
}
@@ -1014,7 +984,7 @@ delete_extent_item:
* which case it unlocked our path, so check path->locks[0] matches a
* write lock.
*/
- if (!ret && args->replace_extent && leafs_visited == 1 &&
+ if (!ret && args->replace_extent &&
path->locks[0] == BTRFS_WRITE_LOCK &&
btrfs_leaf_free_space(leaf) >=
sizeof(struct btrfs_item) + args->extent_item_size) {
@@ -1749,7 +1719,8 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
fs_info->sectorsize);
WARN_ON(reserve_bytes == 0);
ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
- reserve_bytes);
+ reserve_bytes,
+ reserve_bytes);
if (ret) {
if (!only_release_metadata)
btrfs_free_reserved_data_space(BTRFS_I(inode),
@@ -2066,12 +2037,43 @@ out:
return err < 0 ? err : written;
}
-static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
- struct iov_iter *from)
+static ssize_t btrfs_encoded_write(struct kiocb *iocb, struct iov_iter *from,
+ const struct btrfs_ioctl_encoded_io_args *encoded)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file_inode(file);
+ loff_t count;
+ ssize_t ret;
+
+ btrfs_inode_lock(inode, 0);
+ count = encoded->len;
+ ret = generic_write_checks_count(iocb, &count);
+ if (ret == 0 && count != encoded->len) {
+ /*
+ * The write got truncated by generic_write_checks_count(). We
+ * can't do a partial encoded write.
+ */
+ ret = -EFBIG;
+ }
+ if (ret || encoded->len == 0)
+ goto out;
+
+ ret = btrfs_write_check(iocb, from, encoded->len);
+ if (ret < 0)
+ goto out;
+
+ ret = btrfs_do_encoded_write(iocb, from, encoded);
+out:
+ btrfs_inode_unlock(inode, 0);
+ return ret;
+}
+
+ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from,
+ const struct btrfs_ioctl_encoded_io_args *encoded)
{
struct file *file = iocb->ki_filp;
struct btrfs_inode *inode = BTRFS_I(file_inode(file));
- ssize_t num_written = 0;
+ ssize_t num_written, num_sync;
const bool sync = iocb->ki_flags & IOCB_DSYNC;
/*
@@ -2082,22 +2084,28 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
if (BTRFS_FS_ERROR(inode->root->fs_info))
return -EROFS;
- if (!(iocb->ki_flags & IOCB_DIRECT) &&
- (iocb->ki_flags & IOCB_NOWAIT))
+ if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
return -EOPNOTSUPP;
if (sync)
atomic_inc(&inode->sync_writers);
- if (iocb->ki_flags & IOCB_DIRECT)
- num_written = btrfs_direct_write(iocb, from);
- else
- num_written = btrfs_buffered_write(iocb, from);
+ if (encoded) {
+ num_written = btrfs_encoded_write(iocb, from, encoded);
+ num_sync = encoded->len;
+ } else if (iocb->ki_flags & IOCB_DIRECT) {
+ num_written = num_sync = btrfs_direct_write(iocb, from);
+ } else {
+ num_written = num_sync = btrfs_buffered_write(iocb, from);
+ }
btrfs_set_inode_last_sub_trans(inode);
- if (num_written > 0)
- num_written = generic_write_sync(iocb, num_written);
+ if (num_sync > 0) {
+ num_sync = generic_write_sync(iocb, num_sync);
+ if (num_sync < 0)
+ num_written = num_sync;
+ }
if (sync)
atomic_dec(&inode->sync_writers);
@@ -2106,6 +2114,11 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
return num_written;
}
+static ssize_t btrfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+ return btrfs_do_write_iter(iocb, from, NULL);
+}
+
int btrfs_release_file(struct inode *inode, struct file *filp)
{
struct btrfs_file_private *private = filp->private_data;
@@ -2501,7 +2514,7 @@ out:
hole_em = alloc_extent_map();
if (!hole_em) {
btrfs_drop_extent_cache(inode, offset, end - 1, 0);
- set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
+ btrfs_set_inode_full_sync(inode);
} else {
hole_em->start = offset;
hole_em->len = end - offset;
@@ -2522,8 +2535,7 @@ out:
} while (ret == -EEXIST);
free_extent_map(hole_em);
if (ret)
- set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
- &inode->runtime_flags);
+ btrfs_set_inode_full_sync(inode);
}
return 0;
@@ -2877,7 +2889,7 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
* maps for the replacement extents (or holes).
*/
if (extent_info && !extent_info->is_new_extent)
- set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
+ btrfs_set_inode_full_sync(inode);
if (ret)
goto out_trans;
diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
index 655aad0f9e1c..0ae54d8c10d6 100644
--- a/fs/btrfs/free-space-tree.c
+++ b/fs/btrfs/free-space-tree.c
@@ -25,6 +25,8 @@ static struct btrfs_root *btrfs_free_space_root(
.offset = 0,
};
+ if (btrfs_fs_incompat(block_group->fs_info, EXTENT_TREE_V2))
+ key.offset = block_group->global_root_id;
return btrfs_global_root(block_group->fs_info, &key);
}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 3b2403b6127f..aa0a60ee26cb 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -66,6 +66,11 @@ struct btrfs_dio_data {
struct extent_changeset *data_reserved;
};
+struct btrfs_rename_ctx {
+ /* Output field. Stores the index number of the old directory entry. */
+ u64 index;
+};
+
static const struct inode_operations btrfs_dir_inode_operations;
static const struct inode_operations btrfs_symlink_inode_operations;
static const struct inode_operations btrfs_special_inode_operations;
@@ -234,12 +239,14 @@ static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
* no overlapping inline items exist in the btree
*/
static int insert_inline_extent(struct btrfs_trans_handle *trans,
- struct btrfs_path *path, bool extent_inserted,
- struct btrfs_root *root, struct inode *inode,
- u64 start, size_t size, size_t compressed_size,
+ struct btrfs_path *path,
+ struct btrfs_inode *inode, bool extent_inserted,
+ size_t size, size_t compressed_size,
int compress_type,
- struct page **compressed_pages)
+ struct page **compressed_pages,
+ bool update_i_size)
{
+ struct btrfs_root *root = inode->root;
struct extent_buffer *leaf;
struct page *page = NULL;
char *kaddr;
@@ -247,7 +254,7 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
struct btrfs_file_extent_item *ei;
int ret;
size_t cur_size = size;
- unsigned long offset;
+ u64 i_size;
ASSERT((compressed_size > 0 && compressed_pages) ||
(compressed_size == 0 && !compressed_pages));
@@ -259,8 +266,8 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
struct btrfs_key key;
size_t datasize;
- key.objectid = btrfs_ino(BTRFS_I(inode));
- key.offset = start;
+ key.objectid = btrfs_ino(inode);
+ key.offset = 0;
key.type = BTRFS_EXTENT_DATA_KEY;
datasize = btrfs_file_extent_calc_inline_size(cur_size);
@@ -298,12 +305,10 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
btrfs_set_file_extent_compression(leaf, ei,
compress_type);
} else {
- page = find_get_page(inode->i_mapping,
- start >> PAGE_SHIFT);
+ page = find_get_page(inode->vfs_inode.i_mapping, 0);
btrfs_set_file_extent_compression(leaf, ei, 0);
kaddr = kmap_atomic(page);
- offset = offset_in_page(start);
- write_extent_buffer(leaf, kaddr + offset, ptr, size);
+ write_extent_buffer(leaf, kaddr, ptr, size);
kunmap_atomic(kaddr);
put_page(page);
}
@@ -314,21 +319,25 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
* We align size to sectorsize for inline extents just for simplicity
* sake.
*/
- size = ALIGN(size, root->fs_info->sectorsize);
- ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start, size);
+ ret = btrfs_inode_set_file_extent_range(inode, 0,
+ ALIGN(size, root->fs_info->sectorsize));
if (ret)
goto fail;
/*
- * we're an inline extent, so nobody can
- * extend the file past i_size without locking
- * a page we already have locked.
+ * We're an inline extent, so nobody can extend the file past i_size
+ * without locking a page we already have locked.
*
- * We must do any isize and inode updates
- * before we unlock the pages. Otherwise we
- * could end up racing with unlink.
+ * We must do any i_size and inode updates before we unlock the pages.
+ * Otherwise we could end up racing with unlink.
*/
- BTRFS_I(inode)->disk_i_size = inode->i_size;
+ i_size = i_size_read(&inode->vfs_inode);
+ if (update_i_size && size > i_size) {
+ i_size_write(&inode->vfs_inode, size);
+ i_size = size;
+ }
+ inode->disk_i_size = i_size;
+
fail:
return ret;
}
@@ -339,35 +348,31 @@ fail:
* does the checks required to make sure the data is small enough
* to fit as an inline extent.
*/
-static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 start,
- u64 end, size_t compressed_size,
+static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 size,
+ size_t compressed_size,
int compress_type,
- struct page **compressed_pages)
+ struct page **compressed_pages,
+ bool update_i_size)
{
struct btrfs_drop_extents_args drop_args = { 0 };
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_trans_handle *trans;
- u64 isize = i_size_read(&inode->vfs_inode);
- u64 actual_end = min(end + 1, isize);
- u64 inline_len = actual_end - start;
- u64 aligned_end = ALIGN(end, fs_info->sectorsize);
- u64 data_len = inline_len;
+ u64 data_len = (compressed_size ?: size);
int ret;
struct btrfs_path *path;
- if (compressed_size)
- data_len = compressed_size;
-
- if (start > 0 ||
- actual_end > fs_info->sectorsize ||
+ /*
+ * We can create an inline extent if it ends at or beyond the current
+ * i_size, is no larger than a sector (decompressed), and the (possibly
+ * compressed) data fits in a leaf and the configured maximum inline
+ * size.
+ */
+ if (size < i_size_read(&inode->vfs_inode) ||
+ size > fs_info->sectorsize ||
data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info) ||
- (!compressed_size &&
- (actual_end & (fs_info->sectorsize - 1)) == 0) ||
- end + 1 < isize ||
- data_len > fs_info->max_inline) {
+ data_len > fs_info->max_inline)
return 1;
- }
path = btrfs_alloc_path();
if (!path)
@@ -381,30 +386,20 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 start,
trans->block_rsv = &inode->block_rsv;
drop_args.path = path;
- drop_args.start = start;
- drop_args.end = aligned_end;
+ drop_args.start = 0;
+ drop_args.end = fs_info->sectorsize;
drop_args.drop_cache = true;
drop_args.replace_extent = true;
-
- if (compressed_size && compressed_pages)
- drop_args.extent_item_size = btrfs_file_extent_calc_inline_size(
- compressed_size);
- else
- drop_args.extent_item_size = btrfs_file_extent_calc_inline_size(
- inline_len);
-
+ drop_args.extent_item_size = btrfs_file_extent_calc_inline_size(data_len);
ret = btrfs_drop_extents(trans, root, inode, &drop_args);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
}
- if (isize > actual_end)
- inline_len = min_t(u64, isize, actual_end);
- ret = insert_inline_extent(trans, path, drop_args.extent_inserted,
- root, &inode->vfs_inode, start,
- inline_len, compressed_size,
- compress_type, compressed_pages);
+ ret = insert_inline_extent(trans, path, inode, drop_args.extent_inserted,
+ size, compressed_size, compress_type,
+ compressed_pages, update_i_size);
if (ret && ret != -ENOSPC) {
btrfs_abort_transaction(trans, ret);
goto out;
@@ -413,7 +408,7 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 start,
goto out;
}
- btrfs_update_inode_bytes(inode, inline_len, drop_args.bytes_found);
+ btrfs_update_inode_bytes(inode, size, drop_args.bytes_found);
ret = btrfs_update_inode(trans, root, inode);
if (ret && ret != -ENOSPC) {
btrfs_abort_transaction(trans, ret);
@@ -423,7 +418,7 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 start,
goto out;
}
- set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
+ btrfs_set_inode_full_sync(inode);
out:
/*
* Don't forget to free the reserved space, as for inlined extent
@@ -560,12 +555,12 @@ static inline int inode_need_compress(struct btrfs_inode *inode, u64 start,
}
static inline void inode_should_defrag(struct btrfs_inode *inode,
- u64 start, u64 end, u64 num_bytes, u64 small_write)
+ u64 start, u64 end, u64 num_bytes, u32 small_write)
{
/* If this is a small write inside eof, kick off a defrag */
if (num_bytes < small_write &&
(start > 0 || end + 1 < inode->disk_i_size))
- btrfs_add_inode_defrag(NULL, inode);
+ btrfs_add_inode_defrag(NULL, inode, small_write);
}
/*
@@ -624,7 +619,6 @@ static noinline int compress_file_range(struct async_chunk *async_chunk)
again:
will_compress = 0;
nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
- BUILD_BUG_ON((BTRFS_MAX_COMPRESSED % PAGE_SIZE) != 0);
nr_pages = min_t(unsigned long, nr_pages,
BTRFS_MAX_COMPRESSED / PAGE_SIZE);
@@ -735,14 +729,15 @@ cont:
/* we didn't compress the entire range, try
* to make an uncompressed inline extent.
*/
- ret = cow_file_range_inline(BTRFS_I(inode), start, end,
+ ret = cow_file_range_inline(BTRFS_I(inode), actual_end,
0, BTRFS_COMPRESS_NONE,
- NULL);
+ NULL, false);
} else {
/* try making a compressed inline extent */
- ret = cow_file_range_inline(BTRFS_I(inode), start, end,
+ ret = cow_file_range_inline(BTRFS_I(inode), actual_end,
total_compressed,
- compress_type, pages);
+ compress_type, pages,
+ false);
}
if (ret <= 0) {
unsigned long clear_flags = EXTENT_DELALLOC |
@@ -981,11 +976,14 @@ static int submit_one_async_extent(struct btrfs_inode *inode,
}
free_extent_map(em);
- ret = btrfs_add_ordered_extent_compress(inode, start, /* file_offset */
- ins.objectid, /* disk_bytenr */
- async_extent->ram_size, /* num_bytes */
- ins.offset, /* disk_num_bytes */
- async_extent->compress_type);
+ ret = btrfs_add_ordered_extent(inode, start, /* file_offset */
+ async_extent->ram_size, /* num_bytes */
+ async_extent->ram_size, /* ram_bytes */
+ ins.objectid, /* disk_bytenr */
+ ins.offset, /* disk_num_bytes */
+ 0, /* offset */
+ 1 << BTRFS_ORDERED_COMPRESSED,
+ async_extent->compress_type);
if (ret) {
btrfs_drop_extent_cache(inode, start, end, 0);
goto out_free_reserve;
@@ -1003,7 +1001,7 @@ static int submit_one_async_extent(struct btrfs_inode *inode,
async_extent->pages, /* compressed_pages */
async_extent->nr_pages,
async_chunk->write_flags,
- async_chunk->blkcg_css)) {
+ async_chunk->blkcg_css, true)) {
const u64 start = async_extent->start;
const u64 end = start + async_extent->ram_size - 1;
@@ -1152,9 +1150,12 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
* So here we skip inline extent creation completely.
*/
if (start == 0 && fs_info->sectorsize == PAGE_SIZE) {
+ u64 actual_end = min_t(u64, i_size_read(&inode->vfs_inode),
+ end + 1);
+
/* lets try to make an inline extent */
- ret = cow_file_range_inline(inode, start, end, 0,
- BTRFS_COMPRESS_NONE, NULL);
+ ret = cow_file_range_inline(inode, actual_end, 0,
+ BTRFS_COMPRESS_NONE, NULL, false);
if (ret == 0) {
/*
* We use DO_ACCOUNTING here because we need the
@@ -1234,9 +1235,10 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
}
free_extent_map(em);
- ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
- ram_size, cur_alloc_size,
- BTRFS_ORDERED_REGULAR);
+ ret = btrfs_add_ordered_extent(inode, start, ram_size, ram_size,
+ ins.objectid, cur_alloc_size, 0,
+ 1 << BTRFS_ORDERED_REGULAR,
+ BTRFS_COMPRESS_NONE);
if (ret)
goto out_drop_extent_cache;
@@ -1895,10 +1897,11 @@ out_check:
goto error;
}
free_extent_map(em);
- ret = btrfs_add_ordered_extent(inode, cur_offset,
- disk_bytenr, num_bytes,
- num_bytes,
- BTRFS_ORDERED_PREALLOC);
+ ret = btrfs_add_ordered_extent(inode,
+ cur_offset, num_bytes, num_bytes,
+ disk_bytenr, num_bytes, 0,
+ 1 << BTRFS_ORDERED_PREALLOC,
+ BTRFS_COMPRESS_NONE);
if (ret) {
btrfs_drop_extent_cache(inode, cur_offset,
cur_offset + num_bytes - 1,
@@ -1907,9 +1910,11 @@ out_check:
}
} else {
ret = btrfs_add_ordered_extent(inode, cur_offset,
+ num_bytes, num_bytes,
disk_bytenr, num_bytes,
- num_bytes,
- BTRFS_ORDERED_NOCOW);
+ 0,
+ 1 << BTRFS_ORDERED_NOCOW,
+ BTRFS_COMPRESS_NONE);
if (ret)
goto error;
}
@@ -2310,7 +2315,7 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode,
static blk_status_t btrfs_submit_bio_start(struct inode *inode, struct bio *bio,
u64 dio_file_offset)
{
- return btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0);
+ return btrfs_csum_one_bio(BTRFS_I(inode), bio, (u64)-1, false);
}
/*
@@ -2538,10 +2543,15 @@ blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
goto out;
if (bio_flags & EXTENT_BIO_COMPRESSED) {
+ /*
+ * btrfs_submit_compressed_read will handle completing
+ * the bio if there were any errors, so just return
+ * here.
+ */
ret = btrfs_submit_compressed_read(inode, bio,
mirror_num,
bio_flags);
- goto out;
+ goto out_no_endio;
} else {
/*
* Lookup bio sums does extra checks around whether we
@@ -2562,7 +2572,7 @@ blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
0, btrfs_submit_bio_start);
goto out;
} else if (!skip_sum) {
- ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0);
+ ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, (u64)-1, false);
if (ret)
goto out;
}
@@ -2575,6 +2585,7 @@ out:
bio->bi_status = ret;
bio_endio(bio);
}
+out_no_endio:
return ret;
}
@@ -2870,6 +2881,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_key ins;
u64 disk_num_bytes = btrfs_stack_file_extent_disk_num_bytes(stack_fi);
u64 disk_bytenr = btrfs_stack_file_extent_disk_bytenr(stack_fi);
+ u64 offset = btrfs_stack_file_extent_offset(stack_fi);
u64 num_bytes = btrfs_stack_file_extent_num_bytes(stack_fi);
u64 ram_bytes = btrfs_stack_file_extent_ram_bytes(stack_fi);
struct btrfs_drop_extents_args drop_args = { 0 };
@@ -2944,7 +2956,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
goto out;
ret = btrfs_alloc_reserved_file_extent(trans, root, btrfs_ino(inode),
- file_pos, qgroup_reserved, &ins);
+ file_pos - offset,
+ qgroup_reserved, &ins);
out:
btrfs_free_path(path);
@@ -2970,20 +2983,20 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_ordered_extent *oe)
{
struct btrfs_file_extent_item stack_fi;
- u64 logical_len;
bool update_inode_bytes;
+ u64 num_bytes = oe->num_bytes;
+ u64 ram_bytes = oe->ram_bytes;
memset(&stack_fi, 0, sizeof(stack_fi));
btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_REG);
btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, oe->disk_bytenr);
btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi,
oe->disk_num_bytes);
+ btrfs_set_stack_file_extent_offset(&stack_fi, oe->offset);
if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags))
- logical_len = oe->truncated_len;
- else
- logical_len = oe->num_bytes;
- btrfs_set_stack_file_extent_num_bytes(&stack_fi, logical_len);
- btrfs_set_stack_file_extent_ram_bytes(&stack_fi, logical_len);
+ num_bytes = ram_bytes = oe->truncated_len;
+ btrfs_set_stack_file_extent_num_bytes(&stack_fi, num_bytes);
+ btrfs_set_stack_file_extent_ram_bytes(&stack_fi, ram_bytes);
btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type);
/* Encryption and other encoding is reserved and all 0 */
@@ -2994,6 +3007,7 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans,
* except if the ordered extent was truncated.
*/
update_inode_bytes = test_bit(BTRFS_ORDERED_DIRECT, &oe->flags) ||
+ test_bit(BTRFS_ORDERED_ENCODED, &oe->flags) ||
test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags);
return insert_reserved_file_extent(trans, BTRFS_I(oe->inode),
@@ -3028,7 +3042,8 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
!test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) &&
- !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags))
+ !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags) &&
+ !test_bit(BTRFS_ORDERED_ENCODED, &ordered_extent->flags))
clear_bits |= EXTENT_DELALLOC_NEW;
freespace_inode = btrfs_is_free_space_inode(inode);
@@ -4062,7 +4077,8 @@ int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
struct btrfs_inode *dir,
struct btrfs_inode *inode,
- const char *name, int name_len)
+ const char *name, int name_len,
+ struct btrfs_rename_ctx *rename_ctx)
{
struct btrfs_root *root = dir->root;
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -4118,15 +4134,27 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
goto err;
}
skip_backref:
+ if (rename_ctx)
+ rename_ctx->index = index;
+
ret = btrfs_delete_delayed_dir_index(trans, dir, index);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto err;
}
- btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode,
- dir_ino);
- btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir, index);
+ /*
+ * If we are in a rename context, we don't need to update anything in the
+ * log. That will be done later during the rename by btrfs_log_new_name().
+ * Besides that, doing it here would only cause extra unncessary btree
+ * operations on the log tree, increasing latency for applications.
+ */
+ if (!rename_ctx) {
+ btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode,
+ dir_ino);
+ btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir,
+ index);
+ }
/*
* If we have a pending delayed iput we could end up with the final iput
@@ -4158,7 +4186,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
const char *name, int name_len)
{
int ret;
- ret = __btrfs_unlink_inode(trans, dir, inode, name, name_len);
+ ret = __btrfs_unlink_inode(trans, dir, inode, name, name_len, NULL);
if (!ret) {
drop_nlink(&inode->vfs_inode);
ret = btrfs_update_inode(trans, inode->root, inode);
@@ -4565,14 +4593,21 @@ out_up_write:
static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
{
struct inode *inode = d_inode(dentry);
+ struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
int err = 0;
struct btrfs_trans_handle *trans;
u64 last_unlink_trans;
if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
return -ENOTEMPTY;
- if (btrfs_ino(BTRFS_I(inode)) == BTRFS_FIRST_FREE_OBJECTID)
+ if (btrfs_ino(BTRFS_I(inode)) == BTRFS_FIRST_FREE_OBJECTID) {
+ if (unlikely(btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))) {
+ btrfs_err(fs_info,
+ "extent tree v2 doesn't support snapshot deletion yet");
+ return -EOPNOTSUPP;
+ }
return btrfs_delete_subvolume(dir, dentry);
+ }
trans = __unlink_start_trans(dir);
if (IS_ERR(trans))
@@ -4611,7 +4646,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
}
out:
btrfs_end_transaction(trans);
- btrfs_btree_balance_dirty(BTRFS_I(dir)->root->fs_info);
+ btrfs_btree_balance_dirty(fs_info);
return err;
}
@@ -4664,7 +4699,7 @@ int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len,
goto out;
}
}
- ret = btrfs_delalloc_reserve_metadata(inode, blocksize);
+ ret = btrfs_delalloc_reserve_metadata(inode, blocksize, blocksize);
if (ret < 0) {
if (!only_release_metadata)
btrfs_free_reserved_data_space(inode, data_reserved,
@@ -4876,8 +4911,7 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
cur_offset + hole_size - 1, 0);
hole_em = alloc_extent_map();
if (!hole_em) {
- set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
- &inode->runtime_flags);
+ btrfs_set_inode_full_sync(inode);
goto next;
}
hole_em->start = cur_offset;
@@ -5046,16 +5080,17 @@ static int btrfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentr
}
/*
- * While truncating the inode pages during eviction, we get the VFS calling
- * btrfs_invalidatepage() against each page of the inode. This is slow because
- * the calls to btrfs_invalidatepage() result in a huge amount of calls to
- * lock_extent_bits() and clear_extent_bit(), which keep merging and splitting
- * extent_state structures over and over, wasting lots of time.
+ * While truncating the inode pages during eviction, we get the VFS
+ * calling btrfs_invalidate_folio() against each folio of the inode. This
+ * is slow because the calls to btrfs_invalidate_folio() result in a
+ * huge amount of calls to lock_extent_bits() and clear_extent_bit(),
+ * which keep merging and splitting extent_state structures over and over,
+ * wasting lots of time.
*
- * Therefore if the inode is being evicted, let btrfs_invalidatepage() skip all
- * those expensive operations on a per page basis and do only the ordered io
- * finishing, while we release here the extent_map and extent_state structures,
- * without the excessive merging and splitting.
+ * Therefore if the inode is being evicted, let btrfs_invalidate_folio()
+ * skip all those expensive operations on a per folio basis and do only
+ * the ordered io finishing, while we release here the extent_map and
+ * extent_state structures, without the excessive merging and splitting.
*/
static void evict_inode_truncate_pages(struct inode *inode)
{
@@ -5121,7 +5156,7 @@ static void evict_inode_truncate_pages(struct inode *inode)
* If still has DELALLOC flag, the extent didn't reach disk,
* and its reserved space won't be freed by delayed_ref.
* So we need to free its reserved space here.
- * (Refer to comment in btrfs_invalidatepage, case 2)
+ * (Refer to comment in btrfs_invalidate_folio, case 2)
*
* Note, end is the bytenr of last byte, so we need + 1 here.
*/
@@ -5584,21 +5619,17 @@ static struct inode *new_simple_dir(struct super_block *s,
return inode;
}
+static_assert(BTRFS_FT_UNKNOWN == FT_UNKNOWN);
+static_assert(BTRFS_FT_REG_FILE == FT_REG_FILE);
+static_assert(BTRFS_FT_DIR == FT_DIR);
+static_assert(BTRFS_FT_CHRDEV == FT_CHRDEV);
+static_assert(BTRFS_FT_BLKDEV == FT_BLKDEV);
+static_assert(BTRFS_FT_FIFO == FT_FIFO);
+static_assert(BTRFS_FT_SOCK == FT_SOCK);
+static_assert(BTRFS_FT_SYMLINK == FT_SYMLINK);
+
static inline u8 btrfs_inode_type(struct inode *inode)
{
- /*
- * Compile-time asserts that generic FT_* types still match
- * BTRFS_FT_* types
- */
- BUILD_BUG_ON(BTRFS_FT_UNKNOWN != FT_UNKNOWN);
- BUILD_BUG_ON(BTRFS_FT_REG_FILE != FT_REG_FILE);
- BUILD_BUG_ON(BTRFS_FT_DIR != FT_DIR);
- BUILD_BUG_ON(BTRFS_FT_CHRDEV != FT_CHRDEV);
- BUILD_BUG_ON(BTRFS_FT_BLKDEV != FT_BLKDEV);
- BUILD_BUG_ON(BTRFS_FT_FIFO != FT_FIFO);
- BUILD_BUG_ON(BTRFS_FT_SOCK != FT_SOCK);
- BUILD_BUG_ON(BTRFS_FT_SYMLINK != FT_SYMLINK);
-
return fs_umode_to_ftype(inode->i_mode);
}
@@ -5971,14 +6002,8 @@ static int btrfs_set_inode_index_count(struct btrfs_inode *inode)
goto out;
ret = 0;
- /*
- * MAGIC NUMBER EXPLANATION:
- * since we search a directory based on f_pos we have to start at 2
- * since '.' and '..' have f_pos of 0 and 1 respectively, so everybody
- * else has to start at 2
- */
if (path->slots[0] == 0) {
- inode->index_cnt = 2;
+ inode->index_cnt = BTRFS_DIR_START_INDEX;
goto out;
}
@@ -5989,7 +6014,7 @@ static int btrfs_set_inode_index_count(struct btrfs_inode *inode)
if (found_key.objectid != btrfs_ino(inode) ||
found_key.type != BTRFS_DIR_INDEX_KEY) {
- inode->index_cnt = 2;
+ inode->index_cnt = BTRFS_DIR_START_INDEX;
goto out;
}
@@ -6140,7 +6165,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
* sync since it will be a full sync anyway and this will blow away the
* old info in the log.
*/
- set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
+ btrfs_set_inode_full_sync(BTRFS_I(inode));
key[0].objectid = objectid;
key[0].type = BTRFS_INODE_ITEM_KEY;
@@ -6537,7 +6562,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
goto fail;
}
d_instantiate(dentry, inode);
- btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent);
+ btrfs_log_new_name(trans, old_dentry, NULL, 0, parent);
}
fail:
@@ -7040,8 +7065,11 @@ static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode,
if (IS_ERR(em))
goto out;
}
- ret = btrfs_add_ordered_extent_dio(inode, start, block_start, len,
- block_len, type);
+ ret = btrfs_add_ordered_extent(inode, start, len, len, block_start,
+ block_len, 0,
+ (1 << type) |
+ (1 << BTRFS_ORDERED_DIRECT),
+ BTRFS_COMPRESS_NONE);
if (ret) {
if (em) {
free_extent_map(em);
@@ -7441,7 +7469,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
struct extent_map *em2;
/* We can NOCOW, so only need to reserve metadata space. */
- ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len);
+ ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len);
if (ret < 0) {
/* Our caller expects us to free the input extent map. */
free_extent_map(em);
@@ -7600,6 +7628,34 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
}
len = min(len, em->len - (start - em->start));
+
+ /*
+ * If we have a NOWAIT request and the range contains multiple extents
+ * (or a mix of extents and holes), then we return -EAGAIN to make the
+ * caller fallback to a context where it can do a blocking (without
+ * NOWAIT) request. This way we avoid doing partial IO and returning
+ * success to the caller, which is not optimal for writes and for reads
+ * it can result in unexpected behaviour for an application.
+ *
+ * When doing a read, because we use IOMAP_DIO_PARTIAL when calling
+ * iomap_dio_rw(), we can end up returning less data then what the caller
+ * asked for, resulting in an unexpected, and incorrect, short read.
+ * That is, the caller asked to read N bytes and we return less than that,
+ * which is wrong unless we are crossing EOF. This happens if we get a
+ * page fault error when trying to fault in pages for the buffer that is
+ * associated to the struct iov_iter passed to iomap_dio_rw(), and we
+ * have previously submitted bios for other extents in the range, in
+ * which case iomap_dio_rw() may return us EIOCBQUEUED if not all of
+ * those bios have completed by the time we get the page fault error,
+ * which we return back to our caller - we should only return EIOCBQUEUED
+ * after we have submitted bios for all the extents in the range.
+ */
+ if ((flags & IOMAP_NOWAIT) && len < length) {
+ free_extent_map(em);
+ ret = -EAGAIN;
+ goto unlock_err;
+ }
+
if (write) {
ret = btrfs_get_blocks_direct_write(&em, inode, dio_data,
start, len);
@@ -7803,7 +7859,7 @@ static blk_status_t btrfs_submit_bio_start_direct_io(struct inode *inode,
struct bio *bio,
u64 dio_file_offset)
{
- return btrfs_csum_one_bio(BTRFS_I(inode), bio, dio_file_offset, 1);
+ return btrfs_csum_one_bio(BTRFS_I(inode), bio, dio_file_offset, false);
}
static void btrfs_end_dio_bio(struct bio *bio)
@@ -7860,7 +7916,7 @@ static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio,
* If we aren't doing async submit, calculate the csum of the
* bio now.
*/
- ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, file_offset, 1);
+ ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, file_offset, false);
if (ret)
goto err;
} else {
@@ -8076,8 +8132,13 @@ int btrfs_readpage(struct file *file, struct page *page)
btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
ret = btrfs_do_readpage(page, NULL, &bio_ctrl, 0, NULL);
- if (bio_ctrl.bio)
- ret = submit_one_bio(bio_ctrl.bio, 0, bio_ctrl.bio_flags);
+ if (bio_ctrl.bio) {
+ int ret2;
+
+ ret2 = submit_one_bio(bio_ctrl.bio, 0, bio_ctrl.bio_flags);
+ if (ret == 0)
+ ret = ret2;
+ }
return ret;
}
@@ -8118,8 +8179,8 @@ static void btrfs_readahead(struct readahead_control *rac)
}
/*
- * For releasepage() and invalidatepage() we have a race window where
- * end_page_writeback() is called but the subpage spinlock is not yet released.
+ * For releasepage() and invalidate_folio() we have a race window where
+ * folio_end_writeback() is called but the subpage spinlock is not yet released.
* If we continue to release/invalidate the page, we could cause use-after-free
* for subpage spinlock. So this function is to spin and wait for subpage
* spinlock.
@@ -8195,48 +8256,48 @@ static int btrfs_migratepage(struct address_space *mapping,
}
#endif
-static void btrfs_invalidatepage(struct page *page, unsigned int offset,
- unsigned int length)
+static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
+ size_t length)
{
- struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
+ struct btrfs_inode *inode = BTRFS_I(folio->mapping->host);
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct extent_io_tree *tree = &inode->io_tree;
struct extent_state *cached_state = NULL;
- u64 page_start = page_offset(page);
- u64 page_end = page_start + PAGE_SIZE - 1;
+ u64 page_start = folio_pos(folio);
+ u64 page_end = page_start + folio_size(folio) - 1;
u64 cur;
int inode_evicting = inode->vfs_inode.i_state & I_FREEING;
/*
- * We have page locked so no new ordered extent can be created on this
- * page, nor bio can be submitted for this page.
+ * We have folio locked so no new ordered extent can be created on this
+ * page, nor bio can be submitted for this folio.
*
- * But already submitted bio can still be finished on this page.
- * Furthermore, endio function won't skip page which has Ordered
+ * But already submitted bio can still be finished on this folio.
+ * Furthermore, endio function won't skip folio which has Ordered
* (Private2) already cleared, so it's possible for endio and
- * invalidatepage to do the same ordered extent accounting twice
- * on one page.
+ * invalidate_folio to do the same ordered extent accounting twice
+ * on one folio.
*
* So here we wait for any submitted bios to finish, so that we won't
- * do double ordered extent accounting on the same page.
+ * do double ordered extent accounting on the same folio.
*/
- wait_on_page_writeback(page);
- wait_subpage_spinlock(page);
+ folio_wait_writeback(folio);
+ wait_subpage_spinlock(&folio->page);
/*
* For subpage case, we have call sites like
* btrfs_punch_hole_lock_range() which passes range not aligned to
* sectorsize.
- * If the range doesn't cover the full page, we don't need to and
- * shouldn't clear page extent mapped, as page->private can still
+ * If the range doesn't cover the full folio, we don't need to and
+ * shouldn't clear page extent mapped, as folio->private can still
* record subpage dirty bits for other part of the range.
*
- * For cases that can invalidate the full even the range doesn't
- * cover the full page, like invalidating the last page, we're
+ * For cases that invalidate the full folio even the range doesn't
+ * cover the full folio, like invalidating the last folio, we're
* still safe to wait for ordered extent to finish.
*/
if (!(offset == 0 && length == PAGE_SIZE)) {
- btrfs_releasepage(page, GFP_NOFS);
+ btrfs_releasepage(&folio->page, GFP_NOFS);
return;
}
@@ -8277,7 +8338,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
page_end);
ASSERT(range_end + 1 - cur < U32_MAX);
range_len = range_end + 1 - cur;
- if (!btrfs_page_test_ordered(fs_info, page, cur, range_len)) {
+ if (!btrfs_page_test_ordered(fs_info, &folio->page, cur, range_len)) {
/*
* If Ordered (Private2) is cleared, it means endio has
* already been executed for the range.
@@ -8287,7 +8348,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
delete_states = false;
goto next;
}
- btrfs_page_clear_ordered(fs_info, page, cur, range_len);
+ btrfs_page_clear_ordered(fs_info, &folio->page, cur, range_len);
/*
* IO on this page will never be started, so we need to account
@@ -8357,11 +8418,11 @@ next:
* should not have Ordered (Private2) anymore, or the above iteration
* did something wrong.
*/
- ASSERT(!PageOrdered(page));
- btrfs_page_clear_checked(fs_info, page, page_offset(page), PAGE_SIZE);
+ ASSERT(!folio_test_ordered(folio));
+ btrfs_page_clear_checked(fs_info, &folio->page, folio_pos(folio), folio_size(folio));
if (!inode_evicting)
- __btrfs_releasepage(page, GFP_NOFS);
- clear_page_extent_mapped(page);
+ __btrfs_releasepage(&folio->page, GFP_NOFS);
+ clear_page_extent_mapped(&folio->page);
}
/*
@@ -8706,7 +8767,7 @@ out:
* extents beyond i_size to drop.
*/
if (control.extents_found > 0)
- set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
+ btrfs_set_inode_full_sync(BTRFS_I(inode));
return ret;
}
@@ -8759,7 +8820,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
struct btrfs_inode *ei;
struct inode *inode;
- ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_KERNEL);
+ ei = alloc_inode_sb(sb, btrfs_inode_cachep, GFP_KERNEL);
if (!ei)
return NULL;
@@ -9002,14 +9063,14 @@ static int btrfs_rename_exchange(struct inode *old_dir,
struct inode *new_inode = new_dentry->d_inode;
struct inode *old_inode = old_dentry->d_inode;
struct timespec64 ctime = current_time(old_inode);
+ struct btrfs_rename_ctx old_rename_ctx;
+ struct btrfs_rename_ctx new_rename_ctx;
u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
u64 new_ino = btrfs_ino(BTRFS_I(new_inode));
u64 old_idx = 0;
u64 new_idx = 0;
int ret;
int ret2;
- bool root_log_pinned = false;
- bool dest_log_pinned = false;
bool need_abort = false;
/*
@@ -9112,29 +9173,6 @@ static int btrfs_rename_exchange(struct inode *old_dir,
BTRFS_I(new_inode), 1);
}
- /*
- * Now pin the logs of the roots. We do it to ensure that no other task
- * can sync the logs while we are in progress with the rename, because
- * that could result in an inconsistency in case any of the inodes that
- * are part of this rename operation were logged before.
- *
- * We pin the logs even if at this precise moment none of the inodes was
- * logged before. This is because right after we checked for that, some
- * other task fsyncing some other inode not involved with this rename
- * operation could log that one of our inodes exists.
- *
- * We don't need to pin the logs before the above calls to
- * btrfs_insert_inode_ref(), since those don't ever need to change a log.
- */
- if (old_ino != BTRFS_FIRST_FREE_OBJECTID) {
- btrfs_pin_log_trans(root);
- root_log_pinned = true;
- }
- if (new_ino != BTRFS_FIRST_FREE_OBJECTID) {
- btrfs_pin_log_trans(dest);
- dest_log_pinned = true;
- }
-
/* src is a subvolume */
if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
ret = btrfs_unlink_subvol(trans, old_dir, old_dentry);
@@ -9142,7 +9180,8 @@ static int btrfs_rename_exchange(struct inode *old_dir,
ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir),
BTRFS_I(old_dentry->d_inode),
old_dentry->d_name.name,
- old_dentry->d_name.len);
+ old_dentry->d_name.len,
+ &old_rename_ctx);
if (!ret)
ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode));
}
@@ -9158,7 +9197,8 @@ static int btrfs_rename_exchange(struct inode *old_dir,
ret = __btrfs_unlink_inode(trans, BTRFS_I(new_dir),
BTRFS_I(new_dentry->d_inode),
new_dentry->d_name.name,
- new_dentry->d_name.len);
+ new_dentry->d_name.len,
+ &new_rename_ctx);
if (!ret)
ret = btrfs_update_inode(trans, dest, BTRFS_I(new_inode));
}
@@ -9188,46 +9228,31 @@ static int btrfs_rename_exchange(struct inode *old_dir,
if (new_inode->i_nlink == 1)
BTRFS_I(new_inode)->dir_index = new_idx;
- if (root_log_pinned) {
- btrfs_log_new_name(trans, BTRFS_I(old_inode), BTRFS_I(old_dir),
- new_dentry->d_parent);
- btrfs_end_log_trans(root);
- root_log_pinned = false;
- }
- if (dest_log_pinned) {
- btrfs_log_new_name(trans, BTRFS_I(new_inode), BTRFS_I(new_dir),
- old_dentry->d_parent);
- btrfs_end_log_trans(dest);
- dest_log_pinned = false;
- }
-out_fail:
/*
- * If we have pinned a log and an error happened, we unpin tasks
- * trying to sync the log and force them to fallback to a transaction
- * commit if the log currently contains any of the inodes involved in
- * this rename operation (to ensure we do not persist a log with an
- * inconsistent state for any of these inodes or leading to any
- * inconsistencies when replayed). If the transaction was aborted, the
- * abortion reason is propagated to userspace when attempting to commit
- * the transaction. If the log does not contain any of these inodes, we
- * allow the tasks to sync it.
+ * Now pin the logs of the roots. We do it to ensure that no other task
+ * can sync the logs while we are in progress with the rename, because
+ * that could result in an inconsistency in case any of the inodes that
+ * are part of this rename operation were logged before.
*/
- if (ret && (root_log_pinned || dest_log_pinned)) {
- if (btrfs_inode_in_log(BTRFS_I(old_dir), fs_info->generation) ||
- btrfs_inode_in_log(BTRFS_I(new_dir), fs_info->generation) ||
- btrfs_inode_in_log(BTRFS_I(old_inode), fs_info->generation) ||
- btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation))
- btrfs_set_log_full_commit(trans);
+ if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
+ btrfs_pin_log_trans(root);
+ if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
+ btrfs_pin_log_trans(dest);
- if (root_log_pinned) {
- btrfs_end_log_trans(root);
- root_log_pinned = false;
- }
- if (dest_log_pinned) {
- btrfs_end_log_trans(dest);
- dest_log_pinned = false;
- }
- }
+ /* Do the log updates for all inodes. */
+ if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
+ btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir),
+ old_rename_ctx.index, new_dentry->d_parent);
+ if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
+ btrfs_log_new_name(trans, new_dentry, BTRFS_I(new_dir),
+ new_rename_ctx.index, old_dentry->d_parent);
+
+ /* Now unpin the logs. */
+ if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
+ btrfs_end_log_trans(root);
+ if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
+ btrfs_end_log_trans(dest);
+out_fail:
ret2 = btrfs_end_transaction(trans);
ret = ret ? ret : ret2;
out_notrans:
@@ -9302,11 +9327,11 @@ static int btrfs_rename(struct user_namespace *mnt_userns,
struct btrfs_root *dest = BTRFS_I(new_dir)->root;
struct inode *new_inode = d_inode(new_dentry);
struct inode *old_inode = d_inode(old_dentry);
+ struct btrfs_rename_ctx rename_ctx;
u64 index = 0;
int ret;
int ret2;
u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
- bool log_pinned = false;
if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
return -EPERM;
@@ -9411,29 +9436,11 @@ static int btrfs_rename(struct user_namespace *mnt_userns,
if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
ret = btrfs_unlink_subvol(trans, old_dir, old_dentry);
} else {
- /*
- * Now pin the log. We do it to ensure that no other task can
- * sync the log while we are in progress with the rename, as
- * that could result in an inconsistency in case any of the
- * inodes that are part of this rename operation were logged
- * before.
- *
- * We pin the log even if at this precise moment none of the
- * inodes was logged before. This is because right after we
- * checked for that, some other task fsyncing some other inode
- * not involved with this rename operation could log that one of
- * our inodes exists.
- *
- * We don't need to pin the logs before the above call to
- * btrfs_insert_inode_ref(), since that does not need to change
- * a log.
- */
- btrfs_pin_log_trans(root);
- log_pinned = true;
ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir),
BTRFS_I(d_inode(old_dentry)),
old_dentry->d_name.name,
- old_dentry->d_name.len);
+ old_dentry->d_name.len,
+ &rename_ctx);
if (!ret)
ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode));
}
@@ -9475,12 +9482,9 @@ static int btrfs_rename(struct user_namespace *mnt_userns,
if (old_inode->i_nlink == 1)
BTRFS_I(old_inode)->dir_index = index;
- if (log_pinned) {
- btrfs_log_new_name(trans, BTRFS_I(old_inode), BTRFS_I(old_dir),
- new_dentry->d_parent);
- btrfs_end_log_trans(root);
- log_pinned = false;
- }
+ if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
+ btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir),
+ rename_ctx.index, new_dentry->d_parent);
if (flags & RENAME_WHITEOUT) {
ret = btrfs_whiteout_for_rename(trans, root, mnt_userns,
@@ -9492,28 +9496,6 @@ static int btrfs_rename(struct user_namespace *mnt_userns,
}
}
out_fail:
- /*
- * If we have pinned the log and an error happened, we unpin tasks
- * trying to sync the log and force them to fallback to a transaction
- * commit if the log currently contains any of the inodes involved in
- * this rename operation (to ensure we do not persist a log with an
- * inconsistent state for any of these inodes or leading to any
- * inconsistencies when replayed). If the transaction was aborted, the
- * abortion reason is propagated to userspace when attempting to commit
- * the transaction. If the log does not contain any of these inodes, we
- * allow the tasks to sync it.
- */
- if (ret && log_pinned) {
- if (btrfs_inode_in_log(BTRFS_I(old_dir), fs_info->generation) ||
- btrfs_inode_in_log(BTRFS_I(new_dir), fs_info->generation) ||
- btrfs_inode_in_log(BTRFS_I(old_inode), fs_info->generation) ||
- (new_inode &&
- btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation)))
- btrfs_set_log_full_commit(trans);
-
- btrfs_end_log_trans(root);
- log_pinned = false;
- }
ret2 = btrfs_end_transaction(trans);
ret = ret ? ret : ret2;
out_notrans:
@@ -9993,8 +9975,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
em = alloc_extent_map();
if (!em) {
- set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
- &BTRFS_I(inode)->runtime_flags);
+ btrfs_set_inode_full_sync(BTRFS_I(inode));
goto next;
}
@@ -10076,11 +10057,6 @@ int btrfs_prealloc_file_range_trans(struct inode *inode,
min_size, actual_len, alloc_hint, trans);
}
-static int btrfs_set_page_dirty(struct page *page)
-{
- return __set_page_dirty_nobuffers(page);
-}
-
static int btrfs_permission(struct user_namespace *mnt_userns,
struct inode *inode, int mask)
{
@@ -10182,6 +10158,747 @@ void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end)
}
}
+static int btrfs_encoded_io_compression_from_extent(
+ struct btrfs_fs_info *fs_info,
+ int compress_type)
+{
+ switch (compress_type) {
+ case BTRFS_COMPRESS_NONE:
+ return BTRFS_ENCODED_IO_COMPRESSION_NONE;
+ case BTRFS_COMPRESS_ZLIB:
+ return BTRFS_ENCODED_IO_COMPRESSION_ZLIB;
+ case BTRFS_COMPRESS_LZO:
+ /*
+ * The LZO format depends on the sector size. 64K is the maximum
+ * sector size that we support.
+ */
+ if (fs_info->sectorsize < SZ_4K || fs_info->sectorsize > SZ_64K)
+ return -EINVAL;
+ return BTRFS_ENCODED_IO_COMPRESSION_LZO_4K +
+ (fs_info->sectorsize_bits - 12);
+ case BTRFS_COMPRESS_ZSTD:
+ return BTRFS_ENCODED_IO_COMPRESSION_ZSTD;
+ default:
+ return -EUCLEAN;
+ }
+}
+
+static ssize_t btrfs_encoded_read_inline(
+ struct kiocb *iocb,
+ struct iov_iter *iter, u64 start,
+ u64 lockend,
+ struct extent_state **cached_state,
+ u64 extent_start, size_t count,
+ struct btrfs_ioctl_encoded_io_args *encoded,
+ bool *unlocked)
+{
+ struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct extent_io_tree *io_tree = &inode->io_tree;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ struct btrfs_file_extent_item *item;
+ u64 ram_bytes;
+ unsigned long ptr;
+ void *tmp;
+ ssize_t ret;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode),
+ extent_start, 0);
+ if (ret) {
+ if (ret > 0) {
+ /* The extent item disappeared? */
+ ret = -EIO;
+ }
+ goto out;
+ }
+ leaf = path->nodes[0];
+ item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
+
+ ram_bytes = btrfs_file_extent_ram_bytes(leaf, item);
+ ptr = btrfs_file_extent_inline_start(item);
+
+ encoded->len = min_t(u64, extent_start + ram_bytes,
+ inode->vfs_inode.i_size) - iocb->ki_pos;
+ ret = btrfs_encoded_io_compression_from_extent(fs_info,
+ btrfs_file_extent_compression(leaf, item));
+ if (ret < 0)
+ goto out;
+ encoded->compression = ret;
+ if (encoded->compression) {
+ size_t inline_size;
+
+ inline_size = btrfs_file_extent_inline_item_len(leaf,
+ path->slots[0]);
+ if (inline_size > count) {
+ ret = -ENOBUFS;
+ goto out;
+ }
+ count = inline_size;
+ encoded->unencoded_len = ram_bytes;
+ encoded->unencoded_offset = iocb->ki_pos - extent_start;
+ } else {
+ count = min_t(u64, count, encoded->len);
+ encoded->len = count;
+ encoded->unencoded_len = count;
+ ptr += iocb->ki_pos - extent_start;
+ }
+
+ tmp = kmalloc(count, GFP_NOFS);
+ if (!tmp) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ read_extent_buffer(leaf, tmp, ptr, count);
+ btrfs_release_path(path);
+ unlock_extent_cached(io_tree, start, lockend, cached_state);
+ btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED);
+ *unlocked = true;
+
+ ret = copy_to_iter(tmp, count, iter);
+ if (ret != count)
+ ret = -EFAULT;
+ kfree(tmp);
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+struct btrfs_encoded_read_private {
+ struct btrfs_inode *inode;
+ u64 file_offset;
+ wait_queue_head_t wait;
+ atomic_t pending;
+ blk_status_t status;
+ bool skip_csum;
+};
+
+static blk_status_t submit_encoded_read_bio(struct btrfs_inode *inode,
+ struct bio *bio, int mirror_num)
+{
+ struct btrfs_encoded_read_private *priv = bio->bi_private;
+ struct btrfs_bio *bbio = btrfs_bio(bio);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ blk_status_t ret;
+
+ if (!priv->skip_csum) {
+ ret = btrfs_lookup_bio_sums(&inode->vfs_inode, bio, NULL);
+ if (ret)
+ return ret;
+ }
+
+ ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
+ if (ret) {
+ btrfs_bio_free_csum(bbio);
+ return ret;
+ }
+
+ atomic_inc(&priv->pending);
+ ret = btrfs_map_bio(fs_info, bio, mirror_num);
+ if (ret) {
+ atomic_dec(&priv->pending);
+ btrfs_bio_free_csum(bbio);
+ }
+ return ret;
+}
+
+static blk_status_t btrfs_encoded_read_verify_csum(struct btrfs_bio *bbio)
+{
+ const bool uptodate = (bbio->bio.bi_status == BLK_STS_OK);
+ struct btrfs_encoded_read_private *priv = bbio->bio.bi_private;
+ struct btrfs_inode *inode = priv->inode;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ u32 sectorsize = fs_info->sectorsize;
+ struct bio_vec *bvec;
+ struct bvec_iter_all iter_all;
+ u64 start = priv->file_offset;
+ u32 bio_offset = 0;
+
+ if (priv->skip_csum || !uptodate)
+ return bbio->bio.bi_status;
+
+ bio_for_each_segment_all(bvec, &bbio->bio, iter_all) {
+ unsigned int i, nr_sectors, pgoff;
+
+ nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec->bv_len);
+ pgoff = bvec->bv_offset;
+ for (i = 0; i < nr_sectors; i++) {
+ ASSERT(pgoff < PAGE_SIZE);
+ if (check_data_csum(&inode->vfs_inode, bbio, bio_offset,
+ bvec->bv_page, pgoff, start))
+ return BLK_STS_IOERR;
+ start += sectorsize;
+ bio_offset += sectorsize;
+ pgoff += sectorsize;
+ }
+ }
+ return BLK_STS_OK;
+}
+
+static void btrfs_encoded_read_endio(struct bio *bio)
+{
+ struct btrfs_encoded_read_private *priv = bio->bi_private;
+ struct btrfs_bio *bbio = btrfs_bio(bio);
+ blk_status_t status;
+
+ status = btrfs_encoded_read_verify_csum(bbio);
+ if (status) {
+ /*
+ * The memory barrier implied by the atomic_dec_return() here
+ * pairs with the memory barrier implied by the
+ * atomic_dec_return() or io_wait_event() in
+ * btrfs_encoded_read_regular_fill_pages() to ensure that this
+ * write is observed before the load of status in
+ * btrfs_encoded_read_regular_fill_pages().
+ */
+ WRITE_ONCE(priv->status, status);
+ }
+ if (!atomic_dec_return(&priv->pending))
+ wake_up(&priv->wait);
+ btrfs_bio_free_csum(bbio);
+ bio_put(bio);
+}
+
+static int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
+ u64 file_offset,
+ u64 disk_bytenr,
+ u64 disk_io_size,
+ struct page **pages)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct btrfs_encoded_read_private priv = {
+ .inode = inode,
+ .file_offset = file_offset,
+ .pending = ATOMIC_INIT(1),
+ .skip_csum = (inode->flags & BTRFS_INODE_NODATASUM),
+ };
+ unsigned long i = 0;
+ u64 cur = 0;
+ int ret;
+
+ init_waitqueue_head(&priv.wait);
+ /*
+ * Submit bios for the extent, splitting due to bio or stripe limits as
+ * necessary.
+ */
+ while (cur < disk_io_size) {
+ struct extent_map *em;
+ struct btrfs_io_geometry geom;
+ struct bio *bio = NULL;
+ u64 remaining;
+
+ em = btrfs_get_chunk_map(fs_info, disk_bytenr + cur,
+ disk_io_size - cur);
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
+ } else {
+ ret = btrfs_get_io_geometry(fs_info, em, BTRFS_MAP_READ,
+ disk_bytenr + cur, &geom);
+ free_extent_map(em);
+ }
+ if (ret) {
+ WRITE_ONCE(priv.status, errno_to_blk_status(ret));
+ break;
+ }
+ remaining = min(geom.len, disk_io_size - cur);
+ while (bio || remaining) {
+ size_t bytes = min_t(u64, remaining, PAGE_SIZE);
+
+ if (!bio) {
+ bio = btrfs_bio_alloc(BIO_MAX_VECS);
+ bio->bi_iter.bi_sector =
+ (disk_bytenr + cur) >> SECTOR_SHIFT;
+ bio->bi_end_io = btrfs_encoded_read_endio;
+ bio->bi_private = &priv;
+ bio->bi_opf = REQ_OP_READ;
+ }
+
+ if (!bytes ||
+ bio_add_page(bio, pages[i], bytes, 0) < bytes) {
+ blk_status_t status;
+
+ status = submit_encoded_read_bio(inode, bio, 0);
+ if (status) {
+ WRITE_ONCE(priv.status, status);
+ bio_put(bio);
+ goto out;
+ }
+ bio = NULL;
+ continue;
+ }
+
+ i++;
+ cur += bytes;
+ remaining -= bytes;
+ }
+ }
+
+out:
+ if (atomic_dec_return(&priv.pending))
+ io_wait_event(priv.wait, !atomic_read(&priv.pending));
+ /* See btrfs_encoded_read_endio() for ordering. */
+ return blk_status_to_errno(READ_ONCE(priv.status));
+}
+
+static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb,
+ struct iov_iter *iter,
+ u64 start, u64 lockend,
+ struct extent_state **cached_state,
+ u64 disk_bytenr, u64 disk_io_size,
+ size_t count, bool compressed,
+ bool *unlocked)
+{
+ struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
+ struct extent_io_tree *io_tree = &inode->io_tree;
+ struct page **pages;
+ unsigned long nr_pages, i;
+ u64 cur;
+ size_t page_offset;
+ ssize_t ret;
+
+ nr_pages = DIV_ROUND_UP(disk_io_size, PAGE_SIZE);
+ pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
+ if (!pages)
+ return -ENOMEM;
+ for (i = 0; i < nr_pages; i++) {
+ pages[i] = alloc_page(GFP_NOFS);
+ if (!pages[i]) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+
+ ret = btrfs_encoded_read_regular_fill_pages(inode, start, disk_bytenr,
+ disk_io_size, pages);
+ if (ret)
+ goto out;
+
+ unlock_extent_cached(io_tree, start, lockend, cached_state);
+ btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED);
+ *unlocked = true;
+
+ if (compressed) {
+ i = 0;
+ page_offset = 0;
+ } else {
+ i = (iocb->ki_pos - start) >> PAGE_SHIFT;
+ page_offset = (iocb->ki_pos - start) & (PAGE_SIZE - 1);
+ }
+ cur = 0;
+ while (cur < count) {
+ size_t bytes = min_t(size_t, count - cur,
+ PAGE_SIZE - page_offset);
+
+ if (copy_page_to_iter(pages[i], page_offset, bytes,
+ iter) != bytes) {
+ ret = -EFAULT;
+ goto out;
+ }
+ i++;
+ cur += bytes;
+ page_offset = 0;
+ }
+ ret = count;
+out:
+ for (i = 0; i < nr_pages; i++) {
+ if (pages[i])
+ __free_page(pages[i]);
+ }
+ kfree(pages);
+ return ret;
+}
+
+ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
+ struct btrfs_ioctl_encoded_io_args *encoded)
+{
+ struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct extent_io_tree *io_tree = &inode->io_tree;
+ ssize_t ret;
+ size_t count = iov_iter_count(iter);
+ u64 start, lockend, disk_bytenr, disk_io_size;
+ struct extent_state *cached_state = NULL;
+ struct extent_map *em;
+ bool unlocked = false;
+
+ file_accessed(iocb->ki_filp);
+
+ btrfs_inode_lock(&inode->vfs_inode, BTRFS_ILOCK_SHARED);
+
+ if (iocb->ki_pos >= inode->vfs_inode.i_size) {
+ btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED);
+ return 0;
+ }
+ start = ALIGN_DOWN(iocb->ki_pos, fs_info->sectorsize);
+ /*
+ * We don't know how long the extent containing iocb->ki_pos is, but if
+ * it's compressed we know that it won't be longer than this.
+ */
+ lockend = start + BTRFS_MAX_UNCOMPRESSED - 1;
+
+ for (;;) {
+ struct btrfs_ordered_extent *ordered;
+
+ ret = btrfs_wait_ordered_range(&inode->vfs_inode, start,
+ lockend - start + 1);
+ if (ret)
+ goto out_unlock_inode;
+ lock_extent_bits(io_tree, start, lockend, &cached_state);
+ ordered = btrfs_lookup_ordered_range(inode, start,
+ lockend - start + 1);
+ if (!ordered)
+ break;
+ btrfs_put_ordered_extent(ordered);
+ unlock_extent_cached(io_tree, start, lockend, &cached_state);
+ cond_resched();
+ }
+
+ em = btrfs_get_extent(inode, NULL, 0, start, lockend - start + 1);
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
+ goto out_unlock_extent;
+ }
+
+ if (em->block_start == EXTENT_MAP_INLINE) {
+ u64 extent_start = em->start;
+
+ /*
+ * For inline extents we get everything we need out of the
+ * extent item.
+ */
+ free_extent_map(em);
+ em = NULL;
+ ret = btrfs_encoded_read_inline(iocb, iter, start, lockend,
+ &cached_state, extent_start,
+ count, encoded, &unlocked);
+ goto out;
+ }
+
+ /*
+ * We only want to return up to EOF even if the extent extends beyond
+ * that.
+ */
+ encoded->len = min_t(u64, extent_map_end(em),
+ inode->vfs_inode.i_size) - iocb->ki_pos;
+ if (em->block_start == EXTENT_MAP_HOLE ||
+ test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
+ disk_bytenr = EXTENT_MAP_HOLE;
+ count = min_t(u64, count, encoded->len);
+ encoded->len = count;
+ encoded->unencoded_len = count;
+ } else if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
+ disk_bytenr = em->block_start;
+ /*
+ * Bail if the buffer isn't large enough to return the whole
+ * compressed extent.
+ */
+ if (em->block_len > count) {
+ ret = -ENOBUFS;
+ goto out_em;
+ }
+ disk_io_size = count = em->block_len;
+ encoded->unencoded_len = em->ram_bytes;
+ encoded->unencoded_offset = iocb->ki_pos - em->orig_start;
+ ret = btrfs_encoded_io_compression_from_extent(fs_info,
+ em->compress_type);
+ if (ret < 0)
+ goto out_em;
+ encoded->compression = ret;
+ } else {
+ disk_bytenr = em->block_start + (start - em->start);
+ if (encoded->len > count)
+ encoded->len = count;
+ /*
+ * Don't read beyond what we locked. This also limits the page
+ * allocations that we'll do.
+ */
+ disk_io_size = min(lockend + 1, iocb->ki_pos + encoded->len) - start;
+ count = start + disk_io_size - iocb->ki_pos;
+ encoded->len = count;
+ encoded->unencoded_len = count;
+ disk_io_size = ALIGN(disk_io_size, fs_info->sectorsize);
+ }
+ free_extent_map(em);
+ em = NULL;
+
+ if (disk_bytenr == EXTENT_MAP_HOLE) {
+ unlock_extent_cached(io_tree, start, lockend, &cached_state);
+ btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED);
+ unlocked = true;
+ ret = iov_iter_zero(count, iter);
+ if (ret != count)
+ ret = -EFAULT;
+ } else {
+ ret = btrfs_encoded_read_regular(iocb, iter, start, lockend,
+ &cached_state, disk_bytenr,
+ disk_io_size, count,
+ encoded->compression,
+ &unlocked);
+ }
+
+out:
+ if (ret >= 0)
+ iocb->ki_pos += encoded->len;
+out_em:
+ free_extent_map(em);
+out_unlock_extent:
+ if (!unlocked)
+ unlock_extent_cached(io_tree, start, lockend, &cached_state);
+out_unlock_inode:
+ if (!unlocked)
+ btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED);
+ return ret;
+}
+
+ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
+ const struct btrfs_ioctl_encoded_io_args *encoded)
+{
+ struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct extent_io_tree *io_tree = &inode->io_tree;
+ struct extent_changeset *data_reserved = NULL;
+ struct extent_state *cached_state = NULL;
+ int compression;
+ size_t orig_count;
+ u64 start, end;
+ u64 num_bytes, ram_bytes, disk_num_bytes;
+ unsigned long nr_pages, i;
+ struct page **pages;
+ struct btrfs_key ins;
+ bool extent_reserved = false;
+ struct extent_map *em;
+ ssize_t ret;
+
+ switch (encoded->compression) {
+ case BTRFS_ENCODED_IO_COMPRESSION_ZLIB:
+ compression = BTRFS_COMPRESS_ZLIB;
+ break;
+ case BTRFS_ENCODED_IO_COMPRESSION_ZSTD:
+ compression = BTRFS_COMPRESS_ZSTD;
+ break;
+ case BTRFS_ENCODED_IO_COMPRESSION_LZO_4K:
+ case BTRFS_ENCODED_IO_COMPRESSION_LZO_8K:
+ case BTRFS_ENCODED_IO_COMPRESSION_LZO_16K:
+ case BTRFS_ENCODED_IO_COMPRESSION_LZO_32K:
+ case BTRFS_ENCODED_IO_COMPRESSION_LZO_64K:
+ /* The sector size must match for LZO. */
+ if (encoded->compression -
+ BTRFS_ENCODED_IO_COMPRESSION_LZO_4K + 12 !=
+ fs_info->sectorsize_bits)
+ return -EINVAL;
+ compression = BTRFS_COMPRESS_LZO;
+ break;
+ default:
+ return -EINVAL;
+ }
+ if (encoded->encryption != BTRFS_ENCODED_IO_ENCRYPTION_NONE)
+ return -EINVAL;
+
+ orig_count = iov_iter_count(from);
+
+ /* The extent size must be sane. */
+ if (encoded->unencoded_len > BTRFS_MAX_UNCOMPRESSED ||
+ orig_count > BTRFS_MAX_COMPRESSED || orig_count == 0)
+ return -EINVAL;
+
+ /*
+ * The compressed data must be smaller than the decompressed data.
+ *
+ * It's of course possible for data to compress to larger or the same
+ * size, but the buffered I/O path falls back to no compression for such
+ * data, and we don't want to break any assumptions by creating these
+ * extents.
+ *
+ * Note that this is less strict than the current check we have that the
+ * compressed data must be at least one sector smaller than the
+ * decompressed data. We only want to enforce the weaker requirement
+ * from old kernels that it is at least one byte smaller.
+ */
+ if (orig_count >= encoded->unencoded_len)
+ return -EINVAL;
+
+ /* The extent must start on a sector boundary. */
+ start = iocb->ki_pos;
+ if (!IS_ALIGNED(start, fs_info->sectorsize))
+ return -EINVAL;
+
+ /*
+ * The extent must end on a sector boundary. However, we allow a write
+ * which ends at or extends i_size to have an unaligned length; we round
+ * up the extent size and set i_size to the unaligned end.
+ */
+ if (start + encoded->len < inode->vfs_inode.i_size &&
+ !IS_ALIGNED(start + encoded->len, fs_info->sectorsize))
+ return -EINVAL;
+
+ /* Finally, the offset in the unencoded data must be sector-aligned. */
+ if (!IS_ALIGNED(encoded->unencoded_offset, fs_info->sectorsize))
+ return -EINVAL;
+
+ num_bytes = ALIGN(encoded->len, fs_info->sectorsize);
+ ram_bytes = ALIGN(encoded->unencoded_len, fs_info->sectorsize);
+ end = start + num_bytes - 1;
+
+ /*
+ * If the extent cannot be inline, the compressed data on disk must be
+ * sector-aligned. For convenience, we extend it with zeroes if it
+ * isn't.
+ */
+ disk_num_bytes = ALIGN(orig_count, fs_info->sectorsize);
+ nr_pages = DIV_ROUND_UP(disk_num_bytes, PAGE_SIZE);
+ pages = kvcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL_ACCOUNT);
+ if (!pages)
+ return -ENOMEM;
+ for (i = 0; i < nr_pages; i++) {
+ size_t bytes = min_t(size_t, PAGE_SIZE, iov_iter_count(from));
+ char *kaddr;
+
+ pages[i] = alloc_page(GFP_KERNEL_ACCOUNT);
+ if (!pages[i]) {
+ ret = -ENOMEM;
+ goto out_pages;
+ }
+ kaddr = kmap(pages[i]);
+ if (copy_from_iter(kaddr, bytes, from) != bytes) {
+ kunmap(pages[i]);
+ ret = -EFAULT;
+ goto out_pages;
+ }
+ if (bytes < PAGE_SIZE)
+ memset(kaddr + bytes, 0, PAGE_SIZE - bytes);
+ kunmap(pages[i]);
+ }
+
+ for (;;) {
+ struct btrfs_ordered_extent *ordered;
+
+ ret = btrfs_wait_ordered_range(&inode->vfs_inode, start, num_bytes);
+ if (ret)
+ goto out_pages;
+ ret = invalidate_inode_pages2_range(inode->vfs_inode.i_mapping,
+ start >> PAGE_SHIFT,
+ end >> PAGE_SHIFT);
+ if (ret)
+ goto out_pages;
+ lock_extent_bits(io_tree, start, end, &cached_state);
+ ordered = btrfs_lookup_ordered_range(inode, start, num_bytes);
+ if (!ordered &&
+ !filemap_range_has_page(inode->vfs_inode.i_mapping, start, end))
+ break;
+ if (ordered)
+ btrfs_put_ordered_extent(ordered);
+ unlock_extent_cached(io_tree, start, end, &cached_state);
+ cond_resched();
+ }
+
+ /*
+ * We don't use the higher-level delalloc space functions because our
+ * num_bytes and disk_num_bytes are different.
+ */
+ ret = btrfs_alloc_data_chunk_ondemand(inode, disk_num_bytes);
+ if (ret)
+ goto out_unlock;
+ ret = btrfs_qgroup_reserve_data(inode, &data_reserved, start, num_bytes);
+ if (ret)
+ goto out_free_data_space;
+ ret = btrfs_delalloc_reserve_metadata(inode, num_bytes, disk_num_bytes);
+ if (ret)
+ goto out_qgroup_free_data;
+
+ /* Try an inline extent first. */
+ if (start == 0 && encoded->unencoded_len == encoded->len &&
+ encoded->unencoded_offset == 0) {
+ ret = cow_file_range_inline(inode, encoded->len, orig_count,
+ compression, pages, true);
+ if (ret <= 0) {
+ if (ret == 0)
+ ret = orig_count;
+ goto out_delalloc_release;
+ }
+ }
+
+ ret = btrfs_reserve_extent(root, disk_num_bytes, disk_num_bytes,
+ disk_num_bytes, 0, 0, &ins, 1, 1);
+ if (ret)
+ goto out_delalloc_release;
+ extent_reserved = true;
+
+ em = create_io_em(inode, start, num_bytes,
+ start - encoded->unencoded_offset, ins.objectid,
+ ins.offset, ins.offset, ram_bytes, compression,
+ BTRFS_ORDERED_COMPRESSED);
+ if (IS_ERR(em)) {
+ ret = PTR_ERR(em);
+ goto out_free_reserved;
+ }
+ free_extent_map(em);
+
+ ret = btrfs_add_ordered_extent(inode, start, num_bytes, ram_bytes,
+ ins.objectid, ins.offset,
+ encoded->unencoded_offset,
+ (1 << BTRFS_ORDERED_ENCODED) |
+ (1 << BTRFS_ORDERED_COMPRESSED),
+ compression);
+ if (ret) {
+ btrfs_drop_extent_cache(inode, start, end, 0);
+ goto out_free_reserved;
+ }
+ btrfs_dec_block_group_reservations(fs_info, ins.objectid);
+
+ if (start + encoded->len > inode->vfs_inode.i_size)
+ i_size_write(&inode->vfs_inode, start + encoded->len);
+
+ unlock_extent_cached(io_tree, start, end, &cached_state);
+
+ btrfs_delalloc_release_extents(inode, num_bytes);
+
+ if (btrfs_submit_compressed_write(inode, start, num_bytes, ins.objectid,
+ ins.offset, pages, nr_pages, 0, NULL,
+ false)) {
+ btrfs_writepage_endio_finish_ordered(inode, pages[0], start, end, 0);
+ ret = -EIO;
+ goto out_pages;
+ }
+ ret = orig_count;
+ goto out;
+
+out_free_reserved:
+ btrfs_dec_block_group_reservations(fs_info, ins.objectid);
+ btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
+out_delalloc_release:
+ btrfs_delalloc_release_extents(inode, num_bytes);
+ btrfs_delalloc_release_metadata(inode, disk_num_bytes, ret < 0);
+out_qgroup_free_data:
+ if (ret < 0)
+ btrfs_qgroup_free_data(inode, data_reserved, start, num_bytes);
+out_free_data_space:
+ /*
+ * If btrfs_reserve_extent() succeeded, then we already decremented
+ * bytes_may_use.
+ */
+ if (!extent_reserved)
+ btrfs_free_reserved_data_space_noquota(fs_info, disk_num_bytes);
+out_unlock:
+ unlock_extent_cached(io_tree, start, end, &cached_state);
+out_pages:
+ for (i = 0; i < nr_pages; i++) {
+ if (pages[i])
+ __free_page(pages[i]);
+ }
+ kvfree(pages);
+out:
+ if (ret >= 0)
+ iocb->ki_pos += encoded->len;
+ return ret;
+}
+
#ifdef CONFIG_SWAP
/*
* Add an entry indicating a block group or device which is pinned by a
@@ -10638,12 +11355,12 @@ static const struct address_space_operations btrfs_aops = {
.writepages = btrfs_writepages,
.readahead = btrfs_readahead,
.direct_IO = noop_direct_IO,
- .invalidatepage = btrfs_invalidatepage,
+ .invalidate_folio = btrfs_invalidate_folio,
.releasepage = btrfs_releasepage,
#ifdef CONFIG_MIGRATION
.migratepage = btrfs_migratepage,
#endif
- .set_page_dirty = btrfs_set_page_dirty,
+ .dirty_folio = filemap_dirty_folio,
.error_remove_page = generic_error_remove_page,
.swap_activate = btrfs_swap_activate,
.swap_deactivate = btrfs_swap_deactivate,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index d8af6620a941..238cee5b5254 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -28,6 +28,7 @@
#include <linux/iversion.h>
#include <linux/fileattr.h>
#include <linux/fsverity.h>
+#include <linux/sched/xacct.h>
#include "ctree.h"
#include "disk-io.h"
#include "export.h"
@@ -88,6 +89,24 @@ struct btrfs_ioctl_send_args_32 {
#define BTRFS_IOC_SEND_32 _IOW(BTRFS_IOCTL_MAGIC, 38, \
struct btrfs_ioctl_send_args_32)
+
+struct btrfs_ioctl_encoded_io_args_32 {
+ compat_uptr_t iov;
+ compat_ulong_t iovcnt;
+ __s64 offset;
+ __u64 flags;
+ __u64 len;
+ __u64 unencoded_len;
+ __u64 unencoded_offset;
+ __u32 compression;
+ __u32 encryption;
+ __u8 reserved[64];
+};
+
+#define BTRFS_IOC_ENCODED_READ_32 _IOR(BTRFS_IOCTL_MAGIC, 64, \
+ struct btrfs_ioctl_encoded_io_args_32)
+#define BTRFS_IOC_ENCODED_WRITE_32 _IOW(BTRFS_IOCTL_MAGIC, 64, \
+ struct btrfs_ioctl_encoded_io_args_32)
#endif
/* Mask out flags that are inappropriate for the given type of inode. */
@@ -440,10 +459,8 @@ void btrfs_exclop_balance(struct btrfs_fs_info *fs_info,
}
}
-static int btrfs_ioctl_getversion(struct file *file, int __user *arg)
+static int btrfs_ioctl_getversion(struct inode *inode, int __user *arg)
{
- struct inode *inode = file_inode(file);
-
return put_user(inode->i_generation, arg);
}
@@ -753,6 +770,13 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
struct btrfs_trans_handle *trans;
int ret;
+ /* We do not support snapshotting right now. */
+ if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
+ btrfs_warn(fs_info,
+ "extent tree v2 doesn't support snapshotting yet");
+ return -EOPNOTSUPP;
+ }
+
if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
return -EINVAL;
@@ -805,10 +829,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
goto fail;
}
- spin_lock(&fs_info->trans_lock);
- list_add(&pending_snapshot->list,
- &trans->transaction->pending_snapshots);
- spin_unlock(&fs_info->trans_lock);
+ trans->pending_snapshot = pending_snapshot;
ret = btrfs_commit_transaction(trans);
if (ret)
@@ -1015,8 +1036,155 @@ out:
return ret;
}
+/*
+ * Defrag specific helper to get an extent map.
+ *
+ * Differences between this and btrfs_get_extent() are:
+ *
+ * - No extent_map will be added to inode->extent_tree
+ * To reduce memory usage in the long run.
+ *
+ * - Extra optimization to skip file extents older than @newer_than
+ * By using btrfs_search_forward() we can skip entire file ranges that
+ * have extents created in past transactions, because btrfs_search_forward()
+ * will not visit leaves and nodes with a generation smaller than given
+ * minimal generation threshold (@newer_than).
+ *
+ * Return valid em if we find a file extent matching the requirement.
+ * Return NULL if we can not find a file extent matching the requirement.
+ *
+ * Return ERR_PTR() for error.
+ */
+static struct extent_map *defrag_get_extent(struct btrfs_inode *inode,
+ u64 start, u64 newer_than)
+{
+ struct btrfs_root *root = inode->root;
+ struct btrfs_file_extent_item *fi;
+ struct btrfs_path path = { 0 };
+ struct extent_map *em;
+ struct btrfs_key key;
+ u64 ino = btrfs_ino(inode);
+ int ret;
+
+ em = alloc_extent_map();
+ if (!em) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ key.objectid = ino;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = start;
+
+ if (newer_than) {
+ ret = btrfs_search_forward(root, &key, &path, newer_than);
+ if (ret < 0)
+ goto err;
+ /* Can't find anything newer */
+ if (ret > 0)
+ goto not_found;
+ } else {
+ ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0);
+ if (ret < 0)
+ goto err;
+ }
+ if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) {
+ /*
+ * If btrfs_search_slot() makes path to point beyond nritems,
+ * we should not have an empty leaf, as this inode must at
+ * least have its INODE_ITEM.
+ */
+ ASSERT(btrfs_header_nritems(path.nodes[0]));
+ path.slots[0] = btrfs_header_nritems(path.nodes[0]) - 1;
+ }
+ btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
+ /* Perfect match, no need to go one slot back */
+ if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY &&
+ key.offset == start)
+ goto iterate;
+
+ /* We didn't find a perfect match, needs to go one slot back */
+ if (path.slots[0] > 0) {
+ btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
+ if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
+ path.slots[0]--;
+ }
+
+iterate:
+ /* Iterate through the path to find a file extent covering @start */
+ while (true) {
+ u64 extent_end;
+
+ if (path.slots[0] >= btrfs_header_nritems(path.nodes[0]))
+ goto next;
+
+ btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
+
+ /*
+ * We may go one slot back to INODE_REF/XATTR item, then
+ * need to go forward until we reach an EXTENT_DATA.
+ * But we should still has the correct ino as key.objectid.
+ */
+ if (WARN_ON(key.objectid < ino) || key.type < BTRFS_EXTENT_DATA_KEY)
+ goto next;
+
+ /* It's beyond our target range, definitely not extent found */
+ if (key.objectid > ino || key.type > BTRFS_EXTENT_DATA_KEY)
+ goto not_found;
+
+ /*
+ * | |<- File extent ->|
+ * \- start
+ *
+ * This means there is a hole between start and key.offset.
+ */
+ if (key.offset > start) {
+ em->start = start;
+ em->orig_start = start;
+ em->block_start = EXTENT_MAP_HOLE;
+ em->len = key.offset - start;
+ break;
+ }
+
+ fi = btrfs_item_ptr(path.nodes[0], path.slots[0],
+ struct btrfs_file_extent_item);
+ extent_end = btrfs_file_extent_end(&path);
+
+ /*
+ * |<- file extent ->| |
+ * \- start
+ *
+ * We haven't reached start, search next slot.
+ */
+ if (extent_end <= start)
+ goto next;
+
+ /* Now this extent covers @start, convert it to em */
+ btrfs_extent_item_to_extent_map(inode, &path, fi, false, em);
+ break;
+next:
+ ret = btrfs_next_item(root, &path);
+ if (ret < 0)
+ goto err;
+ if (ret > 0)
+ goto not_found;
+ }
+ btrfs_release_path(&path);
+ return em;
+
+not_found:
+ btrfs_release_path(&path);
+ free_extent_map(em);
+ return NULL;
+
+err:
+ btrfs_release_path(&path);
+ free_extent_map(em);
+ return ERR_PTR(ret);
+}
+
static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
- bool locked)
+ u64 newer_than, bool locked)
{
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
@@ -1031,6 +1199,20 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
em = lookup_extent_mapping(em_tree, start, sectorsize);
read_unlock(&em_tree->lock);
+ /*
+ * We can get a merged extent, in that case, we need to re-search
+ * tree to get the original em for defrag.
+ *
+ * If @newer_than is 0 or em::generation < newer_than, we can trust
+ * this em, as either we don't care about the generation, or the
+ * merged extent map will be rejected anyway.
+ */
+ if (em && test_bit(EXTENT_FLAG_MERGED, &em->flags) &&
+ newer_than && em->generation >= newer_than) {
+ free_extent_map(em);
+ em = NULL;
+ }
+
if (!em) {
struct extent_state *cached = NULL;
u64 end = start + sectorsize - 1;
@@ -1038,7 +1220,7 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
/* get the big lock and read metadata off disk */
if (!locked)
lock_extent_bits(io_tree, start, end, &cached);
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, sectorsize);
+ em = defrag_get_extent(BTRFS_I(inode), start, newer_than);
if (!locked)
unlock_extent_cached(io_tree, start, end, &cached);
@@ -1049,23 +1231,42 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
return em;
}
+static u32 get_extent_max_capacity(const struct extent_map *em)
+{
+ if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
+ return BTRFS_MAX_COMPRESSED;
+ return BTRFS_MAX_EXTENT_SIZE;
+}
+
static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em,
bool locked)
{
struct extent_map *next;
- bool ret = true;
+ bool ret = false;
/* this is the last extent */
if (em->start + em->len >= i_size_read(inode))
return false;
- next = defrag_lookup_extent(inode, em->start + em->len, locked);
+ /*
+ * We want to check if the next extent can be merged with the current
+ * one, which can be an extent created in a past generation, so we pass
+ * a minimum generation of 0 to defrag_lookup_extent().
+ */
+ next = defrag_lookup_extent(inode, em->start + em->len, 0, locked);
+ /* No more em or hole */
if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)
- ret = false;
- else if ((em->block_start + em->block_len == next->block_start) &&
- (em->block_len > SZ_128K && next->block_len > SZ_128K))
- ret = false;
-
+ goto out;
+ if (test_bit(EXTENT_FLAG_PREALLOC, &next->flags))
+ goto out;
+ /*
+ * If the next extent is at its max capacity, defragging current extent
+ * makes no sense, as the total number of extents won't change.
+ */
+ if (next->len >= get_extent_max_capacity(em))
+ goto out;
+ ret = true;
+out:
free_extent_map(next);
return ret;
}
@@ -1189,8 +1390,10 @@ struct defrag_target_range {
static int defrag_collect_targets(struct btrfs_inode *inode,
u64 start, u64 len, u32 extent_thresh,
u64 newer_than, bool do_compress,
- bool locked, struct list_head *target_list)
+ bool locked, struct list_head *target_list,
+ u64 *last_scanned_ret)
{
+ bool last_is_target = false;
u64 cur = start;
int ret = 0;
@@ -1200,7 +1403,9 @@ static int defrag_collect_targets(struct btrfs_inode *inode,
bool next_mergeable = true;
u64 range_len;
- em = defrag_lookup_extent(&inode->vfs_inode, cur, locked);
+ last_is_target = false;
+ em = defrag_lookup_extent(&inode->vfs_inode, cur,
+ newer_than, locked);
if (!em)
break;
@@ -1213,6 +1418,10 @@ static int defrag_collect_targets(struct btrfs_inode *inode,
if (em->generation < newer_than)
goto next;
+ /* This em is under writeback, no need to defrag */
+ if (em->generation == (u64)-1)
+ goto next;
+
/*
* Our start offset might be in the middle of an existing extent
* map, so take that into account.
@@ -1253,6 +1462,13 @@ static int defrag_collect_targets(struct btrfs_inode *inode,
if (range_len >= extent_thresh)
goto next;
+ /*
+ * Skip extents already at its max capacity, this is mostly for
+ * compressed extents, which max cap is only 128K.
+ */
+ if (em->len >= get_extent_max_capacity(em))
+ goto next;
+
next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em,
locked);
if (!next_mergeable) {
@@ -1271,6 +1487,7 @@ static int defrag_collect_targets(struct btrfs_inode *inode,
}
add:
+ last_is_target = true;
range_len = min(extent_map_end(em), start + len) - cur;
/*
* This one is a good target, check if it can be merged into
@@ -1314,10 +1531,22 @@ next:
kfree(entry);
}
}
+ if (!ret && last_scanned_ret) {
+ /*
+ * If the last extent is not a target, the caller can skip to
+ * the end of that extent.
+ * Otherwise, we can only go the end of the specified range.
+ */
+ if (!last_is_target)
+ *last_scanned_ret = max(cur, *last_scanned_ret);
+ else
+ *last_scanned_ret = max(start + len, *last_scanned_ret);
+ }
return ret;
}
#define CLUSTER_SIZE (SZ_256K)
+static_assert(IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE));
/*
* Defrag one contiguous target range.
@@ -1372,7 +1601,8 @@ static int defrag_one_locked_target(struct btrfs_inode *inode,
}
static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len,
- u32 extent_thresh, u64 newer_than, bool do_compress)
+ u32 extent_thresh, u64 newer_than, bool do_compress,
+ u64 *last_scanned_ret)
{
struct extent_state *cached_state = NULL;
struct defrag_target_range *entry;
@@ -1418,7 +1648,7 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len,
*/
ret = defrag_collect_targets(inode, start, len, extent_thresh,
newer_than, do_compress, true,
- &target_list);
+ &target_list, last_scanned_ret);
if (ret < 0)
goto unlock_extent;
@@ -1453,7 +1683,8 @@ static int defrag_one_cluster(struct btrfs_inode *inode,
u64 start, u32 len, u32 extent_thresh,
u64 newer_than, bool do_compress,
unsigned long *sectors_defragged,
- unsigned long max_sectors)
+ unsigned long max_sectors,
+ u64 *last_scanned_ret)
{
const u32 sectorsize = inode->root->fs_info->sectorsize;
struct defrag_target_range *entry;
@@ -1461,10 +1692,9 @@ static int defrag_one_cluster(struct btrfs_inode *inode,
LIST_HEAD(target_list);
int ret;
- BUILD_BUG_ON(!IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE));
ret = defrag_collect_targets(inode, start, len, extent_thresh,
newer_than, do_compress, false,
- &target_list);
+ &target_list, NULL);
if (ret < 0)
goto out;
@@ -1481,6 +1711,15 @@ static int defrag_one_cluster(struct btrfs_inode *inode,
range_len = min_t(u32, range_len,
(max_sectors - *sectors_defragged) * sectorsize);
+ /*
+ * If defrag_one_range() has updated last_scanned_ret,
+ * our range may already be invalid (e.g. hole punched).
+ * Skip if our range is before last_scanned_ret, as there is
+ * no need to defrag the range anymore.
+ */
+ if (entry->start + range_len <= *last_scanned_ret)
+ continue;
+
if (ra)
page_cache_sync_readahead(inode->vfs_inode.i_mapping,
ra, NULL, entry->start >> PAGE_SHIFT,
@@ -1493,7 +1732,8 @@ static int defrag_one_cluster(struct btrfs_inode *inode,
* accounting.
*/
ret = defrag_one_range(inode, entry->start, range_len,
- extent_thresh, newer_than, do_compress);
+ extent_thresh, newer_than, do_compress,
+ last_scanned_ret);
if (ret < 0)
break;
*sectors_defragged += range_len >>
@@ -1504,6 +1744,8 @@ out:
list_del_init(&entry->list);
kfree(entry);
}
+ if (ret >= 0)
+ *last_scanned_ret = max(*last_scanned_ret, start + len);
return ret;
}
@@ -1589,11 +1831,9 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
while (cur < last_byte) {
const unsigned long prev_sectors_defragged = sectors_defragged;
+ u64 last_scanned = cur;
u64 cluster_end;
- /* The cluster size 256K should always be page aligned */
- BUILD_BUG_ON(!IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE));
-
if (btrfs_defrag_cancelled(fs_info)) {
ret = -EAGAIN;
break;
@@ -1618,8 +1858,8 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
BTRFS_I(inode)->defrag_compress = compress_type;
ret = defrag_one_cluster(BTRFS_I(inode), ra, cur,
cluster_end + 1 - cur, extent_thresh,
- newer_than, do_compress,
- &sectors_defragged, max_to_defrag);
+ newer_than, do_compress, &sectors_defragged,
+ max_to_defrag, &last_scanned);
if (sectors_defragged > prev_sectors_defragged)
balance_dirty_pages_ratelimited(inode->i_mapping);
@@ -1627,11 +1867,12 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
btrfs_inode_unlock(inode, 0);
if (ret < 0)
break;
- cur = cluster_end + 1;
+ cur = max(cluster_end + 1, last_scanned);
if (ret > 0) {
ret = 0;
break;
}
+ cond_resched();
}
if (ra_allocated)
@@ -2009,10 +2250,9 @@ free_args:
return ret;
}
-static noinline int btrfs_ioctl_subvol_getflags(struct file *file,
+static noinline int btrfs_ioctl_subvol_getflags(struct inode *inode,
void __user *arg)
{
- struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
int ret = 0;
@@ -2342,12 +2582,11 @@ err:
return ret;
}
-static noinline int btrfs_ioctl_tree_search(struct file *file,
- void __user *argp)
+static noinline int btrfs_ioctl_tree_search(struct inode *inode,
+ void __user *argp)
{
struct btrfs_ioctl_search_args __user *uargs;
struct btrfs_ioctl_search_key sk;
- struct inode *inode;
int ret;
size_t buf_size;
@@ -2361,7 +2600,6 @@ static noinline int btrfs_ioctl_tree_search(struct file *file,
buf_size = sizeof(uargs->buf);
- inode = file_inode(file);
ret = search_ioctl(inode, &sk, &buf_size, uargs->buf);
/*
@@ -2376,12 +2614,11 @@ static noinline int btrfs_ioctl_tree_search(struct file *file,
return ret;
}
-static noinline int btrfs_ioctl_tree_search_v2(struct file *file,
+static noinline int btrfs_ioctl_tree_search_v2(struct inode *inode,
void __user *argp)
{
struct btrfs_ioctl_search_args_v2 __user *uarg;
struct btrfs_ioctl_search_args_v2 args;
- struct inode *inode;
int ret;
size_t buf_size;
const size_t buf_limit = SZ_16M;
@@ -2400,7 +2637,6 @@ static noinline int btrfs_ioctl_tree_search_v2(struct file *file,
if (buf_size > buf_limit)
buf_size = buf_limit;
- inode = file_inode(file);
ret = search_ioctl(inode, &args.key, &buf_size,
(char __user *)(&uarg->buf[0]));
if (ret == 0 && copy_to_user(&uarg->key, &args.key, sizeof(args.key)))
@@ -2651,25 +2887,22 @@ out:
return ret;
}
-static noinline int btrfs_ioctl_ino_lookup(struct file *file,
+static noinline int btrfs_ioctl_ino_lookup(struct btrfs_root *root,
void __user *argp)
{
struct btrfs_ioctl_ino_lookup_args *args;
- struct inode *inode;
int ret = 0;
args = memdup_user(argp, sizeof(*args));
if (IS_ERR(args))
return PTR_ERR(args);
- inode = file_inode(file);
-
/*
* Unprivileged query to obtain the containing subvolume root id. The
* path is reset so it's consistent with btrfs_search_path_in_tree.
*/
if (args->treeid == 0)
- args->treeid = BTRFS_I(inode)->root->root_key.objectid;
+ args->treeid = root->root_key.objectid;
if (args->objectid == BTRFS_FIRST_FREE_OBJECTID) {
args->name[0] = 0;
@@ -2681,7 +2914,7 @@ static noinline int btrfs_ioctl_ino_lookup(struct file *file,
goto out;
}
- ret = btrfs_search_path_in_tree(BTRFS_I(inode)->root->fs_info,
+ ret = btrfs_search_path_in_tree(root->fs_info,
args->treeid, args->objectid,
args->name);
@@ -2737,7 +2970,7 @@ static int btrfs_ioctl_ino_lookup_user(struct file *file, void __user *argp)
}
/* Get the subvolume information in BTRFS_ROOT_ITEM and BTRFS_ROOT_BACKREF */
-static int btrfs_ioctl_get_subvol_info(struct file *file, void __user *argp)
+static int btrfs_ioctl_get_subvol_info(struct inode *inode, void __user *argp)
{
struct btrfs_ioctl_get_subvol_info_args *subvol_info;
struct btrfs_fs_info *fs_info;
@@ -2749,7 +2982,6 @@ static int btrfs_ioctl_get_subvol_info(struct file *file, void __user *argp)
struct extent_buffer *leaf;
unsigned long item_off;
unsigned long item_len;
- struct inode *inode;
int slot;
int ret = 0;
@@ -2763,7 +2995,6 @@ static int btrfs_ioctl_get_subvol_info(struct file *file, void __user *argp)
return -ENOMEM;
}
- inode = file_inode(file);
fs_info = BTRFS_I(inode)->root->fs_info;
/* Get root_item of inode's subvolume */
@@ -2857,15 +3088,14 @@ out_free:
* Return ROOT_REF information of the subvolume containing this inode
* except the subvolume name.
*/
-static int btrfs_ioctl_get_subvol_rootref(struct file *file, void __user *argp)
+static int btrfs_ioctl_get_subvol_rootref(struct btrfs_root *root,
+ void __user *argp)
{
struct btrfs_ioctl_get_subvol_rootref_args *rootrefs;
struct btrfs_root_ref *rref;
- struct btrfs_root *root;
struct btrfs_path *path;
struct btrfs_key key;
struct extent_buffer *leaf;
- struct inode *inode;
u64 objectid;
int slot;
int ret;
@@ -2881,15 +3111,13 @@ static int btrfs_ioctl_get_subvol_rootref(struct file *file, void __user *argp)
return PTR_ERR(rootrefs);
}
- inode = file_inode(file);
- root = BTRFS_I(inode)->root->fs_info->tree_root;
- objectid = BTRFS_I(inode)->root->root_key.objectid;
-
+ objectid = root->root_key.objectid;
key.objectid = objectid;
key.type = BTRFS_ROOT_REF_KEY;
key.offset = rootrefs->min_treeid;
found = 0;
+ root = root->fs_info->tree_root;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0) {
goto out;
@@ -2969,6 +3197,13 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
int err = 0;
bool destroy_parent = false;
+ /* We don't support snapshots with extent tree v2 yet. */
+ if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
+ btrfs_err(fs_info,
+ "extent tree v2 doesn't support snapshot deletion yet");
+ return -EOPNOTSUPP;
+ }
+
if (destroy_v2) {
vol_args2 = memdup_user(arg, sizeof(*vol_args2));
if (IS_ERR(vol_args2))
@@ -3244,6 +3479,11 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
+ if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
+ btrfs_err(fs_info, "device add not supported on extent tree v2 yet");
+ return -EINVAL;
+ }
+
if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_ADD)) {
if (!btrfs_exclop_start_try_lock(fs_info, BTRFS_EXCLOP_DEV_ADD))
return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
@@ -3354,7 +3594,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
struct block_device *bdev = NULL;
fmode_t mode;
int ret;
- bool cancel;
+ bool cancel = false;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -3769,6 +4009,11 @@ static long btrfs_ioctl_scrub(struct file *file, void __user *arg)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
+ if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
+ btrfs_err(fs_info, "scrub is not supported on extent tree v2 yet");
+ return -EINVAL;
+ }
+
sa = memdup_user(arg, sizeof(*sa));
if (IS_ERR(sa))
return PTR_ERR(sa);
@@ -3868,6 +4113,11 @@ static long btrfs_ioctl_dev_replace(struct btrfs_fs_info *fs_info,
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
+ if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
+ btrfs_err(fs_info, "device replace not supported on extent tree v2 yet");
+ return -EINVAL;
+ }
+
p = memdup_user(arg, sizeof(*p));
if (IS_ERR(p))
return PTR_ERR(p);
@@ -4929,7 +5179,7 @@ out_drop_write:
return ret;
}
-static int _btrfs_ioctl_send(struct file *file, void __user *argp, bool compat)
+static int _btrfs_ioctl_send(struct inode *inode, void __user *argp, bool compat)
{
struct btrfs_ioctl_send_args *arg;
int ret;
@@ -4959,11 +5209,194 @@ static int _btrfs_ioctl_send(struct file *file, void __user *argp, bool compat)
if (IS_ERR(arg))
return PTR_ERR(arg);
}
- ret = btrfs_ioctl_send(file, arg);
+ ret = btrfs_ioctl_send(inode, arg);
kfree(arg);
return ret;
}
+static int btrfs_ioctl_encoded_read(struct file *file, void __user *argp,
+ bool compat)
+{
+ struct btrfs_ioctl_encoded_io_args args = { 0 };
+ size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args,
+ flags);
+ size_t copy_end;
+ struct iovec iovstack[UIO_FASTIOV];
+ struct iovec *iov = iovstack;
+ struct iov_iter iter;
+ loff_t pos;
+ struct kiocb kiocb;
+ ssize_t ret;
+
+ if (!capable(CAP_SYS_ADMIN)) {
+ ret = -EPERM;
+ goto out_acct;
+ }
+
+ if (compat) {
+#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
+ struct btrfs_ioctl_encoded_io_args_32 args32;
+
+ copy_end = offsetofend(struct btrfs_ioctl_encoded_io_args_32,
+ flags);
+ if (copy_from_user(&args32, argp, copy_end)) {
+ ret = -EFAULT;
+ goto out_acct;
+ }
+ args.iov = compat_ptr(args32.iov);
+ args.iovcnt = args32.iovcnt;
+ args.offset = args32.offset;
+ args.flags = args32.flags;
+#else
+ return -ENOTTY;
+#endif
+ } else {
+ copy_end = copy_end_kernel;
+ if (copy_from_user(&args, argp, copy_end)) {
+ ret = -EFAULT;
+ goto out_acct;
+ }
+ }
+ if (args.flags != 0) {
+ ret = -EINVAL;
+ goto out_acct;
+ }
+
+ ret = import_iovec(READ, args.iov, args.iovcnt, ARRAY_SIZE(iovstack),
+ &iov, &iter);
+ if (ret < 0)
+ goto out_acct;
+
+ if (iov_iter_count(&iter) == 0) {
+ ret = 0;
+ goto out_iov;
+ }
+ pos = args.offset;
+ ret = rw_verify_area(READ, file, &pos, args.len);
+ if (ret < 0)
+ goto out_iov;
+
+ init_sync_kiocb(&kiocb, file);
+ kiocb.ki_pos = pos;
+
+ ret = btrfs_encoded_read(&kiocb, &iter, &args);
+ if (ret >= 0) {
+ fsnotify_access(file);
+ if (copy_to_user(argp + copy_end,
+ (char *)&args + copy_end_kernel,
+ sizeof(args) - copy_end_kernel))
+ ret = -EFAULT;
+ }
+
+out_iov:
+ kfree(iov);
+out_acct:
+ if (ret > 0)
+ add_rchar(current, ret);
+ inc_syscr(current);
+ return ret;
+}
+
+static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool compat)
+{
+ struct btrfs_ioctl_encoded_io_args args;
+ struct iovec iovstack[UIO_FASTIOV];
+ struct iovec *iov = iovstack;
+ struct iov_iter iter;
+ loff_t pos;
+ struct kiocb kiocb;
+ ssize_t ret;
+
+ if (!capable(CAP_SYS_ADMIN)) {
+ ret = -EPERM;
+ goto out_acct;
+ }
+
+ if (!(file->f_mode & FMODE_WRITE)) {
+ ret = -EBADF;
+ goto out_acct;
+ }
+
+ if (compat) {
+#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
+ struct btrfs_ioctl_encoded_io_args_32 args32;
+
+ if (copy_from_user(&args32, argp, sizeof(args32))) {
+ ret = -EFAULT;
+ goto out_acct;
+ }
+ args.iov = compat_ptr(args32.iov);
+ args.iovcnt = args32.iovcnt;
+ args.offset = args32.offset;
+ args.flags = args32.flags;
+ args.len = args32.len;
+ args.unencoded_len = args32.unencoded_len;
+ args.unencoded_offset = args32.unencoded_offset;
+ args.compression = args32.compression;
+ args.encryption = args32.encryption;
+ memcpy(args.reserved, args32.reserved, sizeof(args.reserved));
+#else
+ return -ENOTTY;
+#endif
+ } else {
+ if (copy_from_user(&args, argp, sizeof(args))) {
+ ret = -EFAULT;
+ goto out_acct;
+ }
+ }
+
+ ret = -EINVAL;
+ if (args.flags != 0)
+ goto out_acct;
+ if (memchr_inv(args.reserved, 0, sizeof(args.reserved)))
+ goto out_acct;
+ if (args.compression == BTRFS_ENCODED_IO_COMPRESSION_NONE &&
+ args.encryption == BTRFS_ENCODED_IO_ENCRYPTION_NONE)
+ goto out_acct;
+ if (args.compression >= BTRFS_ENCODED_IO_COMPRESSION_TYPES ||
+ args.encryption >= BTRFS_ENCODED_IO_ENCRYPTION_TYPES)
+ goto out_acct;
+ if (args.unencoded_offset > args.unencoded_len)
+ goto out_acct;
+ if (args.len > args.unencoded_len - args.unencoded_offset)
+ goto out_acct;
+
+ ret = import_iovec(WRITE, args.iov, args.iovcnt, ARRAY_SIZE(iovstack),
+ &iov, &iter);
+ if (ret < 0)
+ goto out_acct;
+
+ file_start_write(file);
+
+ if (iov_iter_count(&iter) == 0) {
+ ret = 0;
+ goto out_end_write;
+ }
+ pos = args.offset;
+ ret = rw_verify_area(WRITE, file, &pos, args.len);
+ if (ret < 0)
+ goto out_end_write;
+
+ init_sync_kiocb(&kiocb, file);
+ ret = kiocb_set_rw_flags(&kiocb, 0);
+ if (ret)
+ goto out_end_write;
+ kiocb.ki_pos = pos;
+
+ ret = btrfs_do_write_iter(&kiocb, &iter, &args);
+ if (ret > 0)
+ fsnotify_modify(file);
+
+out_end_write:
+ file_end_write(file);
+ kfree(iov);
+out_acct:
+ if (ret > 0)
+ add_wchar(current, ret);
+ inc_syscw(current);
+ return ret;
+}
+
long btrfs_ioctl(struct file *file, unsigned int
cmd, unsigned long arg)
{
@@ -4974,7 +5407,7 @@ long btrfs_ioctl(struct file *file, unsigned int
switch (cmd) {
case FS_IOC_GETVERSION:
- return btrfs_ioctl_getversion(file, argp);
+ return btrfs_ioctl_getversion(inode, argp);
case FS_IOC_GETFSLABEL:
return btrfs_ioctl_get_fslabel(fs_info, argp);
case FS_IOC_SETFSLABEL:
@@ -4994,7 +5427,7 @@ long btrfs_ioctl(struct file *file, unsigned int
case BTRFS_IOC_SNAP_DESTROY_V2:
return btrfs_ioctl_snap_destroy(file, argp, true);
case BTRFS_IOC_SUBVOL_GETFLAGS:
- return btrfs_ioctl_subvol_getflags(file, argp);
+ return btrfs_ioctl_subvol_getflags(inode, argp);
case BTRFS_IOC_SUBVOL_SETFLAGS:
return btrfs_ioctl_subvol_setflags(file, argp);
case BTRFS_IOC_DEFAULT_SUBVOL:
@@ -5018,11 +5451,11 @@ long btrfs_ioctl(struct file *file, unsigned int
case BTRFS_IOC_BALANCE:
return btrfs_ioctl_balance(file, NULL);
case BTRFS_IOC_TREE_SEARCH:
- return btrfs_ioctl_tree_search(file, argp);
+ return btrfs_ioctl_tree_search(inode, argp);
case BTRFS_IOC_TREE_SEARCH_V2:
- return btrfs_ioctl_tree_search_v2(file, argp);
+ return btrfs_ioctl_tree_search_v2(inode, argp);
case BTRFS_IOC_INO_LOOKUP:
- return btrfs_ioctl_ino_lookup(file, argp);
+ return btrfs_ioctl_ino_lookup(root, argp);
case BTRFS_IOC_INO_PATHS:
return btrfs_ioctl_ino_to_path(root, argp);
case BTRFS_IOC_LOGICAL_INO:
@@ -5069,10 +5502,10 @@ long btrfs_ioctl(struct file *file, unsigned int
return btrfs_ioctl_set_received_subvol_32(file, argp);
#endif
case BTRFS_IOC_SEND:
- return _btrfs_ioctl_send(file, argp, false);
+ return _btrfs_ioctl_send(inode, argp, false);
#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
case BTRFS_IOC_SEND_32:
- return _btrfs_ioctl_send(file, argp, true);
+ return _btrfs_ioctl_send(inode, argp, true);
#endif
case BTRFS_IOC_GET_DEV_STATS:
return btrfs_ioctl_get_dev_stats(fs_info, argp);
@@ -5099,15 +5532,25 @@ long btrfs_ioctl(struct file *file, unsigned int
case BTRFS_IOC_SET_FEATURES:
return btrfs_ioctl_set_features(file, argp);
case BTRFS_IOC_GET_SUBVOL_INFO:
- return btrfs_ioctl_get_subvol_info(file, argp);
+ return btrfs_ioctl_get_subvol_info(inode, argp);
case BTRFS_IOC_GET_SUBVOL_ROOTREF:
- return btrfs_ioctl_get_subvol_rootref(file, argp);
+ return btrfs_ioctl_get_subvol_rootref(root, argp);
case BTRFS_IOC_INO_LOOKUP_USER:
return btrfs_ioctl_ino_lookup_user(file, argp);
case FS_IOC_ENABLE_VERITY:
return fsverity_ioctl_enable(file, (const void __user *)argp);
case FS_IOC_MEASURE_VERITY:
return fsverity_ioctl_measure(file, argp);
+ case BTRFS_IOC_ENCODED_READ:
+ return btrfs_ioctl_encoded_read(file, argp, false);
+ case BTRFS_IOC_ENCODED_WRITE:
+ return btrfs_ioctl_encoded_write(file, argp, false);
+#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
+ case BTRFS_IOC_ENCODED_READ_32:
+ return btrfs_ioctl_encoded_read(file, argp, true);
+ case BTRFS_IOC_ENCODED_WRITE_32:
+ return btrfs_ioctl_encoded_write(file, argp, true);
+#endif
}
return -ENOTTY;
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index 0fb90cbe7669..430ad36b8b08 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -55,6 +55,9 @@
* 0x1000 | SegHdr N+1| Data payload N+1 ... |
*/
+#define WORKSPACE_BUF_LENGTH (lzo1x_worst_compress(PAGE_SIZE))
+#define WORKSPACE_CBUF_LENGTH (lzo1x_worst_compress(PAGE_SIZE))
+
struct workspace {
void *mem;
void *buf; /* where decompressed data goes */
@@ -83,8 +86,8 @@ struct list_head *lzo_alloc_workspace(unsigned int level)
return ERR_PTR(-ENOMEM);
workspace->mem = kvmalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL);
- workspace->buf = kvmalloc(lzo1x_worst_compress(PAGE_SIZE), GFP_KERNEL);
- workspace->cbuf = kvmalloc(lzo1x_worst_compress(PAGE_SIZE), GFP_KERNEL);
+ workspace->buf = kvmalloc(WORKSPACE_BUF_LENGTH, GFP_KERNEL);
+ workspace->cbuf = kvmalloc(WORKSPACE_CBUF_LENGTH, GFP_KERNEL);
if (!workspace->mem || !workspace->buf || !workspace->cbuf)
goto fail;
@@ -380,6 +383,17 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
kunmap(cur_page);
cur_in += LZO_LEN;
+ if (seg_len > WORKSPACE_CBUF_LENGTH) {
+ /*
+ * seg_len shouldn't be larger than we have allocated
+ * for workspace->cbuf
+ */
+ btrfs_err(fs_info, "unexpectedly large lzo segment len %u",
+ seg_len);
+ ret = -EIO;
+ goto out;
+ }
+
/* Copy the compressed segment payload into workspace */
copy_compressed_segment(cb, workspace->cbuf, seg_len, &cur_in);
@@ -422,7 +436,7 @@ int lzo_decompress(struct list_head *ws, unsigned char *data_in,
struct workspace *workspace = list_entry(ws, struct workspace, list);
size_t in_len;
size_t out_len;
- size_t max_segment_len = lzo1x_worst_compress(PAGE_SIZE);
+ size_t max_segment_len = WORKSPACE_BUF_LENGTH;
int ret = 0;
char *kaddr;
unsigned long bytes;
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 6b51fd2ec5ac..1957b14b329a 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -143,16 +143,28 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
return ret;
}
-/*
- * Allocate and add a new ordered_extent into the per-inode tree.
+/**
+ * Add an ordered extent to the per-inode tree.
+ *
+ * @inode: Inode that this extent is for.
+ * @file_offset: Logical offset in file where the extent starts.
+ * @num_bytes: Logical length of extent in file.
+ * @ram_bytes: Full length of unencoded data.
+ * @disk_bytenr: Offset of extent on disk.
+ * @disk_num_bytes: Size of extent on disk.
+ * @offset: Offset into unencoded data where file data starts.
+ * @flags: Flags specifying type of extent (1 << BTRFS_ORDERED_*).
+ * @compress_type: Compression algorithm used for data.
*
- * The tree is given a single reference on the ordered extent that was
- * inserted.
+ * Most of these parameters correspond to &struct btrfs_file_extent_item. The
+ * tree is given a single reference on the ordered extent that was inserted.
+ *
+ * Return: 0 or -ENOMEM.
*/
-static int __btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset,
- u64 disk_bytenr, u64 num_bytes,
- u64 disk_num_bytes, int type, int dio,
- int compress_type)
+int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset,
+ u64 num_bytes, u64 ram_bytes, u64 disk_bytenr,
+ u64 disk_num_bytes, u64 offset, unsigned flags,
+ int compress_type)
{
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -161,7 +173,8 @@ static int __btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset
struct btrfs_ordered_extent *entry;
int ret;
- if (type == BTRFS_ORDERED_NOCOW || type == BTRFS_ORDERED_PREALLOC) {
+ if (flags &
+ ((1 << BTRFS_ORDERED_NOCOW) | (1 << BTRFS_ORDERED_PREALLOC))) {
/* For nocow write, we can release the qgroup rsv right now */
ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes);
if (ret < 0)
@@ -181,9 +194,11 @@ static int __btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset
return -ENOMEM;
entry->file_offset = file_offset;
- entry->disk_bytenr = disk_bytenr;
entry->num_bytes = num_bytes;
+ entry->ram_bytes = ram_bytes;
+ entry->disk_bytenr = disk_bytenr;
entry->disk_num_bytes = disk_num_bytes;
+ entry->offset = offset;
entry->bytes_left = num_bytes;
entry->inode = igrab(&inode->vfs_inode);
entry->compress_type = compress_type;
@@ -191,18 +206,12 @@ static int __btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset
entry->qgroup_rsv = ret;
entry->physical = (u64)-1;
- ASSERT(type == BTRFS_ORDERED_REGULAR ||
- type == BTRFS_ORDERED_NOCOW ||
- type == BTRFS_ORDERED_PREALLOC ||
- type == BTRFS_ORDERED_COMPRESSED);
- set_bit(type, &entry->flags);
+ ASSERT((flags & ~BTRFS_ORDERED_TYPE_FLAGS) == 0);
+ entry->flags = flags;
percpu_counter_add_batch(&fs_info->ordered_bytes, num_bytes,
fs_info->delalloc_batch);
- if (dio)
- set_bit(BTRFS_ORDERED_DIRECT, &entry->flags);
-
/* one ref for the tree */
refcount_set(&entry->refs, 1);
init_waitqueue_head(&entry->wait);
@@ -247,41 +256,6 @@ static int __btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset
return 0;
}
-int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset,
- u64 disk_bytenr, u64 num_bytes, u64 disk_num_bytes,
- int type)
-{
- ASSERT(type == BTRFS_ORDERED_REGULAR ||
- type == BTRFS_ORDERED_NOCOW ||
- type == BTRFS_ORDERED_PREALLOC);
- return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr,
- num_bytes, disk_num_bytes, type, 0,
- BTRFS_COMPRESS_NONE);
-}
-
-int btrfs_add_ordered_extent_dio(struct btrfs_inode *inode, u64 file_offset,
- u64 disk_bytenr, u64 num_bytes,
- u64 disk_num_bytes, int type)
-{
- ASSERT(type == BTRFS_ORDERED_REGULAR ||
- type == BTRFS_ORDERED_NOCOW ||
- type == BTRFS_ORDERED_PREALLOC);
- return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr,
- num_bytes, disk_num_bytes, type, 1,
- BTRFS_COMPRESS_NONE);
-}
-
-int btrfs_add_ordered_extent_compress(struct btrfs_inode *inode, u64 file_offset,
- u64 disk_bytenr, u64 num_bytes,
- u64 disk_num_bytes, int compress_type)
-{
- ASSERT(compress_type != BTRFS_COMPRESS_NONE);
- return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr,
- num_bytes, disk_num_bytes,
- BTRFS_ORDERED_COMPRESSED, 0,
- compress_type);
-}
-
/*
* Add a struct btrfs_ordered_sum into the list of checksums to be inserted
* when an ordered extent is finished. If the list covers more than one
@@ -548,9 +522,15 @@ void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
spin_lock(&btrfs_inode->lock);
btrfs_mod_outstanding_extents(btrfs_inode, -1);
spin_unlock(&btrfs_inode->lock);
- if (root != fs_info->tree_root)
- btrfs_delalloc_release_metadata(btrfs_inode, entry->num_bytes,
- false);
+ if (root != fs_info->tree_root) {
+ u64 release;
+
+ if (test_bit(BTRFS_ORDERED_ENCODED, &entry->flags))
+ release = entry->disk_num_bytes;
+ else
+ release = entry->num_bytes;
+ btrfs_delalloc_release_metadata(btrfs_inode, release, false);
+ }
percpu_counter_add_batch(&fs_info->ordered_bytes, -entry->num_bytes,
fs_info->delalloc_batch);
@@ -1052,42 +1032,18 @@ static int clone_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pos,
struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
u64 file_offset = ordered->file_offset + pos;
u64 disk_bytenr = ordered->disk_bytenr + pos;
- u64 num_bytes = len;
- u64 disk_num_bytes = len;
- int type;
- unsigned long flags_masked = ordered->flags & ~(1 << BTRFS_ORDERED_DIRECT);
- int compress_type = ordered->compress_type;
- unsigned long weight;
- int ret;
-
- weight = hweight_long(flags_masked);
- WARN_ON_ONCE(weight > 1);
- if (!weight)
- type = 0;
- else
- type = __ffs(flags_masked);
+ unsigned long flags = ordered->flags & BTRFS_ORDERED_TYPE_FLAGS;
/*
- * The splitting extent is already counted and will be added again
- * in btrfs_add_ordered_extent_*(). Subtract num_bytes to avoid
- * double counting.
+ * The splitting extent is already counted and will be added again in
+ * btrfs_add_ordered_extent_*(). Subtract len to avoid double counting.
*/
- percpu_counter_add_batch(&fs_info->ordered_bytes, -num_bytes,
+ percpu_counter_add_batch(&fs_info->ordered_bytes, -len,
fs_info->delalloc_batch);
- if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered->flags)) {
- WARN_ON_ONCE(1);
- ret = btrfs_add_ordered_extent_compress(BTRFS_I(inode),
- file_offset, disk_bytenr, num_bytes,
- disk_num_bytes, compress_type);
- } else if (test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) {
- ret = btrfs_add_ordered_extent_dio(BTRFS_I(inode), file_offset,
- disk_bytenr, num_bytes, disk_num_bytes, type);
- } else {
- ret = btrfs_add_ordered_extent(BTRFS_I(inode), file_offset,
- disk_bytenr, num_bytes, disk_num_bytes, type);
- }
-
- return ret;
+ WARN_ON_ONCE(flags & (1 << BTRFS_ORDERED_COMPRESSED));
+ return btrfs_add_ordered_extent(BTRFS_I(inode), file_offset, len, len,
+ disk_bytenr, len, 0, flags,
+ ordered->compress_type);
}
int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pre,
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 4194e960ff61..ecad67a2c745 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -74,8 +74,18 @@ enum {
BTRFS_ORDERED_LOGGED_CSUM,
/* We wait for this extent to complete in the current transaction */
BTRFS_ORDERED_PENDING,
+ /* BTRFS_IOC_ENCODED_WRITE */
+ BTRFS_ORDERED_ENCODED,
};
+/* BTRFS_ORDERED_* flags that specify the type of the extent. */
+#define BTRFS_ORDERED_TYPE_FLAGS ((1UL << BTRFS_ORDERED_REGULAR) | \
+ (1UL << BTRFS_ORDERED_NOCOW) | \
+ (1UL << BTRFS_ORDERED_PREALLOC) | \
+ (1UL << BTRFS_ORDERED_COMPRESSED) | \
+ (1UL << BTRFS_ORDERED_DIRECT) | \
+ (1UL << BTRFS_ORDERED_ENCODED))
+
struct btrfs_ordered_extent {
/* logical offset in the file */
u64 file_offset;
@@ -84,9 +94,11 @@ struct btrfs_ordered_extent {
* These fields directly correspond to the same fields in
* btrfs_file_extent_item.
*/
- u64 disk_bytenr;
u64 num_bytes;
+ u64 ram_bytes;
+ u64 disk_bytenr;
u64 disk_num_bytes;
+ u64 offset;
/* number of bytes that still need writing */
u64 bytes_left;
@@ -179,14 +191,9 @@ bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode,
struct btrfs_ordered_extent **cached,
u64 file_offset, u64 io_size);
int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset,
- u64 disk_bytenr, u64 num_bytes, u64 disk_num_bytes,
- int type);
-int btrfs_add_ordered_extent_dio(struct btrfs_inode *inode, u64 file_offset,
- u64 disk_bytenr, u64 num_bytes,
- u64 disk_num_bytes, int type);
-int btrfs_add_ordered_extent_compress(struct btrfs_inode *inode, u64 file_offset,
- u64 disk_bytenr, u64 num_bytes,
- u64 disk_num_bytes, int compress_type);
+ u64 num_bytes, u64 ram_bytes, u64 disk_bytenr,
+ u64 disk_num_bytes, u64 offset, unsigned flags,
+ int compress_type);
void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry,
struct btrfs_ordered_sum *sum);
struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode,
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 0775ae9f4419..dd8777872143 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -23,6 +23,7 @@ static const struct root_name_map root_map[] = {
{ BTRFS_QUOTA_TREE_OBJECTID, "QUOTA_TREE" },
{ BTRFS_UUID_TREE_OBJECTID, "UUID_TREE" },
{ BTRFS_FREE_SPACE_TREE_OBJECTID, "FREE_SPACE_TREE" },
+ { BTRFS_BLOCK_GROUP_TREE_OBJECTID, "BLOCK_GROUP_TREE" },
{ BTRFS_DATA_RELOC_TREE_OBJECTID, "DATA_RELOC_TREE" },
};
@@ -391,9 +392,9 @@ void btrfs_print_tree(struct extent_buffer *c, bool follow)
btrfs_header_owner(c),
btrfs_node_ptr_generation(c, i),
level - 1, &first_key);
- if (IS_ERR(next)) {
+ if (IS_ERR(next))
continue;
- } else if (!extent_buffer_uptodate(next)) {
+ if (!extent_buffer_uptodate(next)) {
free_extent_buffer(next);
continue;
}
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 8928275823a1..1866b1f0da01 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -25,18 +25,6 @@
#include "sysfs.h"
#include "tree-mod-log.h"
-/* TODO XXX FIXME
- * - subvol delete -> delete when ref goes to 0? delete limits also?
- * - reorganize keys
- * - compressed
- * - sync
- * - copy also limits on subvol creation
- * - limit
- * - caches for ulists
- * - performance benchmarks
- * - check all ioctl parameters
- */
-
/*
* Helpers to access qgroup reservation
*
@@ -258,16 +246,19 @@ static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid)
return 0;
}
-/* must be called with qgroup_lock held */
-static int add_relation_rb(struct btrfs_fs_info *fs_info,
- u64 memberid, u64 parentid)
+/*
+ * Add relation specified by two qgroups.
+ *
+ * Must be called with qgroup_lock held.
+ *
+ * Return: 0 on success
+ * -ENOENT if one of the qgroups is NULL
+ * <0 other errors
+ */
+static int __add_relation_rb(struct btrfs_qgroup *member, struct btrfs_qgroup *parent)
{
- struct btrfs_qgroup *member;
- struct btrfs_qgroup *parent;
struct btrfs_qgroup_list *list;
- member = find_qgroup_rb(fs_info, memberid);
- parent = find_qgroup_rb(fs_info, parentid);
if (!member || !parent)
return -ENOENT;
@@ -283,7 +274,27 @@ static int add_relation_rb(struct btrfs_fs_info *fs_info,
return 0;
}
-/* must be called with qgroup_lock held */
+/*
+ * Add relation specified by two qgoup ids.
+ *
+ * Must be called with qgroup_lock held.
+ *
+ * Return: 0 on success
+ * -ENOENT if one of the ids does not exist
+ * <0 other errors
+ */
+static int add_relation_rb(struct btrfs_fs_info *fs_info, u64 memberid, u64 parentid)
+{
+ struct btrfs_qgroup *member;
+ struct btrfs_qgroup *parent;
+
+ member = find_qgroup_rb(fs_info, memberid);
+ parent = find_qgroup_rb(fs_info, parentid);
+
+ return __add_relation_rb(member, parent);
+}
+
+/* Must be called with qgroup_lock held */
static int del_relation_rb(struct btrfs_fs_info *fs_info,
u64 memberid, u64 parentid)
{
@@ -948,6 +959,12 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
*/
lockdep_assert_held_write(&fs_info->subvol_sem);
+ if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
+ btrfs_err(fs_info,
+ "qgroups are currently unsupported in extent tree v2");
+ return -EINVAL;
+ }
+
mutex_lock(&fs_info->qgroup_ioctl_lock);
if (fs_info->quota_root)
goto out;
@@ -1185,12 +1202,34 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
struct btrfs_trans_handle *trans = NULL;
int ret = 0;
+ /*
+ * We need to have subvol_sem write locked, to prevent races between
+ * concurrent tasks trying to disable quotas, because we will unlock
+ * and relock qgroup_ioctl_lock across BTRFS_FS_QUOTA_ENABLED changes.
+ */
+ lockdep_assert_held_write(&fs_info->subvol_sem);
+
mutex_lock(&fs_info->qgroup_ioctl_lock);
if (!fs_info->quota_root)
goto out;
+
+ /*
+ * Unlock the qgroup_ioctl_lock mutex before waiting for the rescan worker to
+ * complete. Otherwise we can deadlock because btrfs_remove_qgroup() needs
+ * to lock that mutex while holding a transaction handle and the rescan
+ * worker needs to commit a transaction.
+ */
mutex_unlock(&fs_info->qgroup_ioctl_lock);
/*
+ * Request qgroup rescan worker to complete and wait for it. This wait
+ * must be done before transaction start for quota disable since it may
+ * deadlock with transaction by the qgroup rescan worker.
+ */
+ clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
+ btrfs_qgroup_wait_for_completion(fs_info, false);
+
+ /*
* 1 For the root item
*
* We should also reserve enough items for the quota tree deletion in
@@ -1205,14 +1244,13 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
trans = NULL;
+ set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
goto out;
}
if (!fs_info->quota_root)
goto out;
- clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
- btrfs_qgroup_wait_for_completion(fs_info, false);
spin_lock(&fs_info->qgroup_lock);
quota_root = fs_info->quota_root;
fs_info->quota_root = NULL;
@@ -1430,7 +1468,7 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
}
spin_lock(&fs_info->qgroup_lock);
- ret = add_relation_rb(fs_info, src, dst);
+ ret = __add_relation_rb(member, parent);
if (ret < 0) {
spin_unlock(&fs_info->qgroup_lock);
goto out;
@@ -3247,7 +3285,8 @@ out:
static bool rescan_should_stop(struct btrfs_fs_info *fs_info)
{
return btrfs_fs_closing(fs_info) ||
- test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
+ test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state) ||
+ !test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
}
static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
@@ -3277,11 +3316,9 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
err = PTR_ERR(trans);
break;
}
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
- err = -EINTR;
- } else {
- err = qgroup_rescan_leaf(trans, path);
- }
+
+ err = qgroup_rescan_leaf(trans, path);
+
if (err > 0)
btrfs_commit_transaction(trans);
else
@@ -3295,7 +3332,7 @@ out:
if (err > 0 &&
fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) {
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
- } else if (err < 0) {
+ } else if (err < 0 || stopped) {
fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
}
mutex_unlock(&fs_info->qgroup_rescan_lock);
@@ -3383,6 +3420,9 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
btrfs_warn(fs_info,
"qgroup rescan init failed, qgroup is not enabled");
ret = -EINVAL;
+ } else if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
+ /* Quota disable is in progress */
+ ret = -EBUSY;
}
if (ret) {
diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c
index a3930da4eb3f..04a88bfe4fcf 100644
--- a/fs/btrfs/reflink.c
+++ b/fs/btrfs/reflink.c
@@ -277,7 +277,7 @@ copy_inline_extent:
path->slots[0]),
size);
btrfs_update_inode_bytes(BTRFS_I(dst), datal, drop_args.bytes_found);
- set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(dst)->runtime_flags);
+ btrfs_set_inode_full_sync(BTRFS_I(dst));
ret = btrfs_inode_set_file_extent_range(BTRFS_I(dst), 0, aligned_end);
out:
if (!ret && !trans) {
@@ -494,7 +494,8 @@ process_slot:
&clone_info, &trans);
if (ret)
goto out;
- } else if (type == BTRFS_FILE_EXTENT_INLINE) {
+ } else {
+ ASSERT(type == BTRFS_FILE_EXTENT_INLINE);
/*
* Inline extents always have to start at file offset 0
* and can never be bigger then the sector size. We can
@@ -505,8 +506,12 @@ process_slot:
*/
ASSERT(key.offset == 0);
ASSERT(datal <= fs_info->sectorsize);
- if (key.offset != 0 || datal > fs_info->sectorsize)
- return -EUCLEAN;
+ if (WARN_ON(type != BTRFS_FILE_EXTENT_INLINE) ||
+ WARN_ON(key.offset != 0) ||
+ WARN_ON(datal > fs_info->sectorsize)) {
+ ret = -EUCLEAN;
+ goto out;
+ }
ret = clone_copy_inline_extent(inode, path, &new_key,
drop_start, datal, size,
@@ -518,17 +523,22 @@ process_slot:
btrfs_release_path(path);
/*
- * If this is a new extent update the last_reflink_trans of both
- * inodes. This is used by fsync to make sure it does not log
- * multiple checksum items with overlapping ranges. For older
- * extents we don't need to do it since inode logging skips the
- * checksums for older extents. Also ignore holes and inline
- * extents because they don't have checksums in the csum tree.
+ * Whenever we share an extent we update the last_reflink_trans
+ * of each inode to the current transaction. This is needed to
+ * make sure fsync does not log multiple checksum items with
+ * overlapping ranges (because some extent items might refer
+ * only to sections of the original extent). For the destination
+ * inode we do this regardless of the generation of the extents
+ * or even if they are inline extents or explicit holes, to make
+ * sure a full fsync does not skip them. For the source inode,
+ * we only need to update last_reflink_trans in case it's a new
+ * extent that is not a hole or an inline extent, to deal with
+ * the checksums problem on fsync.
*/
- if (extent_gen == trans->transid && disko > 0) {
+ if (extent_gen == trans->transid && disko > 0)
BTRFS_I(src)->last_reflink_trans = trans->transid;
- BTRFS_I(inode)->last_reflink_trans = trans->transid;
- }
+
+ BTRFS_I(inode)->last_reflink_trans = trans->transid;
last_dest_end = ALIGN(new_key.offset + datal,
fs_info->sectorsize);
@@ -575,8 +585,7 @@ process_slot:
* replaced file extent items.
*/
if (last_dest_end >= i_size_read(inode))
- set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
- &BTRFS_I(inode)->runtime_flags);
+ btrfs_set_inode_full_sync(BTRFS_I(inode));
ret = btrfs_replace_file_extents(BTRFS_I(inode), path,
last_dest_end, destoff + len - 1, NULL, &trans);
@@ -772,9 +781,7 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in,
if (btrfs_root_readonly(root_out))
return -EROFS;
- if (file_in->f_path.mnt != file_out->f_path.mnt ||
- inode_in->i_sb != inode_out->i_sb)
- return -EXDEV;
+ ASSERT(inode_in->i_sb == inode_out->i_sb);
}
/* Don't make the dst file partly checksummed */
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index f5465197996d..fdc2c4b411f0 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2599,9 +2599,9 @@ static int get_tree_block_key(struct btrfs_fs_info *fs_info,
eb = read_tree_block(fs_info, block->bytenr, block->owner,
block->key.offset, block->level, NULL);
- if (IS_ERR(eb)) {
+ if (IS_ERR(eb))
return PTR_ERR(eb);
- } else if (!extent_buffer_uptodate(eb)) {
+ if (!extent_buffer_uptodate(eb)) {
free_extent_buffer(eb);
return -EIO;
}
@@ -2997,7 +2997,7 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra,
/* Reserve metadata for this range */
ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
- clamped_len);
+ clamped_len, clamped_len);
if (ret)
goto release_page;
@@ -3960,6 +3960,19 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
int rw = 0;
int err = 0;
+ /*
+ * This only gets set if we had a half-deleted snapshot on mount. We
+ * cannot allow relocation to start while we're still trying to clean up
+ * these pending deletions.
+ */
+ ret = wait_on_bit(&fs_info->flags, BTRFS_FS_UNFINISHED_DROPS, TASK_INTERRUPTIBLE);
+ if (ret)
+ return ret;
+
+ /* We may have been woken up by close_ctree, so bail if we're closing. */
+ if (btrfs_fs_closing(fs_info))
+ return -EINTR;
+
bg = btrfs_lookup_block_group(fs_info, group_start);
if (!bg)
return -ENOENT;
@@ -4110,9 +4123,8 @@ static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
* this function resumes merging reloc trees with corresponding fs trees.
* this is important for keeping the sharing of tree blocks
*/
-int btrfs_recover_relocation(struct btrfs_root *root)
+int btrfs_recover_relocation(struct btrfs_fs_info *fs_info)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
LIST_HEAD(reloc_roots);
struct btrfs_key key;
struct btrfs_root *fs_root;
@@ -4153,7 +4165,7 @@ int btrfs_recover_relocation(struct btrfs_root *root)
key.type != BTRFS_ROOT_ITEM_KEY)
break;
- reloc_root = btrfs_read_tree_root(root, &key);
+ reloc_root = btrfs_read_tree_root(fs_info->tree_root, &key);
if (IS_ERR(reloc_root)) {
err = PTR_ERR(reloc_root);
goto out;
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 3d68d2dcd83e..ca7426ef61c8 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -278,6 +278,21 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info)
WARN_ON(!test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state));
if (btrfs_root_refs(&root->root_item) == 0) {
+ struct btrfs_key drop_key;
+
+ btrfs_disk_key_to_cpu(&drop_key, &root->root_item.drop_progress);
+ /*
+ * If we have a non-zero drop_progress then we know we
+ * made it partly through deleting this snapshot, and
+ * thus we need to make sure we block any balance from
+ * happening until this snapshot is completely dropped.
+ */
+ if (drop_key.objectid != 0 || drop_key.type != 0 ||
+ drop_key.offset != 0) {
+ set_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags);
+ set_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state);
+ }
+
set_bit(BTRFS_ROOT_DEAD_TREE, &root->state);
btrfs_add_dead_root(root);
}
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 2e9a322773f2..11089568b287 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -3190,7 +3190,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
u64 generation;
int mirror_num;
struct btrfs_key key;
- u64 increment = map->stripe_len;
+ u64 increment;
u64 offset;
u64 extent_logical;
u64 extent_physical;
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index d8ccb62aa7d2..7d1642937274 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -528,17 +528,12 @@ out:
static int fs_path_copy(struct fs_path *p, struct fs_path *from)
{
- int ret;
-
p->reversed = from->reversed;
fs_path_reset(p);
- ret = fs_path_add_path(p, from);
-
- return ret;
+ return fs_path_add_path(p, from);
}
-
static void fs_path_unreverse(struct fs_path *p)
{
char *tmp;
@@ -4999,6 +4994,10 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len)
lock_page(page);
if (!PageUptodate(page)) {
unlock_page(page);
+ btrfs_err(fs_info,
+ "send: IO error at offset %llu for inode %llu root %llu",
+ page_offset(page), sctx->cur_ino,
+ sctx->send_root->root_key.objectid);
put_page(page);
ret = -EIO;
break;
@@ -7473,10 +7472,10 @@ static void dedupe_in_progress_warn(const struct btrfs_root *root)
root->root_key.objectid, root->dedupe_in_progress);
}
-long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg)
+long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg)
{
int ret = 0;
- struct btrfs_root *send_root = BTRFS_I(file_inode(mnt_file))->root;
+ struct btrfs_root *send_root = BTRFS_I(inode)->root;
struct btrfs_fs_info *fs_info = send_root->fs_info;
struct btrfs_root *clone_root;
struct send_ctx *sctx = NULL;
diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h
index 23bcefc84e49..08602fdd600a 100644
--- a/fs/btrfs/send.h
+++ b/fs/btrfs/send.h
@@ -126,7 +126,7 @@ enum {
#define BTRFS_SEND_A_MAX (__BTRFS_SEND_A_MAX - 1)
#ifdef __KERNEL__
-long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg);
+long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg);
#endif
#endif
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index 294242c194d8..b87931a458eb 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -737,6 +737,8 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,
u64 thresh = div_factor_fine(space_info->total_bytes, 90);
u64 used;
+ lockdep_assert_held(&space_info->lock);
+
/* If we're just plain full then async reclaim just slows us down. */
if ((space_info->bytes_used + space_info->bytes_reserved +
global_rsv_size) >= thresh)
@@ -1061,7 +1063,6 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work)
trans_rsv->reserved;
if (block_rsv_size < space_info->bytes_may_use)
delalloc_size = space_info->bytes_may_use - block_rsv_size;
- spin_unlock(&space_info->lock);
/*
* We don't want to include the global_rsv in our calculation,
@@ -1092,6 +1093,8 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work)
flush = FLUSH_DELAYED_REFS_NR;
}
+ spin_unlock(&space_info->lock);
+
/*
* We don't want to reclaim everything, just a portion, so scale
* down the to_reclaim by 1/4. If it takes us down to 0,
diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c
index 29bd8c7a7706..ef7ae20d2b77 100644
--- a/fs/btrfs/subpage.c
+++ b/fs/btrfs/subpage.c
@@ -736,7 +736,7 @@ void btrfs_page_unlock_writer(struct btrfs_fs_info *fs_info, struct page *page,
* Since we own the page lock, no one else could touch subpage::writers
* and we are safe to do several atomic operations without spinlock.
*/
- if (atomic_read(&subpage->writers))
+ if (atomic_read(&subpage->writers) == 0)
/* No writers, locked by plain lock_page() */
return unlock_page(page);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 4d947ba32da9..b228efe8ab6e 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -66,6 +66,52 @@ static struct file_system_type btrfs_root_fs_type;
static int btrfs_remount(struct super_block *sb, int *flags, char *data);
+#ifdef CONFIG_PRINTK
+
+#define STATE_STRING_PREFACE ": state "
+#define STATE_STRING_BUF_LEN (sizeof(STATE_STRING_PREFACE) + BTRFS_FS_STATE_COUNT)
+
+/*
+ * Characters to print to indicate error conditions or uncommon filesystem sate.
+ * RO is not an error.
+ */
+static const char fs_state_chars[] = {
+ [BTRFS_FS_STATE_ERROR] = 'E',
+ [BTRFS_FS_STATE_REMOUNTING] = 'M',
+ [BTRFS_FS_STATE_RO] = 0,
+ [BTRFS_FS_STATE_TRANS_ABORTED] = 'A',
+ [BTRFS_FS_STATE_DEV_REPLACING] = 'R',
+ [BTRFS_FS_STATE_DUMMY_FS_INFO] = 0,
+ [BTRFS_FS_STATE_NO_CSUMS] = 'C',
+ [BTRFS_FS_STATE_LOG_CLEANUP_ERROR] = 'L',
+};
+
+static void btrfs_state_to_string(const struct btrfs_fs_info *info, char *buf)
+{
+ unsigned int bit;
+ bool states_printed = false;
+ unsigned long fs_state = READ_ONCE(info->fs_state);
+ char *curr = buf;
+
+ memcpy(curr, STATE_STRING_PREFACE, sizeof(STATE_STRING_PREFACE));
+ curr += sizeof(STATE_STRING_PREFACE) - 1;
+
+ for_each_set_bit(bit, &fs_state, sizeof(fs_state)) {
+ WARN_ON_ONCE(bit >= BTRFS_FS_STATE_COUNT);
+ if ((bit < BTRFS_FS_STATE_COUNT) && fs_state_chars[bit]) {
+ *curr++ = fs_state_chars[bit];
+ states_printed = true;
+ }
+ }
+
+ /* If no states were printed, reset the buffer */
+ if (!states_printed)
+ curr = buf;
+
+ *curr++ = 0;
+}
+#endif
+
/*
* Generally the error codes correspond to their respective errors, but there
* are a few special cases.
@@ -128,6 +174,7 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function
{
struct super_block *sb = fs_info->sb;
#ifdef CONFIG_PRINTK
+ char statestr[STATE_STRING_BUF_LEN];
const char *errstr;
#endif
@@ -140,6 +187,7 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function
#ifdef CONFIG_PRINTK
errstr = btrfs_decode_error(errno);
+ btrfs_state_to_string(fs_info, statestr);
if (fmt) {
struct va_format vaf;
va_list args;
@@ -148,12 +196,12 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function
vaf.fmt = fmt;
vaf.va = &args;
- pr_crit("BTRFS: error (device %s) in %s:%d: errno=%d %s (%pV)\n",
- sb->s_id, function, line, errno, errstr, &vaf);
+ pr_crit("BTRFS: error (device %s%s) in %s:%d: errno=%d %s (%pV)\n",
+ sb->s_id, statestr, function, line, errno, errstr, &vaf);
va_end(args);
} else {
- pr_crit("BTRFS: error (device %s) in %s:%d: errno=%d %s\n",
- sb->s_id, function, line, errno, errstr);
+ pr_crit("BTRFS: error (device %s%s) in %s:%d: errno=%d %s\n",
+ sb->s_id, statestr, function, line, errno, errstr);
}
#endif
@@ -240,11 +288,15 @@ void __cold btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, .
vaf.va = &args;
if (__ratelimit(ratelimit)) {
- if (fs_info)
- printk("%sBTRFS %s (device %s): %pV\n", lvl, type,
- fs_info->sb->s_id, &vaf);
- else
+ if (fs_info) {
+ char statestr[STATE_STRING_BUF_LEN];
+
+ btrfs_state_to_string(fs_info, statestr);
+ printk("%sBTRFS %s (device %s%s): %pV\n", lvl, type,
+ fs_info->sb->s_id, statestr, &vaf);
+ } else {
printk("%sBTRFS %s: %pV\n", lvl, type, &vaf);
+ }
}
va_end(args);
@@ -861,6 +913,14 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
break;
case Opt_space_cache:
case Opt_space_cache_version:
+ /*
+ * We already set FREE_SPACE_TREE above because we have
+ * compat_ro(FREE_SPACE_TREE) set, and we aren't going
+ * to allow v1 to be set for extent tree v2, simply
+ * ignore this setting if we're extent tree v2.
+ */
+ if (btrfs_fs_incompat(info, EXTENT_TREE_V2))
+ break;
if (token == Opt_space_cache ||
strcmp(args[0].from, "v1") == 0) {
btrfs_clear_opt(info->mount_opt,
@@ -881,6 +941,12 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
btrfs_set_opt(info->mount_opt, RESCAN_UUID_TREE);
break;
case Opt_no_space_cache:
+ /*
+ * We cannot operate without the free space tree with
+ * extent tree v2, ignore this option.
+ */
+ if (btrfs_fs_incompat(info, EXTENT_TREE_V2))
+ break;
if (btrfs_test_opt(info, SPACE_CACHE)) {
btrfs_clear_and_info(info, SPACE_CACHE,
"disabling disk space caching");
@@ -896,6 +962,12 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
"the 'inode_cache' option is deprecated and has no effect since 5.11");
break;
case Opt_clear_cache:
+ /*
+ * We cannot clear the free space tree with extent tree
+ * v2, ignore this option.
+ */
+ if (btrfs_fs_incompat(info, EXTENT_TREE_V2))
+ break;
btrfs_set_and_info(info, CLEAR_CACHE,
"force clearing of disk cache");
break;
@@ -2383,6 +2455,7 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
{
struct btrfs_ioctl_vol_args *vol;
struct btrfs_device *device = NULL;
+ dev_t devt = 0;
int ret = -ENOTTY;
if (!capable(CAP_SYS_ADMIN))
@@ -2402,7 +2475,12 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
mutex_unlock(&uuid_mutex);
break;
case BTRFS_IOC_FORGET_DEV:
- ret = btrfs_forget_devices(vol->name);
+ if (vol->name[0] != 0) {
+ ret = lookup_bdev(vol->name, &devt);
+ if (ret)
+ break;
+ }
+ ret = btrfs_forget_devices(devt);
break;
case BTRFS_IOC_DEVICES_READY:
mutex_lock(&uuid_mutex);
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index beb7f72d50b8..17389a42a3ab 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -283,9 +283,11 @@ BTRFS_FEAT_ATTR_INCOMPAT(no_holes, NO_HOLES);
BTRFS_FEAT_ATTR_INCOMPAT(metadata_uuid, METADATA_UUID);
BTRFS_FEAT_ATTR_COMPAT_RO(free_space_tree, FREE_SPACE_TREE);
BTRFS_FEAT_ATTR_INCOMPAT(raid1c34, RAID1C34);
-/* Remove once support for zoned allocation is feature complete */
#ifdef CONFIG_BTRFS_DEBUG
+/* Remove once support for zoned allocation is feature complete */
BTRFS_FEAT_ATTR_INCOMPAT(zoned, ZONED);
+/* Remove once support for extent tree v2 is feature complete */
+BTRFS_FEAT_ATTR_INCOMPAT(extent_tree_v2, EXTENT_TREE_V2);
#endif
#ifdef CONFIG_FS_VERITY
BTRFS_FEAT_ATTR_COMPAT_RO(verity, VERITY);
@@ -314,6 +316,7 @@ static struct attribute *btrfs_supported_feature_attrs[] = {
BTRFS_FEAT_ATTR_PTR(raid1c34),
#ifdef CONFIG_BTRFS_DEBUG
BTRFS_FEAT_ATTR_PTR(zoned),
+ BTRFS_FEAT_ATTR_PTR(extent_tree_v2),
#endif
#ifdef CONFIG_FS_VERITY
BTRFS_FEAT_ATTR_PTR(verity),
@@ -1104,6 +1107,11 @@ static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj)
static char btrfs_unknown_feature_names[FEAT_MAX][NUM_FEATURE_BITS][BTRFS_FEATURE_NAME_MAX];
static struct btrfs_feature_attr btrfs_feature_attrs[FEAT_MAX][NUM_FEATURE_BITS];
+static_assert(ARRAY_SIZE(btrfs_unknown_feature_names) ==
+ ARRAY_SIZE(btrfs_feature_attrs));
+static_assert(ARRAY_SIZE(btrfs_unknown_feature_names[0]) ==
+ ARRAY_SIZE(btrfs_feature_attrs[0]));
+
static const u64 supported_feature_masks[FEAT_MAX] = {
[FEAT_COMPAT] = BTRFS_FEATURE_COMPAT_SUPP,
[FEAT_COMPAT_RO] = BTRFS_FEATURE_COMPAT_RO_SUPP,
@@ -1272,11 +1280,6 @@ static void init_feature_attrs(void)
struct btrfs_feature_attr *fa;
int set, i;
- BUILD_BUG_ON(ARRAY_SIZE(btrfs_unknown_feature_names) !=
- ARRAY_SIZE(btrfs_feature_attrs));
- BUILD_BUG_ON(ARRAY_SIZE(btrfs_unknown_feature_names[0]) !=
- ARRAY_SIZE(btrfs_feature_attrs[0]));
-
memset(btrfs_feature_attrs, 0, sizeof(btrfs_feature_attrs));
memset(btrfs_unknown_feature_names, 0,
sizeof(btrfs_unknown_feature_names));
diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c
index 319fed82d741..c5b3a631bf4f 100644
--- a/fs/btrfs/tests/extent-map-tests.c
+++ b/fs/btrfs/tests/extent-map-tests.c
@@ -15,6 +15,7 @@ static void free_extent_map_tree(struct extent_map_tree *em_tree)
struct extent_map *em;
struct rb_node *node;
+ write_lock(&em_tree->lock);
while (!RB_EMPTY_ROOT(&em_tree->map.rb_root)) {
node = rb_first_cached(&em_tree->map);
em = rb_entry(node, struct extent_map, rb_node);
@@ -32,6 +33,7 @@ static void free_extent_map_tree(struct extent_map_tree *em_tree)
#endif
free_extent_map(em);
}
+ write_unlock(&em_tree->lock);
}
/*
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 03de89b45f27..b008c5110958 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -854,7 +854,37 @@ btrfs_attach_transaction_barrier(struct btrfs_root *root)
static noinline void wait_for_commit(struct btrfs_transaction *commit,
const enum btrfs_trans_state min_state)
{
- wait_event(commit->commit_wait, commit->state >= min_state);
+ struct btrfs_fs_info *fs_info = commit->fs_info;
+ u64 transid = commit->transid;
+ bool put = false;
+
+ while (1) {
+ wait_event(commit->commit_wait, commit->state >= min_state);
+ if (put)
+ btrfs_put_transaction(commit);
+
+ if (min_state < TRANS_STATE_COMPLETED)
+ break;
+
+ /*
+ * A transaction isn't really completed until all of the
+ * previous transactions are completed, but with fsync we can
+ * end up with SUPER_COMMITTED transactions before a COMPLETED
+ * transaction. Wait for those.
+ */
+
+ spin_lock(&fs_info->trans_lock);
+ commit = list_first_entry_or_null(&fs_info->trans_list,
+ struct btrfs_transaction,
+ list);
+ if (!commit || commit->transid > transid) {
+ spin_unlock(&fs_info->trans_lock);
+ break;
+ }
+ refcount_inc(&commit->use_count);
+ put = true;
+ spin_unlock(&fs_info->trans_lock);
+ }
}
int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid)
@@ -1320,6 +1350,32 @@ again:
}
/*
+ * If we had a pending drop we need to see if there are any others left in our
+ * dead roots list, and if not clear our bit and wake any waiters.
+ */
+void btrfs_maybe_wake_unfinished_drop(struct btrfs_fs_info *fs_info)
+{
+ /*
+ * We put the drop in progress roots at the front of the list, so if the
+ * first entry doesn't have UNFINISHED_DROP set we can wake everybody
+ * up.
+ */
+ spin_lock(&fs_info->trans_lock);
+ if (!list_empty(&fs_info->dead_roots)) {
+ struct btrfs_root *root = list_first_entry(&fs_info->dead_roots,
+ struct btrfs_root,
+ root_list);
+ if (test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state)) {
+ spin_unlock(&fs_info->trans_lock);
+ return;
+ }
+ }
+ spin_unlock(&fs_info->trans_lock);
+
+ btrfs_wake_unfinished_drop(fs_info);
+}
+
+/*
* dead roots are old snapshots that need to be deleted. This allocates
* a dirty root struct and adds it into the list of dead roots that need to
* be deleted
@@ -1331,7 +1387,12 @@ void btrfs_add_dead_root(struct btrfs_root *root)
spin_lock(&fs_info->trans_lock);
if (list_empty(&root->root_list)) {
btrfs_grab_root(root);
- list_add_tail(&root->root_list, &fs_info->dead_roots);
+
+ /* We want to process the partially complete drops first. */
+ if (test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state))
+ list_add(&root->root_list, &fs_info->dead_roots);
+ else
+ list_add_tail(&root->root_list, &fs_info->dead_roots);
}
spin_unlock(&fs_info->trans_lock);
}
@@ -1850,6 +1911,14 @@ static void update_super_roots(struct btrfs_fs_info *fs_info)
super->cache_generation = 0;
if (test_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags))
super->uuid_tree_generation = root_item->generation;
+
+ if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
+ root_item = &fs_info->block_group_root->root_item;
+
+ super->block_group_root = root_item->bytenr;
+ super->block_group_root_generation = root_item->generation;
+ super->block_group_root_level = root_item->level;
+ }
}
int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
@@ -1981,16 +2050,24 @@ static void btrfs_cleanup_pending_block_groups(struct btrfs_trans_handle *trans)
static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info)
{
/*
- * We use writeback_inodes_sb here because if we used
+ * We use try_to_writeback_inodes_sb() here because if we used
* btrfs_start_delalloc_roots we would deadlock with fs freeze.
* Currently are holding the fs freeze lock, if we do an async flush
* we'll do btrfs_join_transaction() and deadlock because we need to
* wait for the fs freeze lock. Using the direct flushing we benefit
* from already being in a transaction and our join_transaction doesn't
* have to re-take the fs freeze lock.
+ *
+ * Note that try_to_writeback_inodes_sb() will only trigger writeback
+ * if it can read lock sb->s_umount. It will always be able to lock it,
+ * except when the filesystem is being unmounted or being frozen, but in
+ * those cases sync_filesystem() is called, which results in calling
+ * writeback_inodes_sb() while holding a write lock on sb->s_umount.
+ * Note that we don't call writeback_inodes_sb() directly, because it
+ * will emit a warning if sb->s_umount is not locked.
*/
if (btrfs_test_opt(fs_info, FLUSHONCOMMIT))
- writeback_inodes_sb(fs_info->sb, WB_REASON_SYNC);
+ try_to_writeback_inodes_sb(fs_info->sb, WB_REASON_SYNC);
return 0;
}
@@ -2000,6 +2077,27 @@ static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info)
btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
}
+/*
+ * Add a pending snapshot associated with the given transaction handle to the
+ * respective handle. This must be called after the transaction commit started
+ * and while holding fs_info->trans_lock.
+ * This serves to guarantee a caller of btrfs_commit_transaction() that it can
+ * safely free the pending snapshot pointer in case btrfs_commit_transaction()
+ * returns an error.
+ */
+static void add_pending_snapshot(struct btrfs_trans_handle *trans)
+{
+ struct btrfs_transaction *cur_trans = trans->transaction;
+
+ if (!trans->pending_snapshot)
+ return;
+
+ lockdep_assert_held(&trans->fs_info->trans_lock);
+ ASSERT(cur_trans->state >= TRANS_STATE_COMMIT_START);
+
+ list_add(&trans->pending_snapshot->list, &cur_trans->pending_snapshots);
+}
+
int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
@@ -2073,6 +2171,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
if (cur_trans->state >= TRANS_STATE_COMMIT_START) {
enum btrfs_trans_state want_state = TRANS_STATE_COMPLETED;
+ add_pending_snapshot(trans);
+
spin_unlock(&fs_info->trans_lock);
refcount_inc(&cur_trans->use_count);
@@ -2163,6 +2263,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
* COMMIT_DOING so make sure to wait for num_writers to == 1 again.
*/
spin_lock(&fs_info->trans_lock);
+ add_pending_snapshot(trans);
cur_trans->state = TRANS_STATE_COMMIT_DOING;
spin_unlock(&fs_info->trans_lock);
wait_event(cur_trans->writer_wait,
@@ -2269,6 +2370,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
list_add_tail(&fs_info->chunk_root->dirty_list,
&cur_trans->switch_commits);
+ if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
+ btrfs_set_root_node(&fs_info->block_group_root->root_item,
+ fs_info->block_group_root->node);
+ list_add_tail(&fs_info->block_group_root->dirty_list,
+ &cur_trans->switch_commits);
+ }
+
switch_commit_roots(trans);
ASSERT(list_empty(&cur_trans->dirty_bgs));
@@ -2397,10 +2505,10 @@ cleanup_transaction:
* because btrfs_commit_super will poke cleaner thread and it will process it a
* few seconds later.
*/
-int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root)
+int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info)
{
+ struct btrfs_root *root;
int ret;
- struct btrfs_fs_info *fs_info = root->fs_info;
spin_lock(&fs_info->trans_lock);
if (list_empty(&fs_info->dead_roots)) {
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 1852ed9de7fd..970ff316069d 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -123,6 +123,8 @@ struct btrfs_trans_handle {
struct btrfs_transaction *transaction;
struct btrfs_block_rsv *block_rsv;
struct btrfs_block_rsv *orig_rsv;
+ /* Set by a task that wants to create a snapshot. */
+ struct btrfs_pending_snapshot *pending_snapshot;
refcount_t use_count;
unsigned int type;
/*
@@ -214,7 +216,8 @@ int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid);
void btrfs_add_dead_root(struct btrfs_root *root);
int btrfs_defrag_root(struct btrfs_root *root);
-int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root);
+void btrfs_maybe_wake_unfinished_drop(struct btrfs_fs_info *fs_info);
+int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info);
int btrfs_commit_transaction(struct btrfs_trans_handle *trans);
void btrfs_commit_transaction_async(struct btrfs_trans_handle *trans);
int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans);
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index 72e1c942197d..e56c0107eea3 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -639,8 +639,10 @@ static void block_group_err(const struct extent_buffer *eb, int slot,
static int check_block_group_item(struct extent_buffer *leaf,
struct btrfs_key *key, int slot)
{
+ struct btrfs_fs_info *fs_info = leaf->fs_info;
struct btrfs_block_group_item bgi;
u32 item_size = btrfs_item_size(leaf, slot);
+ u64 chunk_objectid;
u64 flags;
u64 type;
@@ -663,8 +665,23 @@ static int check_block_group_item(struct extent_buffer *leaf,
read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot),
sizeof(bgi));
- if (unlikely(btrfs_stack_block_group_chunk_objectid(&bgi) !=
- BTRFS_FIRST_CHUNK_TREE_OBJECTID)) {
+ chunk_objectid = btrfs_stack_block_group_chunk_objectid(&bgi);
+ if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
+ /*
+ * We don't init the nr_global_roots until we load the global
+ * roots, so this could be 0 at mount time. If it's 0 we'll
+ * just assume we're fine, and later we'll check against our
+ * actual value.
+ */
+ if (unlikely(fs_info->nr_global_roots &&
+ chunk_objectid >= fs_info->nr_global_roots)) {
+ block_group_err(leaf, slot,
+ "invalid block group global root id, have %llu, needs to be <= %llu",
+ chunk_objectid,
+ fs_info->nr_global_roots);
+ return -EUCLEAN;
+ }
+ } else if (unlikely(chunk_objectid != BTRFS_FIRST_CHUNK_TREE_OBJECTID)) {
block_group_err(leaf, slot,
"invalid block group chunk objectid, have %llu expect %llu",
btrfs_stack_block_group_chunk_objectid(&bgi),
@@ -965,6 +982,7 @@ static int check_dev_item(struct extent_buffer *leaf,
struct btrfs_key *key, int slot)
{
struct btrfs_dev_item *ditem;
+ const u32 item_size = btrfs_item_size(leaf, slot);
if (unlikely(key->objectid != BTRFS_DEV_ITEMS_OBJECTID)) {
dev_item_err(leaf, slot,
@@ -972,6 +990,13 @@ static int check_dev_item(struct extent_buffer *leaf,
key->objectid, BTRFS_DEV_ITEMS_OBJECTID);
return -EUCLEAN;
}
+
+ if (unlikely(item_size != sizeof(*ditem))) {
+ dev_item_err(leaf, slot, "invalid item size: has %u expect %zu",
+ item_size, sizeof(*ditem));
+ return -EUCLEAN;
+ }
+
ditem = btrfs_item_ptr(leaf, slot, struct btrfs_dev_item);
if (unlikely(btrfs_device_id(leaf, ditem) != key->offset)) {
dev_item_err(leaf, slot,
@@ -1007,6 +1032,7 @@ static int check_inode_item(struct extent_buffer *leaf,
struct btrfs_inode_item *iitem;
u64 super_gen = btrfs_super_generation(fs_info->super_copy);
u32 valid_mask = (S_IFMT | S_ISUID | S_ISGID | S_ISVTX | 0777);
+ const u32 item_size = btrfs_item_size(leaf, slot);
u32 mode;
int ret;
u32 flags;
@@ -1016,6 +1042,12 @@ static int check_inode_item(struct extent_buffer *leaf,
if (unlikely(ret < 0))
return ret;
+ if (unlikely(item_size != sizeof(*iitem))) {
+ generic_err(leaf, slot, "invalid item size: has %u expect %zu",
+ item_size, sizeof(*iitem));
+ return -EUCLEAN;
+ }
+
iitem = btrfs_item_ptr(leaf, slot, struct btrfs_inode_item);
/* Here we use super block generation + 1 to handle log tree */
@@ -1633,7 +1665,6 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data)
/* These trees must never be empty */
if (unlikely(owner == BTRFS_ROOT_TREE_OBJECTID ||
owner == BTRFS_CHUNK_TREE_OBJECTID ||
- owner == BTRFS_EXTENT_TREE_OBJECTID ||
owner == BTRFS_DEV_TREE_OBJECTID ||
owner == BTRFS_FS_TREE_OBJECTID ||
owner == BTRFS_DATA_RELOC_TREE_OBJECTID)) {
@@ -1642,12 +1673,25 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data)
owner);
return -EUCLEAN;
}
+
/* Unknown tree */
if (unlikely(owner == 0)) {
generic_err(leaf, 0,
"invalid owner, root 0 is not defined");
return -EUCLEAN;
}
+
+ /* EXTENT_TREE_V2 can have empty extent trees. */
+ if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))
+ return 0;
+
+ if (unlikely(owner == BTRFS_EXTENT_TREE_OBJECTID)) {
+ generic_err(leaf, 0,
+ "invalid root, root %llu must never be empty",
+ owner);
+ return -EUCLEAN;
+ }
+
return 0;
}
@@ -1667,6 +1711,7 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data)
*/
for (slot = 0; slot < nritems; slot++) {
u32 item_end_expected;
+ u64 item_data_end;
int ret;
btrfs_item_key_to_cpu(leaf, &key, slot);
@@ -1681,6 +1726,8 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data)
return -EUCLEAN;
}
+ item_data_end = (u64)btrfs_item_offset(leaf, slot) +
+ btrfs_item_size(leaf, slot);
/*
* Make sure the offset and ends are right, remember that the
* item data starts at the end of the leaf and grows towards the
@@ -1691,11 +1738,10 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data)
else
item_end_expected = btrfs_item_offset(leaf,
slot - 1);
- if (unlikely(btrfs_item_data_end(leaf, slot) != item_end_expected)) {
+ if (unlikely(item_data_end != item_end_expected)) {
generic_err(leaf, slot,
- "unexpected item end, have %u expect %u",
- btrfs_item_data_end(leaf, slot),
- item_end_expected);
+ "unexpected item end, have %llu expect %u",
+ item_data_end, item_end_expected);
return -EUCLEAN;
}
@@ -1704,12 +1750,10 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data)
* just in case all the items are consistent to each other, but
* all point outside of the leaf.
*/
- if (unlikely(btrfs_item_data_end(leaf, slot) >
- BTRFS_LEAF_DATA_SIZE(fs_info))) {
+ if (unlikely(item_data_end > BTRFS_LEAF_DATA_SIZE(fs_info))) {
generic_err(leaf, slot,
- "slot end outside of leaf, have %u expect range [0, %u]",
- btrfs_item_data_end(leaf, slot),
- BTRFS_LEAF_DATA_SIZE(fs_info));
+ "slot end outside of leaf, have %llu expect range [0, %u]",
+ item_data_end, BTRFS_LEAF_DATA_SIZE(fs_info));
return -EUCLEAN;
}
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index c1ddbe800897..571dae8ad65e 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -270,12 +270,6 @@ void btrfs_end_log_trans(struct btrfs_root *root)
}
}
-static int btrfs_write_tree_block(struct extent_buffer *buf)
-{
- return filemap_fdatawrite_range(buf->pages[0]->mapping, buf->start,
- buf->start + buf->len - 1);
-}
-
static void btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
{
filemap_fdatawait_range(buf->pages[0]->mapping,
@@ -294,16 +288,6 @@ struct walk_control {
*/
int free;
- /* should we write out the extent buffer? This is used
- * while flushing the log tree to disk during a sync
- */
- int write;
-
- /* should we wait for the extent buffer io to finish? Also used
- * while flushing the log tree to disk for a sync
- */
- int wait;
-
/* pin only walk, we record which extents on disk belong to the
* log trees
*/
@@ -354,17 +338,15 @@ static int process_one_buffer(struct btrfs_root *log,
return ret;
}
- if (wc->pin)
+ if (wc->pin) {
ret = btrfs_pin_extent_for_log_replay(wc->trans, eb->start,
eb->len);
+ if (ret)
+ return ret;
- if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) {
- if (wc->pin && btrfs_header_level(eb) == 0)
+ if (btrfs_buffer_uptodate(eb, gen, 0) &&
+ btrfs_header_level(eb) == 0)
ret = btrfs_exclude_logged_extents(eb);
- if (wc->write)
- btrfs_write_tree_block(eb);
- if (wc->wait)
- btrfs_wait_tree_block_writeback(eb);
}
return ret;
}
@@ -917,6 +899,26 @@ out:
return ret;
}
+static int unlink_inode_for_log_replay(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *dir,
+ struct btrfs_inode *inode,
+ const char *name,
+ int name_len)
+{
+ int ret;
+
+ ret = btrfs_unlink_inode(trans, dir, inode, name, name_len);
+ if (ret)
+ return ret;
+ /*
+ * Whenever we need to check if a name exists or not, we check the
+ * fs/subvolume tree. So after an unlink we must run delayed items, so
+ * that future checks for a name during log replay see that the name
+ * does not exists anymore.
+ */
+ return btrfs_run_delayed_items(trans);
+}
+
/*
* when cleaning up conflicts between the directory names in the
* subvolume, directory names in the log and directory names in the
@@ -959,12 +961,8 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
if (ret)
goto out;
- ret = btrfs_unlink_inode(trans, dir, BTRFS_I(inode), name,
+ ret = unlink_inode_for_log_replay(trans, dir, BTRFS_I(inode), name,
name_len);
- if (ret)
- goto out;
- else
- ret = btrfs_run_delayed_items(trans);
out:
kfree(name);
iput(inode);
@@ -1124,14 +1122,11 @@ again:
inc_nlink(&inode->vfs_inode);
btrfs_release_path(path);
- ret = btrfs_unlink_inode(trans, dir, inode,
+ ret = unlink_inode_for_log_replay(trans, dir, inode,
victim_name, victim_name_len);
kfree(victim_name);
if (ret)
return ret;
- ret = btrfs_run_delayed_items(trans);
- if (ret)
- return ret;
*search_done = 1;
goto again;
}
@@ -1196,14 +1191,11 @@ again:
inc_nlink(&inode->vfs_inode);
btrfs_release_path(path);
- ret = btrfs_unlink_inode(trans,
+ ret = unlink_inode_for_log_replay(trans,
BTRFS_I(victim_parent),
inode,
victim_name,
victim_name_len);
- if (!ret)
- ret = btrfs_run_delayed_items(
- trans);
}
iput(victim_parent);
kfree(victim_name);
@@ -1358,7 +1350,7 @@ again:
kfree(name);
goto out;
}
- ret = btrfs_unlink_inode(trans, BTRFS_I(dir),
+ ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir),
inode, name, namelen);
kfree(name);
iput(dir);
@@ -1457,8 +1449,8 @@ static int add_link(struct btrfs_trans_handle *trans,
ret = -ENOENT;
goto out;
}
- ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(other_inode),
- name, namelen);
+ ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), BTRFS_I(other_inode),
+ name, namelen);
if (ret)
goto out;
/*
@@ -1467,10 +1459,6 @@ static int add_link(struct btrfs_trans_handle *trans,
*/
if (other_inode->i_nlink == 0)
inc_nlink(other_inode);
-
- ret = btrfs_run_delayed_items(trans);
- if (ret)
- goto out;
add_link:
ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
name, namelen, 0, ref_index);
@@ -1603,7 +1591,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
ret = btrfs_inode_ref_exists(inode, dir, key->type,
name, namelen);
if (ret > 0) {
- ret = btrfs_unlink_inode(trans,
+ ret = unlink_inode_for_log_replay(trans,
BTRFS_I(dir),
BTRFS_I(inode),
name, namelen);
@@ -2350,15 +2338,8 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
goto out;
inc_nlink(inode);
- ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(inode), name,
- name_len);
- if (ret)
- goto out;
-
- ret = btrfs_run_delayed_items(trans);
- if (ret)
- goto out;
-
+ ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), BTRFS_I(inode),
+ name, name_len);
/*
* Unlike dir item keys, dir index keys can only have one name (entry) in
* them, as there are no key collisions since each key has a unique offset
@@ -3414,6 +3395,29 @@ static void free_log_tree(struct btrfs_trans_handle *trans,
if (log->node) {
ret = walk_log_tree(trans, log, &wc);
if (ret) {
+ /*
+ * We weren't able to traverse the entire log tree, the
+ * typical scenario is getting an -EIO when reading an
+ * extent buffer of the tree, due to a previous writeback
+ * failure of it.
+ */
+ set_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR,
+ &log->fs_info->fs_state);
+
+ /*
+ * Some extent buffers of the log tree may still be dirty
+ * and not yet written back to storage, because we may
+ * have updates to a log tree without syncing a log tree,
+ * such as during rename and link operations. So flush
+ * them out and wait for their writeback to complete, so
+ * that we properly cleanup their state and pages.
+ */
+ btrfs_write_marked_extents(log->fs_info,
+ &log->dirty_log_pages,
+ EXTENT_DIRTY | EXTENT_NEW);
+ btrfs_wait_tree_log_extents(log,
+ EXTENT_DIRTY | EXTENT_NEW);
+
if (trans)
btrfs_abort_transaction(trans, ret);
else
@@ -3454,35 +3458,156 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
}
/*
- * Check if an inode was logged in the current transaction. This may often
- * return some false positives, because logged_trans is an in memory only field,
- * not persisted anywhere. This is meant to be used in contexts where a false
- * positive has no functional consequences.
+ * Check if an inode was logged in the current transaction. This correctly deals
+ * with the case where the inode was logged but has a logged_trans of 0, which
+ * happens if the inode is evicted and loaded again, as logged_trans is an in
+ * memory only field (not persisted).
+ *
+ * Returns 1 if the inode was logged before in the transaction, 0 if it was not,
+ * and < 0 on error.
*/
-static bool inode_logged(struct btrfs_trans_handle *trans,
- struct btrfs_inode *inode)
+static int inode_logged(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct btrfs_path *path_in)
{
+ struct btrfs_path *path = path_in;
+ struct btrfs_key key;
+ int ret;
+
if (inode->logged_trans == trans->transid)
- return true;
+ return 1;
- if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &inode->root->state))
- return false;
+ /*
+ * If logged_trans is not 0, then we know the inode logged was not logged
+ * in this transaction, so we can return false right away.
+ */
+ if (inode->logged_trans > 0)
+ return 0;
/*
- * The inode's logged_trans is always 0 when we load it (because it is
- * not persisted in the inode item or elsewhere). So if it is 0, the
- * inode was last modified in the current transaction then the inode may
- * have been logged before in the current transaction, then evicted and
- * loaded again in the current transaction - or may have never been logged
- * in the current transaction, but since we can not be sure, we have to
- * assume it was, otherwise our callers can leave an inconsistent log.
+ * If no log tree was created for this root in this transaction, then
+ * the inode can not have been logged in this transaction. In that case
+ * set logged_trans to anything greater than 0 and less than the current
+ * transaction's ID, to avoid the search below in a future call in case
+ * a log tree gets created after this.
*/
- if (inode->logged_trans == 0 &&
- inode->last_trans == trans->transid &&
- !test_bit(BTRFS_FS_LOG_RECOVERING, &trans->fs_info->flags))
- return true;
+ if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &inode->root->state)) {
+ inode->logged_trans = trans->transid - 1;
+ return 0;
+ }
- return false;
+ /*
+ * We have a log tree and the inode's logged_trans is 0. We can't tell
+ * for sure if the inode was logged before in this transaction by looking
+ * only at logged_trans. We could be pessimistic and assume it was, but
+ * that can lead to unnecessarily logging an inode during rename and link
+ * operations, and then further updating the log in followup rename and
+ * link operations, specially if it's a directory, which adds latency
+ * visible to applications doing a series of rename or link operations.
+ *
+ * A logged_trans of 0 here can mean several things:
+ *
+ * 1) The inode was never logged since the filesystem was mounted, and may
+ * or may have not been evicted and loaded again;
+ *
+ * 2) The inode was logged in a previous transaction, then evicted and
+ * then loaded again;
+ *
+ * 3) The inode was logged in the current transaction, then evicted and
+ * then loaded again.
+ *
+ * For cases 1) and 2) we don't want to return true, but we need to detect
+ * case 3) and return true. So we do a search in the log root for the inode
+ * item.
+ */
+ key.objectid = btrfs_ino(inode);
+ key.type = BTRFS_INODE_ITEM_KEY;
+ key.offset = 0;
+
+ if (!path) {
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ }
+
+ ret = btrfs_search_slot(NULL, inode->root->log_root, &key, path, 0, 0);
+
+ if (path_in)
+ btrfs_release_path(path);
+ else
+ btrfs_free_path(path);
+
+ /*
+ * Logging an inode always results in logging its inode item. So if we
+ * did not find the item we know the inode was not logged for sure.
+ */
+ if (ret < 0) {
+ return ret;
+ } else if (ret > 0) {
+ /*
+ * Set logged_trans to a value greater than 0 and less then the
+ * current transaction to avoid doing the search in future calls.
+ */
+ inode->logged_trans = trans->transid - 1;
+ return 0;
+ }
+
+ /*
+ * The inode was previously logged and then evicted, set logged_trans to
+ * the current transacion's ID, to avoid future tree searches as long as
+ * the inode is not evicted again.
+ */
+ inode->logged_trans = trans->transid;
+
+ /*
+ * If it's a directory, then we must set last_dir_index_offset to the
+ * maximum possible value, so that the next attempt to log the inode does
+ * not skip checking if dir index keys found in modified subvolume tree
+ * leaves have been logged before, otherwise it would result in attempts
+ * to insert duplicate dir index keys in the log tree. This must be done
+ * because last_dir_index_offset is an in-memory only field, not persisted
+ * in the inode item or any other on-disk structure, so its value is lost
+ * once the inode is evicted.
+ */
+ if (S_ISDIR(inode->vfs_inode.i_mode))
+ inode->last_dir_index_offset = (u64)-1;
+
+ return 1;
+}
+
+/*
+ * Delete a directory entry from the log if it exists.
+ *
+ * Returns < 0 on error
+ * 1 if the entry does not exists
+ * 0 if the entry existed and was successfully deleted
+ */
+static int del_logged_dentry(struct btrfs_trans_handle *trans,
+ struct btrfs_root *log,
+ struct btrfs_path *path,
+ u64 dir_ino,
+ const char *name, int name_len,
+ u64 index)
+{
+ struct btrfs_dir_item *di;
+
+ /*
+ * We only log dir index items of a directory, so we don't need to look
+ * for dir item keys.
+ */
+ di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino,
+ index, name, name_len, -1);
+ if (IS_ERR(di))
+ return PTR_ERR(di);
+ else if (!di)
+ return 1;
+
+ /*
+ * We do not need to update the size field of the directory's
+ * inode item because on log replay we update the field to reflect
+ * all existing entries in the directory (see overwrite_item()).
+ */
+ return btrfs_delete_one_dir_name(trans, log, path, di);
}
/*
@@ -3511,15 +3636,16 @@ void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
const char *name, int name_len,
struct btrfs_inode *dir, u64 index)
{
- struct btrfs_root *log;
- struct btrfs_dir_item *di;
struct btrfs_path *path;
int ret;
- int err = 0;
- u64 dir_ino = btrfs_ino(dir);
- if (!inode_logged(trans, dir))
+ ret = inode_logged(trans, dir, NULL);
+ if (ret == 0)
return;
+ else if (ret < 0) {
+ btrfs_set_log_full_commit(trans);
+ return;
+ }
ret = join_running_log_trans(root);
if (ret)
@@ -3527,41 +3653,18 @@ void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
mutex_lock(&dir->log_mutex);
- log = root->log_root;
path = btrfs_alloc_path();
if (!path) {
- err = -ENOMEM;
+ ret = -ENOMEM;
goto out_unlock;
}
- /*
- * We only log dir index items of a directory, so we don't need to look
- * for dir item keys.
- */
- di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino,
- index, name, name_len, -1);
- if (IS_ERR(di)) {
- err = PTR_ERR(di);
- goto fail;
- }
- if (di) {
- ret = btrfs_delete_one_dir_name(trans, log, path, di);
- if (ret) {
- err = ret;
- goto fail;
- }
- }
-
- /*
- * We do not need to update the size field of the directory's inode item
- * because on log replay we update the field to reflect all existing
- * entries in the directory (see overwrite_item()).
- */
-fail:
+ ret = del_logged_dentry(trans, root->log_root, path, btrfs_ino(dir),
+ name, name_len, index);
btrfs_free_path(path);
out_unlock:
mutex_unlock(&dir->log_mutex);
- if (err < 0)
+ if (ret < 0)
btrfs_set_log_full_commit(trans);
btrfs_end_log_trans(root);
}
@@ -3576,8 +3679,13 @@ void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
u64 index;
int ret;
- if (!inode_logged(trans, inode))
+ ret = inode_logged(trans, inode, NULL);
+ if (ret == 0)
return;
+ else if (ret < 0) {
+ btrfs_set_log_full_commit(trans);
+ return;
+ }
ret = join_running_log_trans(root);
if (ret)
@@ -3702,19 +3810,20 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode,
struct btrfs_path *path,
struct btrfs_path *dst_path,
- struct btrfs_log_ctx *ctx)
+ struct btrfs_log_ctx *ctx,
+ u64 *last_old_dentry_offset)
{
struct btrfs_root *log = inode->root->log_root;
struct extent_buffer *src = path->nodes[0];
const int nritems = btrfs_header_nritems(src);
const u64 ino = btrfs_ino(inode);
- const bool inode_logged_before = inode_logged(trans, inode);
bool last_found = false;
int batch_start = 0;
int batch_size = 0;
int i;
for (i = path->slots[0]; i < nritems; i++) {
+ struct btrfs_dir_item *di;
struct btrfs_key key;
int ret;
@@ -3725,7 +3834,34 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans,
break;
}
+ di = btrfs_item_ptr(src, i, struct btrfs_dir_item);
ctx->last_dir_item_offset = key.offset;
+
+ /*
+ * Skip ranges of items that consist only of dir item keys created
+ * in past transactions. However if we find a gap, we must log a
+ * dir index range item for that gap, so that index keys in that
+ * gap are deleted during log replay.
+ */
+ if (btrfs_dir_transid(src, di) < trans->transid) {
+ if (key.offset > *last_old_dentry_offset + 1) {
+ ret = insert_dir_log_key(trans, log, dst_path,
+ ino, *last_old_dentry_offset + 1,
+ key.offset - 1);
+ /*
+ * -EEXIST should never happen because when we
+ * log a directory in full mode (LOG_INODE_ALL)
+ * we drop all BTRFS_DIR_LOG_INDEX_KEY keys from
+ * the log tree.
+ */
+ ASSERT(ret != -EEXIST);
+ if (ret < 0)
+ return ret;
+ }
+
+ *last_old_dentry_offset = key.offset;
+ continue;
+ }
/*
* We must make sure that when we log a directory entry, the
* corresponding inode, after log replay, has a matching link
@@ -3749,25 +3885,23 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans,
* resulting in -ENOTEMPTY errors.
*/
if (!ctx->log_new_dentries) {
- struct btrfs_dir_item *di;
struct btrfs_key di_key;
- di = btrfs_item_ptr(src, i, struct btrfs_dir_item);
btrfs_dir_item_key_to_cpu(src, di, &di_key);
- if ((btrfs_dir_transid(src, di) == trans->transid ||
- btrfs_dir_type(src, di) == BTRFS_FT_DIR) &&
- di_key.type != BTRFS_ROOT_ITEM_KEY)
+ if (di_key.type != BTRFS_ROOT_ITEM_KEY)
ctx->log_new_dentries = true;
}
- if (!inode_logged_before)
+ if (!ctx->logged_before)
goto add_to_batch;
/*
* If we were logged before and have logged dir items, we can skip
* checking if any item with a key offset larger than the last one
* we logged is in the log tree, saving time and avoiding adding
- * contention on the log tree.
+ * contention on the log tree. We can only rely on the value of
+ * last_dir_index_offset when we know for sure that the inode was
+ * previously logged in the current transaction.
*/
if (key.offset > inode->last_dir_index_offset)
goto add_to_batch;
@@ -3837,7 +3971,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
struct btrfs_root *log = root->log_root;
int err = 0;
int ret;
- u64 first_offset = min_offset;
+ u64 last_old_dentry_offset = min_offset - 1;
u64 last_offset = (u64)-1;
u64 ino = btrfs_ino(inode);
@@ -3871,10 +4005,11 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
*/
if (ret == 0) {
struct btrfs_key tmp;
+
btrfs_item_key_to_cpu(path->nodes[0], &tmp,
path->slots[0]);
if (tmp.type == BTRFS_DIR_INDEX_KEY)
- first_offset = max(min_offset, tmp.offset) + 1;
+ last_old_dentry_offset = tmp.offset;
}
goto done;
}
@@ -3883,17 +4018,18 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
ret = btrfs_previous_item(root, path, ino, BTRFS_DIR_INDEX_KEY);
if (ret == 0) {
struct btrfs_key tmp;
+
btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
- if (tmp.type == BTRFS_DIR_INDEX_KEY) {
- first_offset = tmp.offset;
- ret = overwrite_item(trans, log, dst_path,
- path->nodes[0], path->slots[0],
- &tmp);
- if (ret) {
- err = ret;
- goto done;
- }
- }
+ /*
+ * The dir index key before the first one we found that needs to
+ * be logged might be in a previous leaf, and there might be a
+ * gap between these keys, meaning that we had deletions that
+ * happened. So the key range item we log (key type
+ * BTRFS_DIR_LOG_INDEX_KEY) must cover a range that starts at the
+ * previous key's offset plus 1, so that those deletes are replayed.
+ */
+ if (tmp.type == BTRFS_DIR_INDEX_KEY)
+ last_old_dentry_offset = tmp.offset;
}
btrfs_release_path(path);
@@ -3915,7 +4051,8 @@ search:
* from our directory
*/
while (1) {
- ret = process_dir_items_leaf(trans, inode, path, dst_path, ctx);
+ ret = process_dir_items_leaf(trans, inode, path, dst_path, ctx,
+ &last_old_dentry_offset);
if (ret != 0) {
if (ret < 0)
err = ret;
@@ -3941,14 +4078,16 @@ search:
goto done;
}
if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
- ctx->last_dir_item_offset = min_key.offset;
- ret = overwrite_item(trans, log, dst_path,
- path->nodes[0], path->slots[0],
- &min_key);
- if (ret)
- err = ret;
- else
- last_offset = min_key.offset;
+ /*
+ * The next leaf was not changed in the current transaction
+ * and has at least one dir index key.
+ * We check for the next key because there might have been
+ * one or more deletions between the last key we logged and
+ * that next key. So the key range item we log (key type
+ * BTRFS_DIR_LOG_INDEX_KEY) must end at the next key's
+ * offset minus 1, so that those deletes are replayed.
+ */
+ last_offset = min_key.offset - 1;
goto done;
}
if (need_resched()) {
@@ -3964,13 +4103,21 @@ done:
if (err == 0) {
*last_offset_ret = last_offset;
/*
- * insert the log range keys to indicate where the log
- * is valid
+ * In case the leaf was changed in the current transaction but
+ * all its dir items are from a past transaction, the last item
+ * in the leaf is a dir item and there's no gap between that last
+ * dir item and the first one on the next leaf (which did not
+ * change in the current transaction), then we don't need to log
+ * a range, last_old_dentry_offset is == to last_offset.
*/
- ret = insert_dir_log_key(trans, log, path, ino, first_offset,
- last_offset);
- if (ret)
- err = ret;
+ ASSERT(last_old_dentry_offset <= last_offset);
+ if (last_old_dentry_offset < last_offset) {
+ ret = insert_dir_log_key(trans, log, path, ino,
+ last_old_dentry_offset + 1,
+ last_offset);
+ if (ret)
+ err = ret;
+ }
}
return err;
}
@@ -3997,22 +4144,7 @@ static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
u64 max_key;
int ret;
- /*
- * If this is the first time we are being logged in the current
- * transaction, or we were logged before but the inode was evicted and
- * reloaded later, in which case its logged_trans is 0, reset the value
- * of the last logged key offset. Note that we don't use the helper
- * function inode_logged() here - that is because the function returns
- * true after an inode eviction, assuming the worst case as it can not
- * know for sure if the inode was logged before. So we can not skip key
- * searches in the case the inode was evicted, because it may not have
- * been logged in this transaction and may have been logged in a past
- * transaction, so we need to reset the last dir index offset to (u64)-1.
- */
- if (inode->logged_trans != trans->transid)
- inode->last_dir_index_offset = (u64)-1;
-
- min_key = 0;
+ min_key = BTRFS_DIR_START_INDEX;
max_key = 0;
ctx->last_dir_item_offset = inode->last_dir_index_offset;
@@ -4048,9 +4180,6 @@ static int drop_inode_items(struct btrfs_trans_handle *trans,
struct btrfs_key found_key;
int start_slot;
- if (!inode_logged(trans, inode))
- return 0;
-
key.objectid = btrfs_ino(inode);
key.type = max_key_type;
key.offset = (u64)-1;
@@ -4270,23 +4399,18 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
int start_slot, int nr, int inode_only,
u64 logged_isize)
{
- struct btrfs_fs_info *fs_info = trans->fs_info;
- unsigned long src_offset;
- unsigned long dst_offset;
struct btrfs_root *log = inode->root->log_root;
struct btrfs_file_extent_item *extent;
- struct btrfs_inode_item *inode_item;
struct extent_buffer *src = src_path->nodes[0];
- int ret;
+ int ret = 0;
struct btrfs_key *ins_keys;
u32 *ins_sizes;
struct btrfs_item_batch batch;
char *ins_data;
int i;
- struct list_head ordered_sums;
- int skip_csum = inode->flags & BTRFS_INODE_NODATASUM;
-
- INIT_LIST_HEAD(&ordered_sums);
+ int dst_index;
+ const bool skip_csum = (inode->flags & BTRFS_INODE_NODATASUM);
+ const u64 i_size = i_size_read(&inode->vfs_inode);
ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
nr * sizeof(u32), GFP_NOFS);
@@ -4298,28 +4422,152 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
batch.keys = ins_keys;
batch.data_sizes = ins_sizes;
batch.total_data_size = 0;
- batch.nr = nr;
+ batch.nr = 0;
+ dst_index = 0;
for (i = 0; i < nr; i++) {
- ins_sizes[i] = btrfs_item_size(src, i + start_slot);
- batch.total_data_size += ins_sizes[i];
- btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot);
+ const int src_slot = start_slot + i;
+ struct btrfs_root *csum_root;
+ struct btrfs_ordered_sum *sums;
+ struct btrfs_ordered_sum *sums_next;
+ LIST_HEAD(ordered_sums);
+ u64 disk_bytenr;
+ u64 disk_num_bytes;
+ u64 extent_offset;
+ u64 extent_num_bytes;
+ bool is_old_extent;
+
+ btrfs_item_key_to_cpu(src, &ins_keys[dst_index], src_slot);
+
+ if (ins_keys[dst_index].type != BTRFS_EXTENT_DATA_KEY)
+ goto add_to_batch;
+
+ extent = btrfs_item_ptr(src, src_slot,
+ struct btrfs_file_extent_item);
+
+ is_old_extent = (btrfs_file_extent_generation(src, extent) <
+ trans->transid);
+
+ /*
+ * Don't copy extents from past generations. That would make us
+ * log a lot more metadata for common cases like doing only a
+ * few random writes into a file and then fsync it for the first
+ * time or after the full sync flag is set on the inode. We can
+ * get leaves full of extent items, most of which are from past
+ * generations, so we can skip them - as long as the inode has
+ * not been the target of a reflink operation in this transaction,
+ * as in that case it might have had file extent items with old
+ * generations copied into it. We also must always log prealloc
+ * extents that start at or beyond eof, otherwise we would lose
+ * them on log replay.
+ */
+ if (is_old_extent &&
+ ins_keys[dst_index].offset < i_size &&
+ inode->last_reflink_trans < trans->transid)
+ continue;
+
+ if (skip_csum)
+ goto add_to_batch;
+
+ /* Only regular extents have checksums. */
+ if (btrfs_file_extent_type(src, extent) != BTRFS_FILE_EXTENT_REG)
+ goto add_to_batch;
+
+ /*
+ * If it's an extent created in a past transaction, then its
+ * checksums are already accessible from the committed csum tree,
+ * no need to log them.
+ */
+ if (is_old_extent)
+ goto add_to_batch;
+
+ disk_bytenr = btrfs_file_extent_disk_bytenr(src, extent);
+ /* If it's an explicit hole, there are no checksums. */
+ if (disk_bytenr == 0)
+ goto add_to_batch;
+
+ disk_num_bytes = btrfs_file_extent_disk_num_bytes(src, extent);
+
+ if (btrfs_file_extent_compression(src, extent)) {
+ extent_offset = 0;
+ extent_num_bytes = disk_num_bytes;
+ } else {
+ extent_offset = btrfs_file_extent_offset(src, extent);
+ extent_num_bytes = btrfs_file_extent_num_bytes(src, extent);
+ }
+
+ csum_root = btrfs_csum_root(trans->fs_info, disk_bytenr);
+ disk_bytenr += extent_offset;
+ ret = btrfs_lookup_csums_range(csum_root, disk_bytenr,
+ disk_bytenr + extent_num_bytes - 1,
+ &ordered_sums, 0);
+ if (ret)
+ goto out;
+
+ list_for_each_entry_safe(sums, sums_next, &ordered_sums, list) {
+ if (!ret)
+ ret = log_csums(trans, inode, log, sums);
+ list_del(&sums->list);
+ kfree(sums);
+ }
+ if (ret)
+ goto out;
+
+add_to_batch:
+ ins_sizes[dst_index] = btrfs_item_size(src, src_slot);
+ batch.total_data_size += ins_sizes[dst_index];
+ batch.nr++;
+ dst_index++;
}
+
+ /*
+ * We have a leaf full of old extent items that don't need to be logged,
+ * so we don't need to do anything.
+ */
+ if (batch.nr == 0)
+ goto out;
+
ret = btrfs_insert_empty_items(trans, log, dst_path, &batch);
- if (ret) {
- kfree(ins_data);
- return ret;
- }
+ if (ret)
+ goto out;
+
+ dst_index = 0;
+ for (i = 0; i < nr; i++) {
+ const int src_slot = start_slot + i;
+ const int dst_slot = dst_path->slots[0] + dst_index;
+ struct btrfs_key key;
+ unsigned long src_offset;
+ unsigned long dst_offset;
+
+ /*
+ * We're done, all the remaining items in the source leaf
+ * correspond to old file extent items.
+ */
+ if (dst_index >= batch.nr)
+ break;
+
+ btrfs_item_key_to_cpu(src, &key, src_slot);
- for (i = 0; i < nr; i++, dst_path->slots[0]++) {
- dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
- dst_path->slots[0]);
+ if (key.type != BTRFS_EXTENT_DATA_KEY)
+ goto copy_item;
- src_offset = btrfs_item_ptr_offset(src, start_slot + i);
+ extent = btrfs_item_ptr(src, src_slot,
+ struct btrfs_file_extent_item);
- if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
- inode_item = btrfs_item_ptr(dst_path->nodes[0],
- dst_path->slots[0],
+ /* See the comment in the previous loop, same logic. */
+ if (btrfs_file_extent_generation(src, extent) < trans->transid &&
+ key.offset < i_size &&
+ inode->last_reflink_trans < trans->transid)
+ continue;
+
+copy_item:
+ dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], dst_slot);
+ src_offset = btrfs_item_ptr_offset(src, src_slot);
+
+ if (key.type == BTRFS_INODE_ITEM_KEY) {
+ struct btrfs_inode_item *inode_item;
+
+ inode_item = btrfs_item_ptr(dst_path->nodes[0], dst_slot,
struct btrfs_inode_item);
fill_inode_item(trans, dst_path->nodes[0], inode_item,
&inode->vfs_inode,
@@ -4327,71 +4575,17 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
logged_isize);
} else {
copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
- src_offset, ins_sizes[i]);
+ src_offset, ins_sizes[dst_index]);
}
- /* take a reference on file data extents so that truncates
- * or deletes of this inode don't have to relog the inode
- * again
- */
- if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY &&
- !skip_csum) {
- int found_type;
- extent = btrfs_item_ptr(src, start_slot + i,
- struct btrfs_file_extent_item);
-
- if (btrfs_file_extent_generation(src, extent) < trans->transid)
- continue;
-
- found_type = btrfs_file_extent_type(src, extent);
- if (found_type == BTRFS_FILE_EXTENT_REG) {
- struct btrfs_root *csum_root;
- u64 ds, dl, cs, cl;
- ds = btrfs_file_extent_disk_bytenr(src,
- extent);
- /* ds == 0 is a hole */
- if (ds == 0)
- continue;
-
- dl = btrfs_file_extent_disk_num_bytes(src,
- extent);
- cs = btrfs_file_extent_offset(src, extent);
- cl = btrfs_file_extent_num_bytes(src,
- extent);
- if (btrfs_file_extent_compression(src,
- extent)) {
- cs = 0;
- cl = dl;
- }
-
- csum_root = btrfs_csum_root(fs_info, ds);
- ret = btrfs_lookup_csums_range(csum_root,
- ds + cs, ds + cs + cl - 1,
- &ordered_sums, 0);
- if (ret)
- break;
- }
- }
+ dst_index++;
}
btrfs_mark_buffer_dirty(dst_path->nodes[0]);
btrfs_release_path(dst_path);
+out:
kfree(ins_data);
- /*
- * we have to do this after the loop above to avoid changing the
- * log tree while trying to change the log tree.
- */
- while (!list_empty(&ordered_sums)) {
- struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
- struct btrfs_ordered_sum,
- list);
- if (!ret)
- ret = log_csums(trans, inode, log, sums);
- list_del(&sums->list);
- kfree(sums);
- }
-
return ret;
}
@@ -4527,14 +4721,34 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
{
struct btrfs_drop_extents_args drop_args = { 0 };
struct btrfs_root *log = inode->root->log_root;
- struct btrfs_file_extent_item *fi;
+ struct btrfs_file_extent_item fi = { 0 };
struct extent_buffer *leaf;
- struct btrfs_map_token token;
struct btrfs_key key;
u64 extent_offset = em->start - em->orig_start;
u64 block_len;
int ret;
+ btrfs_set_stack_file_extent_generation(&fi, trans->transid);
+ if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+ btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_PREALLOC);
+ else
+ btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_REG);
+
+ block_len = max(em->block_len, em->orig_block_len);
+ if (em->compress_type != BTRFS_COMPRESS_NONE) {
+ btrfs_set_stack_file_extent_disk_bytenr(&fi, em->block_start);
+ btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len);
+ } else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
+ btrfs_set_stack_file_extent_disk_bytenr(&fi, em->block_start -
+ extent_offset);
+ btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len);
+ }
+
+ btrfs_set_stack_file_extent_offset(&fi, extent_offset);
+ btrfs_set_stack_file_extent_num_bytes(&fi, em->len);
+ btrfs_set_stack_file_extent_ram_bytes(&fi, em->ram_bytes);
+ btrfs_set_stack_file_extent_compression(&fi, em->compress_type);
+
ret = log_extent_csums(trans, inode, log, em, ctx);
if (ret)
return ret;
@@ -4548,12 +4762,12 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
* are small, with a root at level 2 or 3 at most, due to their short
* life span.
*/
- if (inode_logged(trans, inode)) {
+ if (ctx->logged_before) {
drop_args.path = path;
drop_args.start = em->start;
drop_args.end = em->start + em->len;
drop_args.replace_extent = true;
- drop_args.extent_item_size = sizeof(*fi);
+ drop_args.extent_item_size = sizeof(fi);
ret = btrfs_drop_extents(trans, log, inode, &drop_args);
if (ret)
return ret;
@@ -4565,44 +4779,14 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
key.offset = em->start;
ret = btrfs_insert_empty_item(trans, log, path, &key,
- sizeof(*fi));
+ sizeof(fi));
if (ret)
return ret;
}
leaf = path->nodes[0];
- btrfs_init_map_token(&token, leaf);
- fi = btrfs_item_ptr(leaf, path->slots[0],
- struct btrfs_file_extent_item);
-
- btrfs_set_token_file_extent_generation(&token, fi, trans->transid);
- if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
- btrfs_set_token_file_extent_type(&token, fi,
- BTRFS_FILE_EXTENT_PREALLOC);
- else
- btrfs_set_token_file_extent_type(&token, fi,
- BTRFS_FILE_EXTENT_REG);
-
- block_len = max(em->block_len, em->orig_block_len);
- if (em->compress_type != BTRFS_COMPRESS_NONE) {
- btrfs_set_token_file_extent_disk_bytenr(&token, fi,
- em->block_start);
- btrfs_set_token_file_extent_disk_num_bytes(&token, fi, block_len);
- } else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
- btrfs_set_token_file_extent_disk_bytenr(&token, fi,
- em->block_start -
- extent_offset);
- btrfs_set_token_file_extent_disk_num_bytes(&token, fi, block_len);
- } else {
- btrfs_set_token_file_extent_disk_bytenr(&token, fi, 0);
- btrfs_set_token_file_extent_disk_num_bytes(&token, fi, 0);
- }
-
- btrfs_set_token_file_extent_offset(&token, fi, extent_offset);
- btrfs_set_token_file_extent_num_bytes(&token, fi, em->len);
- btrfs_set_token_file_extent_ram_bytes(&token, fi, em->ram_bytes);
- btrfs_set_token_file_extent_compression(&token, fi, em->compress_type);
- btrfs_set_token_file_extent_encryption(&token, fi, 0);
- btrfs_set_token_file_extent_other_encoding(&token, fi, 0);
+ write_extent_buffer(leaf, &fi,
+ btrfs_item_ptr_offset(leaf, path->slots[0]),
+ sizeof(fi));
btrfs_mark_buffer_dirty(leaf);
btrfs_release_path(path);
@@ -4612,7 +4796,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
/*
* Log all prealloc extents beyond the inode's i_size to make sure we do not
- * lose them after doing a fast fsync and replaying the log. We scan the
+ * lose them after doing a full/fast fsync and replaying the log. We scan the
* subvolume's root instead of iterating the inode's extent map tree because
* otherwise we can log incorrect extent items based on extent map conversion.
* That can happen due to the fact that extent maps are merged when they
@@ -4816,7 +5000,6 @@ process:
WARN_ON(!list_empty(&extents));
write_unlock(&tree->lock);
- btrfs_release_path(path);
if (!ret)
ret = btrfs_log_prealloc_extents(trans, inode, path);
if (ret)
@@ -5391,6 +5574,7 @@ static int copy_inode_items_to_log(struct btrfs_trans_handle *trans,
struct btrfs_log_ctx *ctx,
bool *need_log_inode_item)
{
+ const u64 i_size = i_size_read(&inode->vfs_inode);
struct btrfs_root *root = inode->root;
int ins_start_slot = 0;
int ins_nr = 0;
@@ -5411,13 +5595,21 @@ again:
if (min_key->type > max_key->type)
break;
- if (min_key->type == BTRFS_INODE_ITEM_KEY)
+ if (min_key->type == BTRFS_INODE_ITEM_KEY) {
*need_log_inode_item = false;
-
- if ((min_key->type == BTRFS_INODE_REF_KEY ||
- min_key->type == BTRFS_INODE_EXTREF_KEY) &&
- inode->generation == trans->transid &&
- !recursive_logging) {
+ } else if (min_key->type == BTRFS_EXTENT_DATA_KEY &&
+ min_key->offset >= i_size) {
+ /*
+ * Extents at and beyond eof are logged with
+ * btrfs_log_prealloc_extents().
+ * Only regular files have BTRFS_EXTENT_DATA_KEY keys,
+ * and no keys greater than that, so bail out.
+ */
+ break;
+ } else if ((min_key->type == BTRFS_INODE_REF_KEY ||
+ min_key->type == BTRFS_INODE_EXTREF_KEY) &&
+ inode->generation == trans->transid &&
+ !recursive_logging) {
u64 other_ino = 0;
u64 other_parent = 0;
@@ -5448,10 +5640,8 @@ again:
btrfs_release_path(path);
goto next_key;
}
- }
-
- /* Skip xattrs, we log them later with btrfs_log_all_xattrs() */
- if (min_key->type == BTRFS_XATTR_ITEM_KEY) {
+ } else if (min_key->type == BTRFS_XATTR_ITEM_KEY) {
+ /* Skip xattrs, logged later with btrfs_log_all_xattrs() */
if (ins_nr == 0)
goto next_slot;
ret = copy_items(trans, inode, dst_path, path,
@@ -5503,10 +5693,29 @@ next_key:
} else {
break;
}
+
+ /*
+ * We may process many leaves full of items for our inode, so
+ * avoid monopolizing a cpu for too long by rescheduling while
+ * not holding locks on any tree.
+ */
+ cond_resched();
}
- if (ins_nr)
+ if (ins_nr) {
ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
ins_nr, inode_only, logged_isize);
+ if (ret)
+ return ret;
+ }
+
+ if (inode_only == LOG_INODE_ALL && S_ISREG(inode->vfs_inode.i_mode)) {
+ /*
+ * Release the path because otherwise we might attempt to double
+ * lock the same leaf with btrfs_log_prealloc_extents() below.
+ */
+ btrfs_release_path(path);
+ ret = btrfs_log_prealloc_extents(trans, inode, dst_path);
+ }
return ret;
}
@@ -5535,8 +5744,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
struct btrfs_key min_key;
struct btrfs_key max_key;
struct btrfs_root *log = inode->root->log_root;
- int err = 0;
- int ret = 0;
+ int ret;
bool fast_search = false;
u64 ino = btrfs_ino(inode);
struct extent_map_tree *em_tree = &inode->extent_tree;
@@ -5545,6 +5753,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
bool xattrs_logged = false;
bool recursive_logging = false;
bool inode_item_dropped = true;
+ const bool orig_logged_before = ctx->logged_before;
path = btrfs_alloc_path();
if (!path)
@@ -5578,8 +5787,8 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
* and figure out which index ranges have to be logged.
*/
if (S_ISDIR(inode->vfs_inode.i_mode)) {
- err = btrfs_commit_inode_delayed_items(trans, inode);
- if (err)
+ ret = btrfs_commit_inode_delayed_items(trans, inode);
+ if (ret)
goto out;
}
@@ -5595,6 +5804,17 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
}
/*
+ * Before logging the inode item, cache the value returned by
+ * inode_logged(), because after that we have the need to figure out if
+ * the inode was previously logged in this transaction.
+ */
+ ret = inode_logged(trans, inode, path);
+ if (ret < 0)
+ goto out_unlock;
+ ctx->logged_before = (ret == 1);
+ ret = 0;
+
+ /*
* This is for cases where logging a directory could result in losing a
* a file after replaying the log. For example, if we move a file from a
* directory A to a directory B, then fsync directory A, we have no way
@@ -5605,7 +5825,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
inode_only == LOG_INODE_ALL &&
inode->last_unlink_trans >= trans->transid) {
btrfs_set_log_full_commit(trans);
- err = 1;
+ ret = 1;
goto out_unlock;
}
@@ -5619,9 +5839,11 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
clear_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags);
if (inode_only == LOG_INODE_EXISTS)
max_key_type = BTRFS_XATTR_ITEM_KEY;
- ret = drop_inode_items(trans, log, path, inode, max_key_type);
+ if (ctx->logged_before)
+ ret = drop_inode_items(trans, log, path, inode,
+ max_key_type);
} else {
- if (inode_only == LOG_INODE_EXISTS && inode_logged(trans, inode)) {
+ if (inode_only == LOG_INODE_EXISTS && ctx->logged_before) {
/*
* Make sure the new inode item we write to the log has
* the same isize as the current one (if it exists).
@@ -5635,22 +5857,23 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
* (zeroes), as if an expanding truncate happened,
* instead of getting a file of 4Kb only.
*/
- err = logged_inode_size(log, inode, path, &logged_isize);
- if (err)
+ ret = logged_inode_size(log, inode, path, &logged_isize);
+ if (ret)
goto out_unlock;
}
if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
&inode->runtime_flags)) {
if (inode_only == LOG_INODE_EXISTS) {
max_key.type = BTRFS_XATTR_ITEM_KEY;
- ret = drop_inode_items(trans, log, path, inode,
- max_key.type);
+ if (ctx->logged_before)
+ ret = drop_inode_items(trans, log, path,
+ inode, max_key.type);
} else {
clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
&inode->runtime_flags);
clear_bit(BTRFS_INODE_COPY_EVERYTHING,
&inode->runtime_flags);
- if (inode_logged(trans, inode))
+ if (ctx->logged_before)
ret = truncate_inode_items(trans, log,
inode, 0, 0);
}
@@ -5660,8 +5883,9 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
if (inode_only == LOG_INODE_ALL)
fast_search = true;
max_key.type = BTRFS_XATTR_ITEM_KEY;
- ret = drop_inode_items(trans, log, path, inode,
- max_key.type);
+ if (ctx->logged_before)
+ ret = drop_inode_items(trans, log, path, inode,
+ max_key.type);
} else {
if (inode_only == LOG_INODE_ALL)
fast_search = true;
@@ -5670,37 +5894,35 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
}
}
- if (ret) {
- err = ret;
+ if (ret)
goto out_unlock;
- }
- err = copy_inode_items_to_log(trans, inode, &min_key, &max_key,
+ ret = copy_inode_items_to_log(trans, inode, &min_key, &max_key,
path, dst_path, logged_isize,
recursive_logging, inode_only, ctx,
&need_log_inode_item);
- if (err)
+ if (ret)
goto out_unlock;
btrfs_release_path(path);
btrfs_release_path(dst_path);
- err = btrfs_log_all_xattrs(trans, inode, path, dst_path);
- if (err)
+ ret = btrfs_log_all_xattrs(trans, inode, path, dst_path);
+ if (ret)
goto out_unlock;
xattrs_logged = true;
if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) {
btrfs_release_path(path);
btrfs_release_path(dst_path);
- err = btrfs_log_holes(trans, inode, path);
- if (err)
+ ret = btrfs_log_holes(trans, inode, path);
+ if (ret)
goto out_unlock;
}
log_extents:
btrfs_release_path(path);
btrfs_release_path(dst_path);
if (need_log_inode_item) {
- err = log_inode_item(trans, log, dst_path, inode, inode_item_dropped);
- if (err)
+ ret = log_inode_item(trans, log, dst_path, inode, inode_item_dropped);
+ if (ret)
goto out_unlock;
/*
* If we are doing a fast fsync and the inode was logged before
@@ -5711,18 +5933,16 @@ log_extents:
* BTRFS_INODE_COPY_EVERYTHING set.
*/
if (!xattrs_logged && inode->logged_trans < trans->transid) {
- err = btrfs_log_all_xattrs(trans, inode, path, dst_path);
- if (err)
+ ret = btrfs_log_all_xattrs(trans, inode, path, dst_path);
+ if (ret)
goto out_unlock;
btrfs_release_path(path);
}
}
if (fast_search) {
ret = btrfs_log_changed_extents(trans, inode, dst_path, ctx);
- if (ret) {
- err = ret;
+ if (ret)
goto out_unlock;
- }
} else if (inode_only == LOG_INODE_ALL) {
struct extent_map *em, *n;
@@ -5734,10 +5954,8 @@ log_extents:
if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->vfs_inode.i_mode)) {
ret = log_directory_changes(trans, inode, path, dst_path, ctx);
- if (ret) {
- err = ret;
+ if (ret)
goto out_unlock;
- }
}
spin_lock(&inode->lock);
@@ -5776,12 +5994,24 @@ log_extents:
if (inode_only != LOG_INODE_EXISTS)
inode->last_log_commit = inode->last_sub_trans;
spin_unlock(&inode->lock);
+
+ /*
+ * Reset the last_reflink_trans so that the next fsync does not need to
+ * go through the slower path when logging extents and their checksums.
+ */
+ if (inode_only == LOG_INODE_ALL)
+ inode->last_reflink_trans = 0;
+
out_unlock:
mutex_unlock(&inode->log_mutex);
out:
btrfs_free_path(path);
btrfs_free_path(dst_path);
- return err;
+
+ if (recursive_logging)
+ ctx->logged_before = orig_logged_before;
+
+ return ret;
}
/*
@@ -5866,7 +6096,6 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
struct btrfs_log_ctx *ctx)
{
struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_root *log = root->log_root;
struct btrfs_path *path;
LIST_HEAD(dir_list);
struct btrfs_dir_list *dir_elem;
@@ -5908,7 +6137,7 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
min_key.offset = 0;
again:
btrfs_release_path(path);
- ret = btrfs_search_forward(log, &min_key, path, trans->transid);
+ ret = btrfs_search_forward(root, &min_key, path, trans->transid);
if (ret < 0) {
goto next_dir_inode;
} else if (ret > 0) {
@@ -5916,7 +6145,6 @@ again:
goto next_dir_inode;
}
-process_leaf:
leaf = path->nodes[0];
nritems = btrfs_header_nritems(leaf);
for (i = path->slots[0]; i < nritems; i++) {
@@ -5934,8 +6162,7 @@ process_leaf:
di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item);
type = btrfs_dir_type(leaf, di);
- if (btrfs_dir_transid(leaf, di) < trans->transid &&
- type != BTRFS_FT_DIR)
+ if (btrfs_dir_transid(leaf, di) < trans->transid)
continue;
btrfs_dir_item_key_to_cpu(leaf, di, &di_key);
if (di_key.type == BTRFS_ROOT_ITEM_KEY)
@@ -5973,16 +6200,6 @@ process_leaf:
}
break;
}
- if (i == nritems) {
- ret = btrfs_next_leaf(log, path);
- if (ret < 0) {
- goto next_dir_inode;
- } else if (ret > 0) {
- ret = 0;
- goto next_dir_inode;
- }
- goto process_leaf;
- }
if (min_key.offset < (u64)-1) {
min_key.offset++;
goto again;
@@ -6713,15 +6930,32 @@ void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
mutex_unlock(&dir->log_mutex);
}
-/*
- * Call this after adding a new name for a file and it will properly
- * update the log to reflect the new name.
+/**
+ * Update the log after adding a new name for an inode.
+ *
+ * @trans: Transaction handle.
+ * @old_dentry: The dentry associated with the old name and the old
+ * parent directory.
+ * @old_dir: The inode of the previous parent directory for the case
+ * of a rename. For a link operation, it must be NULL.
+ * @old_dir_index: The index number associated with the old name, meaningful
+ * only for rename operations (when @old_dir is not NULL).
+ * Ignored for link operations.
+ * @parent: The dentry associated with the directory under which the
+ * new name is located.
+ *
+ * Call this after adding a new name for an inode, as a result of a link or
+ * rename operation, and it will properly update the log to reflect the new name.
*/
void btrfs_log_new_name(struct btrfs_trans_handle *trans,
- struct btrfs_inode *inode, struct btrfs_inode *old_dir,
- struct dentry *parent)
+ struct dentry *old_dentry, struct btrfs_inode *old_dir,
+ u64 old_dir_index, struct dentry *parent)
{
+ struct btrfs_inode *inode = BTRFS_I(d_inode(old_dentry));
+ struct btrfs_root *root = inode->root;
struct btrfs_log_ctx ctx;
+ bool log_pinned = false;
+ int ret;
/*
* this will force the logging code to walk the dentry chain
@@ -6734,26 +6968,83 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans,
* if this inode hasn't been logged and directory we're renaming it
* from hasn't been logged, we don't need to log it
*/
- if (!inode_logged(trans, inode) &&
- (!old_dir || !inode_logged(trans, old_dir)))
- return;
+ ret = inode_logged(trans, inode, NULL);
+ if (ret < 0) {
+ goto out;
+ } else if (ret == 0) {
+ if (!old_dir)
+ return;
+ /*
+ * If the inode was not logged and we are doing a rename (old_dir is not
+ * NULL), check if old_dir was logged - if it was not we can return and
+ * do nothing.
+ */
+ ret = inode_logged(trans, old_dir, NULL);
+ if (ret < 0)
+ goto out;
+ else if (ret == 0)
+ return;
+ }
+ ret = 0;
/*
* If we are doing a rename (old_dir is not NULL) from a directory that
- * was previously logged, make sure the next log attempt on the directory
- * is not skipped and logs the inode again. This is because the log may
- * not currently be authoritative for a range including the old
- * BTRFS_DIR_INDEX_KEY key, so we want to make sure after a log replay we
- * do not end up with both the new and old dentries around (in case the
- * inode is a directory we would have a directory with two hard links and
- * 2 inode references for different parents). The next log attempt of
- * old_dir will happen at btrfs_log_all_parents(), called through
- * btrfs_log_inode_parent() below, because we have previously set
- * inode->last_unlink_trans to the current transaction ID, either here or
- * at btrfs_record_unlink_dir() in case the inode is a directory.
+ * was previously logged, make sure that on log replay we get the old
+ * dir entry deleted. This is needed because we will also log the new
+ * name of the renamed inode, so we need to make sure that after log
+ * replay we don't end up with both the new and old dir entries existing.
*/
- if (old_dir)
- old_dir->logged_trans = 0;
+ if (old_dir && old_dir->logged_trans == trans->transid) {
+ struct btrfs_root *log = old_dir->root->log_root;
+ struct btrfs_path *path;
+
+ ASSERT(old_dir_index >= BTRFS_DIR_START_INDEX);
+
+ /*
+ * We have two inodes to update in the log, the old directory and
+ * the inode that got renamed, so we must pin the log to prevent
+ * anyone from syncing the log until we have updated both inodes
+ * in the log.
+ */
+ log_pinned = true;
+ btrfs_pin_log_trans(root);
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ /*
+ * Other concurrent task might be logging the old directory,
+ * as it can be triggered when logging other inode that had or
+ * still has a dentry in the old directory. So take the old
+ * directory's log_mutex to prevent getting an -EEXIST when
+ * logging a key to record the deletion, or having that other
+ * task logging the old directory get an -EEXIST if it attempts
+ * to log the same key after we just did it. In both cases that
+ * would result in falling back to a transaction commit.
+ */
+ mutex_lock(&old_dir->log_mutex);
+ ret = del_logged_dentry(trans, log, path, btrfs_ino(old_dir),
+ old_dentry->d_name.name,
+ old_dentry->d_name.len, old_dir_index);
+ if (ret > 0) {
+ /*
+ * The dentry does not exist in the log, so record its
+ * deletion.
+ */
+ btrfs_release_path(path);
+ ret = insert_dir_log_key(trans, log, path,
+ btrfs_ino(old_dir),
+ old_dir_index, old_dir_index);
+ }
+ mutex_unlock(&old_dir->log_mutex);
+
+ btrfs_free_path(path);
+ if (ret < 0)
+ goto out;
+ }
btrfs_init_log_ctx(&ctx, &inode->vfs_inode);
ctx.logging_new_name = true;
@@ -6765,5 +7056,16 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans,
* inconsistent state after a rename operation.
*/
btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx);
+out:
+ /*
+ * If an error happened mark the log for a full commit because it's not
+ * consistent and up to date or we couldn't find out if one of the
+ * inodes was logged before in this transaction. Do it before unpinning
+ * the log, to avoid any races with someone else trying to commit it.
+ */
+ if (ret < 0)
+ btrfs_set_log_full_commit(trans);
+ if (log_pinned)
+ btrfs_end_log_trans(root);
}
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index f6811c3df38a..1620f8170629 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -17,6 +17,8 @@ struct btrfs_log_ctx {
int log_transid;
bool log_new_dentries;
bool logging_new_name;
+ /* Indicate if the inode being logged was logged before. */
+ bool logged_before;
/* Tracks the last logged dir item/index key offset. */
u64 last_dir_item_offset;
struct inode *inode;
@@ -32,6 +34,7 @@ static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx,
ctx->log_transid = 0;
ctx->log_new_dentries = false;
ctx->logging_new_name = false;
+ ctx->logged_before = false;
ctx->inode = inode;
INIT_LIST_HEAD(&ctx->list);
INIT_LIST_HEAD(&ctx->ordered_extents);
@@ -86,7 +89,7 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
struct btrfs_inode *dir);
void btrfs_log_new_name(struct btrfs_trans_handle *trans,
- struct btrfs_inode *inode, struct btrfs_inode *old_dir,
- struct dentry *parent);
+ struct dentry *old_dentry, struct btrfs_inode *old_dir,
+ u64 old_dir_index, struct dentry *parent);
#endif
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index b07d382d53a8..1be7cb2f955f 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -534,30 +534,20 @@ error:
return ret;
}
-static bool device_path_matched(const char *path, struct btrfs_device *device)
-{
- int found;
-
- rcu_read_lock();
- found = strcmp(rcu_str_deref(device->name), path);
- rcu_read_unlock();
-
- return found == 0;
-}
-
-/*
- * Search and remove all stale (devices which are not mounted) devices.
+/**
+ * Search and remove all stale devices (which are not mounted).
* When both inputs are NULL, it will search and release all stale devices.
- * path: Optional. When provided will it release all unmounted devices
- * matching this path only.
- * skip_dev: Optional. Will skip this device when searching for the stale
+ *
+ * @devt: Optional. When provided will it release all unmounted devices
+ * matching this devt only.
+ * @skip_device: Optional. Will skip this device when searching for the stale
* devices.
- * Return: 0 for success or if @path is NULL.
- * -EBUSY if @path is a mounted device.
- * -ENOENT if @path does not match any device in the list.
+ *
+ * Return: 0 for success or if @devt is 0.
+ * -EBUSY if @devt is a mounted device.
+ * -ENOENT if @devt does not match any device in the list.
*/
-static int btrfs_free_stale_devices(const char *path,
- struct btrfs_device *skip_device)
+static int btrfs_free_stale_devices(dev_t devt, struct btrfs_device *skip_device)
{
struct btrfs_fs_devices *fs_devices, *tmp_fs_devices;
struct btrfs_device *device, *tmp_device;
@@ -565,7 +555,7 @@ static int btrfs_free_stale_devices(const char *path,
lockdep_assert_held(&uuid_mutex);
- if (path)
+ if (devt)
ret = -ENOENT;
list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) {
@@ -575,13 +565,11 @@ static int btrfs_free_stale_devices(const char *path,
&fs_devices->devices, dev_list) {
if (skip_device && skip_device == device)
continue;
- if (path && !device->name)
- continue;
- if (path && !device_path_matched(path, device))
+ if (devt && devt != device->devt)
continue;
if (fs_devices->opened) {
/* for an already deleted device return 0 */
- if (path && ret != 0)
+ if (devt && ret != 0)
ret = -EBUSY;
break;
}
@@ -614,7 +602,6 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
struct btrfs_device *device, fmode_t flags,
void *holder)
{
- struct request_queue *q;
struct block_device *bdev;
struct btrfs_super_block *disk_super;
u64 devid;
@@ -656,8 +643,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
}
- q = bdev_get_queue(bdev);
- if (!blk_queue_nonrot(q))
+ if (!blk_queue_nonrot(bdev_get_queue(bdev)))
fs_devices->rotating = true;
device->bdev = bdev;
@@ -781,11 +767,17 @@ static noinline struct btrfs_device *device_list_add(const char *path,
struct rcu_string *name;
u64 found_transid = btrfs_super_generation(disk_super);
u64 devid = btrfs_stack_device_id(&disk_super->dev_item);
+ dev_t path_devt;
+ int error;
bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) &
BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
bool fsid_change_in_progress = (btrfs_super_flags(disk_super) &
BTRFS_SUPER_FLAG_CHANGING_FSID_V2);
+ error = lookup_bdev(path, &path_devt);
+ if (error)
+ return ERR_PTR(error);
+
if (fsid_change_in_progress) {
if (!has_metadata_uuid)
fs_devices = find_fsid_inprogress(disk_super);
@@ -868,6 +860,7 @@ static noinline struct btrfs_device *device_list_add(const char *path,
return ERR_PTR(-ENOMEM);
}
rcu_assign_pointer(device->name, name);
+ device->devt = path_devt;
list_add_rcu(&device->dev_list, &fs_devices->devices);
fs_devices->num_devices++;
@@ -928,25 +921,15 @@ static noinline struct btrfs_device *device_list_add(const char *path,
/*
* We are going to replace the device path for a given devid,
* make sure it's the same device if the device is mounted
+ *
+ * NOTE: the device->fs_info may not be reliable here so pass
+ * in a NULL to message helpers instead. This avoids a possible
+ * use-after-free when the fs_info and fs_info->sb are already
+ * torn down.
*/
if (device->bdev) {
- int error;
- dev_t path_dev;
-
- error = lookup_bdev(path, &path_dev);
- if (error) {
- mutex_unlock(&fs_devices->device_list_mutex);
- return ERR_PTR(error);
- }
-
- if (device->bdev->bd_dev != path_dev) {
+ if (device->devt != path_devt) {
mutex_unlock(&fs_devices->device_list_mutex);
- /*
- * device->fs_info may not be reliable here, so
- * pass in a NULL instead. This avoids a
- * possible use-after-free when the fs_info and
- * fs_info->sb are already torn down.
- */
btrfs_warn_in_rcu(NULL,
"duplicate device %s devid %llu generation %llu scanned by %s (%d)",
path, devid, found_transid,
@@ -954,7 +937,7 @@ static noinline struct btrfs_device *device_list_add(const char *path,
task_pid_nr(current));
return ERR_PTR(-EEXIST);
}
- btrfs_info_in_rcu(device->fs_info,
+ btrfs_info_in_rcu(NULL,
"devid %llu device path %s changed to %s scanned by %s (%d)",
devid, rcu_str_deref(device->name),
path, current->comm,
@@ -972,6 +955,7 @@ static noinline struct btrfs_device *device_list_add(const char *path,
fs_devices->missing_devices--;
clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
}
+ device->devt = path_devt;
}
/*
@@ -1331,12 +1315,12 @@ static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev
return disk_super;
}
-int btrfs_forget_devices(const char *path)
+int btrfs_forget_devices(dev_t devt)
{
int ret;
mutex_lock(&uuid_mutex);
- ret = btrfs_free_stale_devices(strlen(path) ? path : NULL, NULL);
+ ret = btrfs_free_stale_devices(devt, NULL);
mutex_unlock(&uuid_mutex);
return ret;
@@ -1385,10 +1369,8 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags,
}
device = device_list_add(path, disk_super, &new_device_added);
- if (!IS_ERR(device)) {
- if (new_device_added)
- btrfs_free_stale_devices(path, device);
- }
+ if (!IS_ERR(device) && new_device_added)
+ btrfs_free_stale_devices(device->devt, device);
btrfs_release_disk_super(disk_super);
@@ -2102,6 +2084,11 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
u64 num_devices;
int ret = 0;
+ if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
+ btrfs_err(fs_info, "device remove not supported on extent tree v2 yet");
+ return -EINVAL;
+ }
+
/*
* The device list in fs_devices is accessed without locks (neither
* uuid_mutex nor device_list_mutex) as it won't change on a mounted
@@ -2606,7 +2593,6 @@ error:
int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path)
{
struct btrfs_root *root = fs_info->dev_root;
- struct request_queue *q;
struct btrfs_trans_handle *trans;
struct btrfs_device *device;
struct block_device *bdev;
@@ -2668,6 +2654,9 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
device->fs_info = fs_info;
device->bdev = bdev;
+ ret = lookup_bdev(device_path, &device->devt);
+ if (ret)
+ goto error_free_device;
ret = btrfs_get_dev_zone_info(device, false);
if (ret)
@@ -2679,7 +2668,6 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
goto error_free_zone;
}
- q = bdev_get_queue(bdev);
set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
device->generation = trans->transid;
device->io_width = fs_info->sectorsize;
@@ -2727,7 +2715,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
atomic64_add(device->total_bytes, &fs_info->free_chunk_space);
- if (!blk_queue_nonrot(q))
+ if (!blk_queue_nonrot(bdev_get_queue(bdev)))
fs_devices->rotating = true;
orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
@@ -2814,7 +2802,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
* We can ignore the return value as it typically returns -EINVAL and
* only succeeds if the device was an alien.
*/
- btrfs_forget_devices(device_path);
+ btrfs_forget_devices(device->devt);
/* Update ctime/mtime for blkid or udev */
update_dev_time(device_path);
@@ -3251,6 +3239,12 @@ int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
u64 length;
int ret;
+ if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
+ btrfs_err(fs_info,
+ "relocate: not supported on extent tree v2 yet");
+ return -EINVAL;
+ }
+
/*
* Prevent races with automatic removal of unused block groups.
* After we relocate and before we remove the chunk with offset
@@ -7060,6 +7054,27 @@ static void warn_32bit_meta_chunk(struct btrfs_fs_info *fs_info,
}
#endif
+static struct btrfs_device *handle_missing_device(struct btrfs_fs_info *fs_info,
+ u64 devid, u8 *uuid)
+{
+ struct btrfs_device *dev;
+
+ if (!btrfs_test_opt(fs_info, DEGRADED)) {
+ btrfs_report_missing_device(fs_info, devid, uuid, true);
+ return ERR_PTR(-ENOENT);
+ }
+
+ dev = add_missing_dev(fs_info->fs_devices, devid, uuid);
+ if (IS_ERR(dev)) {
+ btrfs_err(fs_info, "failed to init missing device %llu: %ld",
+ devid, PTR_ERR(dev));
+ return dev;
+ }
+ btrfs_report_missing_device(fs_info, devid, uuid, false);
+
+ return dev;
+}
+
static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
struct btrfs_chunk *chunk)
{
@@ -7147,28 +7162,17 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
BTRFS_UUID_SIZE);
args.uuid = uuid;
map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices, &args);
- if (!map->stripes[i].dev &&
- !btrfs_test_opt(fs_info, DEGRADED)) {
- free_extent_map(em);
- btrfs_report_missing_device(fs_info, devid, uuid, true);
- return -ENOENT;
- }
if (!map->stripes[i].dev) {
- map->stripes[i].dev =
- add_missing_dev(fs_info->fs_devices, devid,
- uuid);
+ map->stripes[i].dev = handle_missing_device(fs_info,
+ devid, uuid);
if (IS_ERR(map->stripes[i].dev)) {
free_extent_map(em);
- btrfs_err(fs_info,
- "failed to init missing dev %llu: %ld",
- devid, PTR_ERR(map->stripes[i].dev));
return PTR_ERR(map->stripes[i].dev);
}
- btrfs_report_missing_device(fs_info, devid, uuid, false);
}
+
set_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
&(map->stripes[i].dev->dev_state));
-
}
write_lock(&map_tree->lock);
@@ -8299,10 +8303,12 @@ static int relocating_repair_kthread(void *data)
target = cache->start;
btrfs_put_block_group(cache);
+ sb_start_write(fs_info->sb);
if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
btrfs_info(fs_info,
"zoned: skip relocating block group %llu to repair: EBUSY",
target);
+ sb_end_write(fs_info->sb);
return -EBUSY;
}
@@ -8330,6 +8336,7 @@ out:
btrfs_put_block_group(cache);
mutex_unlock(&fs_info->reclaim_bgs_lock);
btrfs_exclop_finish(fs_info);
+ sb_end_write(fs_info->sb);
return ret;
}
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 005c9e2a491a..bd297f23d19e 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -72,6 +72,11 @@ struct btrfs_device {
/* the mode sent to blkdev_get */
fmode_t mode;
+ /*
+ * Device's major-minor number. Must be set even if the device is not
+ * opened (bdev == NULL), unless the device is missing.
+ */
+ dev_t devt;
unsigned long dev_state;
blk_status_t last_flush_error;
@@ -505,7 +510,7 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
fmode_t flags, void *holder);
struct btrfs_device *btrfs_scan_one_device(const char *path,
fmode_t flags, void *holder);
-int btrfs_forget_devices(const char *path);
+int btrfs_forget_devices(dev_t devt);
void btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices);
void btrfs_assign_next_active_device(struct btrfs_device *device,
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index f559d517c7c4..b7b5fac1c779 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -652,8 +652,7 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
if (model == BLK_ZONED_HM ||
(model == BLK_ZONED_HA && incompat_zoned) ||
(model == BLK_ZONED_NONE && incompat_zoned)) {
- struct btrfs_zoned_device_info *zone_info =
- device->zone_info;
+ struct btrfs_zoned_device_info *zone_info;
zone_info = device->zone_info;
zoned_devices++;
@@ -1215,12 +1214,12 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
struct btrfs_device *device;
u64 logical = cache->start;
u64 length = cache->length;
- u64 physical = 0;
int ret;
int i;
unsigned int nofs_flag;
u64 *alloc_offsets = NULL;
u64 *caps = NULL;
+ u64 *physical = NULL;
unsigned long *active = NULL;
u64 last_alloc = 0;
u32 num_sequential = 0, num_conventional = 0;
@@ -1264,6 +1263,12 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
goto out;
}
+ physical = kcalloc(map->num_stripes, sizeof(*physical), GFP_NOFS);
+ if (!physical) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
active = bitmap_zalloc(map->num_stripes, GFP_NOFS);
if (!active) {
ret = -ENOMEM;
@@ -1277,14 +1282,14 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
int dev_replace_is_ongoing = 0;
device = map->stripes[i].dev;
- physical = map->stripes[i].physical;
+ physical[i] = map->stripes[i].physical;
if (device->bdev == NULL) {
alloc_offsets[i] = WP_MISSING_DEV;
continue;
}
- is_sequential = btrfs_dev_is_sequential(device, physical);
+ is_sequential = btrfs_dev_is_sequential(device, physical[i]);
if (is_sequential)
num_sequential++;
else
@@ -1299,21 +1304,21 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
* This zone will be used for allocation, so mark this zone
* non-empty.
*/
- btrfs_dev_clear_zone_empty(device, physical);
+ btrfs_dev_clear_zone_empty(device, physical[i]);
down_read(&dev_replace->rwsem);
dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL)
- btrfs_dev_clear_zone_empty(dev_replace->tgtdev, physical);
+ btrfs_dev_clear_zone_empty(dev_replace->tgtdev, physical[i]);
up_read(&dev_replace->rwsem);
/*
* The group is mapped to a sequential zone. Get the zone write
* pointer to determine the allocation offset within the zone.
*/
- WARN_ON(!IS_ALIGNED(physical, fs_info->zone_size));
+ WARN_ON(!IS_ALIGNED(physical[i], fs_info->zone_size));
nofs_flag = memalloc_nofs_save();
- ret = btrfs_get_dev_zone(device, physical, &zone);
+ ret = btrfs_get_dev_zone(device, physical[i], &zone);
memalloc_nofs_restore(nofs_flag);
if (ret == -EIO || ret == -EOPNOTSUPP) {
ret = 0;
@@ -1339,7 +1344,7 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
case BLK_ZONE_COND_READONLY:
btrfs_err(fs_info,
"zoned: offline/readonly zone %llu on device %s (devid %llu)",
- physical >> device->zone_info->zone_size_shift,
+ physical[i] >> device->zone_info->zone_size_shift,
rcu_str_deref(device->name), device->devid);
alloc_offsets[i] = WP_MISSING_DEV;
break;
@@ -1404,7 +1409,7 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
if (alloc_offsets[0] == WP_MISSING_DEV) {
btrfs_err(fs_info,
"zoned: cannot recover write pointer for zone %llu",
- physical);
+ physical[0]);
ret = -EIO;
goto out;
}
@@ -1413,6 +1418,42 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
cache->zone_is_active = test_bit(0, active);
break;
case BTRFS_BLOCK_GROUP_DUP:
+ if (map->type & BTRFS_BLOCK_GROUP_DATA) {
+ btrfs_err(fs_info, "zoned: profile DUP not yet supported on data bg");
+ ret = -EINVAL;
+ goto out;
+ }
+ if (alloc_offsets[0] == WP_MISSING_DEV) {
+ btrfs_err(fs_info,
+ "zoned: cannot recover write pointer for zone %llu",
+ physical[0]);
+ ret = -EIO;
+ goto out;
+ }
+ if (alloc_offsets[1] == WP_MISSING_DEV) {
+ btrfs_err(fs_info,
+ "zoned: cannot recover write pointer for zone %llu",
+ physical[1]);
+ ret = -EIO;
+ goto out;
+ }
+ if (alloc_offsets[0] != alloc_offsets[1]) {
+ btrfs_err(fs_info,
+ "zoned: write pointer offset mismatch of zones in DUP profile");
+ ret = -EIO;
+ goto out;
+ }
+ if (test_bit(0, active) != test_bit(1, active)) {
+ if (!btrfs_zone_activate(cache)) {
+ ret = -EIO;
+ goto out;
+ }
+ } else {
+ cache->zone_is_active = test_bit(0, active);
+ }
+ cache->alloc_offset = alloc_offsets[0];
+ cache->zone_capacity = min(caps[0], caps[1]);
+ break;
case BTRFS_BLOCK_GROUP_RAID1:
case BTRFS_BLOCK_GROUP_RAID0:
case BTRFS_BLOCK_GROUP_RAID10:
@@ -1465,6 +1506,7 @@ out:
cache->physical_map = NULL;
}
bitmap_free(active);
+ kfree(physical);
kfree(caps);
kfree(alloc_offsets);
free_extent_map(em);
@@ -1781,50 +1823,55 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
struct btrfs_device *device;
u64 physical;
bool ret;
+ int i;
if (!btrfs_is_zoned(block_group->fs_info))
return true;
map = block_group->physical_map;
- /* Currently support SINGLE profile only */
- ASSERT(map->num_stripes == 1);
- device = map->stripes[0].dev;
- physical = map->stripes[0].physical;
-
- if (device->zone_info->max_active_zones == 0)
- return true;
spin_lock(&block_group->lock);
-
if (block_group->zone_is_active) {
ret = true;
goto out_unlock;
}
- /* No space left */
- if (block_group->alloc_offset == block_group->zone_capacity) {
- ret = false;
- goto out_unlock;
- }
+ for (i = 0; i < map->num_stripes; i++) {
+ device = map->stripes[i].dev;
+ physical = map->stripes[i].physical;
- if (!btrfs_dev_set_active_zone(device, physical)) {
- /* Cannot activate the zone */
- ret = false;
- goto out_unlock;
- }
+ if (device->zone_info->max_active_zones == 0)
+ continue;
+
+ /* No space left */
+ if (block_group->alloc_offset == block_group->zone_capacity) {
+ ret = false;
+ goto out_unlock;
+ }
+
+ if (!btrfs_dev_set_active_zone(device, physical)) {
+ /* Cannot activate the zone */
+ ret = false;
+ goto out_unlock;
+ }
+
+ /* Successfully activated all the zones */
+ if (i == map->num_stripes - 1)
+ block_group->zone_is_active = 1;
- /* Successfully activated all the zones */
- block_group->zone_is_active = 1;
+ }
spin_unlock(&block_group->lock);
- /* For the active block group list */
- btrfs_get_block_group(block_group);
+ if (block_group->zone_is_active) {
+ /* For the active block group list */
+ btrfs_get_block_group(block_group);
- spin_lock(&fs_info->zone_active_bgs_lock);
- ASSERT(list_empty(&block_group->active_bg_list));
- list_add_tail(&block_group->active_bg_list, &fs_info->zone_active_bgs);
- spin_unlock(&fs_info->zone_active_bgs_lock);
+ spin_lock(&fs_info->zone_active_bgs_lock);
+ list_add_tail(&block_group->active_bg_list,
+ &fs_info->zone_active_bgs);
+ spin_unlock(&fs_info->zone_active_bgs_lock);
+ }
return true;
@@ -1840,19 +1887,12 @@ int btrfs_zone_finish(struct btrfs_block_group *block_group)
struct btrfs_device *device;
u64 physical;
int ret = 0;
+ int i;
if (!btrfs_is_zoned(fs_info))
return 0;
map = block_group->physical_map;
- /* Currently support SINGLE profile only */
- ASSERT(map->num_stripes == 1);
-
- device = map->stripes[0].dev;
- physical = map->stripes[0].physical;
-
- if (device->zone_info->max_active_zones == 0)
- return 0;
spin_lock(&block_group->lock);
if (!block_group->zone_is_active) {
@@ -1904,25 +1944,34 @@ int btrfs_zone_finish(struct btrfs_block_group *block_group)
btrfs_clear_data_reloc_bg(block_group);
spin_unlock(&block_group->lock);
- ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH,
- physical >> SECTOR_SHIFT,
- device->zone_info->zone_size >> SECTOR_SHIFT,
- GFP_NOFS);
- btrfs_dec_block_group_ro(block_group);
+ for (i = 0; i < map->num_stripes; i++) {
+ device = map->stripes[i].dev;
+ physical = map->stripes[i].physical;
- if (!ret) {
- btrfs_dev_clear_active_zone(device, physical);
+ if (device->zone_info->max_active_zones == 0)
+ continue;
- spin_lock(&fs_info->zone_active_bgs_lock);
- ASSERT(!list_empty(&block_group->active_bg_list));
- list_del_init(&block_group->active_bg_list);
- spin_unlock(&fs_info->zone_active_bgs_lock);
+ ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH,
+ physical >> SECTOR_SHIFT,
+ device->zone_info->zone_size >> SECTOR_SHIFT,
+ GFP_NOFS);
- /* For active_bg_list */
- btrfs_put_block_group(block_group);
+ if (ret)
+ return ret;
+
+ btrfs_dev_clear_active_zone(device, physical);
}
+ btrfs_dec_block_group_ro(block_group);
- return ret;
+ spin_lock(&fs_info->zone_active_bgs_lock);
+ ASSERT(!list_empty(&block_group->active_bg_list));
+ list_del_init(&block_group->active_bg_list);
+ spin_unlock(&fs_info->zone_active_bgs_lock);
+
+ /* For active_bg_list */
+ btrfs_put_block_group(block_group);
+
+ return 0;
}
bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags)
diff --git a/fs/buffer.c b/fs/buffer.c
index a17c386a142c..d67fbe063a3a 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -53,7 +53,7 @@
static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
- enum rw_hint hint, struct writeback_control *wbc);
+ struct writeback_control *wbc);
#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
@@ -613,17 +613,14 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode);
* FIXME: may need to call ->reservepage here as well. That's rather up to the
* address_space though.
*/
-int __set_page_dirty_buffers(struct page *page)
+bool block_dirty_folio(struct address_space *mapping, struct folio *folio)
{
- int newly_dirty;
- struct address_space *mapping = page_mapping(page);
-
- if (unlikely(!mapping))
- return !TestSetPageDirty(page);
+ struct buffer_head *head;
+ bool newly_dirty;
spin_lock(&mapping->private_lock);
- if (page_has_buffers(page)) {
- struct buffer_head *head = page_buffers(page);
+ head = folio_buffers(folio);
+ if (head) {
struct buffer_head *bh = head;
do {
@@ -635,21 +632,21 @@ int __set_page_dirty_buffers(struct page *page)
* Lock out page's memcg migration to keep PageDirty
* synchronized with per-memcg dirty page counters.
*/
- lock_page_memcg(page);
- newly_dirty = !TestSetPageDirty(page);
+ folio_memcg_lock(folio);
+ newly_dirty = !folio_test_set_dirty(folio);
spin_unlock(&mapping->private_lock);
if (newly_dirty)
- __set_page_dirty(page, mapping, 1);
+ __folio_mark_dirty(folio, mapping, 1);
- unlock_page_memcg(page);
+ folio_memcg_unlock(folio);
if (newly_dirty)
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
return newly_dirty;
}
-EXPORT_SYMBOL(__set_page_dirty_buffers);
+EXPORT_SYMBOL(block_dirty_folio);
/*
* Write out and wait upon a list of buffers.
@@ -1235,16 +1232,18 @@ static void bh_lru_install(struct buffer_head *bh)
int i;
check_irqs_on();
+ bh_lru_lock();
+
/*
* the refcount of buffer_head in bh_lru prevents dropping the
* attached page(i.e., try_to_free_buffers) so it could cause
* failing page migration.
* Skip putting upcoming bh into bh_lru until migration is done.
*/
- if (lru_cache_disabled())
+ if (lru_cache_disabled()) {
+ bh_lru_unlock();
return;
-
- bh_lru_lock();
+ }
b = this_cpu_ptr(&bh_lrus);
for (i = 0; i < BH_LRU_SIZE; i++) {
@@ -1482,41 +1481,40 @@ static void discard_buffer(struct buffer_head * bh)
}
/**
- * block_invalidatepage - invalidate part or all of a buffer-backed page
- *
- * @page: the page which is affected
+ * block_invalidate_folio - Invalidate part or all of a buffer-backed folio.
+ * @folio: The folio which is affected.
* @offset: start of the range to invalidate
* @length: length of the range to invalidate
*
- * block_invalidatepage() is called when all or part of the page has become
+ * block_invalidate_folio() is called when all or part of the folio has been
* invalidated by a truncate operation.
*
- * block_invalidatepage() does not have to release all buffers, but it must
+ * block_invalidate_folio() does not have to release all buffers, but it must
* ensure that no dirty buffer is left outside @offset and that no I/O
* is underway against any of the blocks which are outside the truncation
* point. Because the caller is about to free (and possibly reuse) those
* blocks on-disk.
*/
-void block_invalidatepage(struct page *page, unsigned int offset,
- unsigned int length)
+void block_invalidate_folio(struct folio *folio, size_t offset, size_t length)
{
struct buffer_head *head, *bh, *next;
- unsigned int curr_off = 0;
- unsigned int stop = length + offset;
+ size_t curr_off = 0;
+ size_t stop = length + offset;
- BUG_ON(!PageLocked(page));
- if (!page_has_buffers(page))
- goto out;
+ BUG_ON(!folio_test_locked(folio));
/*
* Check for overflow
*/
- BUG_ON(stop > PAGE_SIZE || stop < length);
+ BUG_ON(stop > folio_size(folio) || stop < length);
+
+ head = folio_buffers(folio);
+ if (!head)
+ return;
- head = page_buffers(page);
bh = head;
do {
- unsigned int next_off = curr_off + bh->b_size;
+ size_t next_off = curr_off + bh->b_size;
next = bh->b_this_page;
/*
@@ -1535,21 +1533,21 @@ void block_invalidatepage(struct page *page, unsigned int offset,
} while (bh != head);
/*
- * We release buffers only if the entire page is being invalidated.
+ * We release buffers only if the entire folio is being invalidated.
* The get_block cached value has been unconditionally invalidated,
* so real IO is not possible anymore.
*/
- if (length == PAGE_SIZE)
- try_to_release_page(page, 0);
+ if (length == folio_size(folio))
+ filemap_release_folio(folio, 0);
out:
return;
}
-EXPORT_SYMBOL(block_invalidatepage);
+EXPORT_SYMBOL(block_invalidate_folio);
/*
* We attach and possibly dirty the buffers atomically wrt
- * __set_page_dirty_buffers() via private_lock. try_to_free_buffers
+ * block_dirty_folio() via private_lock. try_to_free_buffers
* is already excluded via the page lock.
*/
void create_empty_buffers(struct page *page,
@@ -1724,12 +1722,12 @@ int __block_write_full_page(struct inode *inode, struct page *page,
(1 << BH_Dirty)|(1 << BH_Uptodate));
/*
- * Be very careful. We have no exclusion from __set_page_dirty_buffers
+ * Be very careful. We have no exclusion from block_dirty_folio
* here, and the (potentially unmapped) buffers may become dirty at
* any time. If a buffer becomes dirty here after we've inspected it
* then we just miss that fact, and the page stays dirty.
*
- * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
+ * Buffers outside i_size may be dirtied by block_dirty_folio;
* handle that here by just cleaning them.
*/
@@ -1806,8 +1804,7 @@ int __block_write_full_page(struct inode *inode, struct page *page,
do {
struct buffer_head *next = bh->b_this_page;
if (buffer_async_write(bh)) {
- submit_bh_wbc(REQ_OP_WRITE, write_flags, bh,
- inode->i_write_hint, wbc);
+ submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, wbc);
nr_underway++;
}
bh = next;
@@ -1861,8 +1858,7 @@ recover:
struct buffer_head *next = bh->b_this_page;
if (buffer_async_write(bh)) {
clear_buffer_dirty(bh);
- submit_bh_wbc(REQ_OP_WRITE, write_flags, bh,
- inode->i_write_hint, wbc);
+ submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, wbc);
nr_underway++;
}
bh = next;
@@ -2206,29 +2202,27 @@ int generic_write_end(struct file *file, struct address_space *mapping,
EXPORT_SYMBOL(generic_write_end);
/*
- * block_is_partially_uptodate checks whether buffers within a page are
+ * block_is_partially_uptodate checks whether buffers within a folio are
* uptodate or not.
*
- * Returns true if all buffers which correspond to a file portion
- * we want to read are uptodate.
+ * Returns true if all buffers which correspond to the specified part
+ * of the folio are uptodate.
*/
-int block_is_partially_uptodate(struct page *page, unsigned long from,
- unsigned long count)
+bool block_is_partially_uptodate(struct folio *folio, size_t from, size_t count)
{
unsigned block_start, block_end, blocksize;
unsigned to;
struct buffer_head *bh, *head;
- int ret = 1;
+ bool ret = true;
- if (!page_has_buffers(page))
- return 0;
-
- head = page_buffers(page);
+ head = folio_buffers(folio);
+ if (!head)
+ return false;
blocksize = head->b_size;
- to = min_t(unsigned, PAGE_SIZE - from, count);
+ to = min_t(unsigned, folio_size(folio) - from, count);
to = from + to;
- if (from < blocksize && to > PAGE_SIZE - blocksize)
- return 0;
+ if (from < blocksize && to > folio_size(folio) - blocksize)
+ return false;
bh = head;
block_start = 0;
@@ -2236,7 +2230,7 @@ int block_is_partially_uptodate(struct page *page, unsigned long from,
block_end = block_start + blocksize;
if (block_end > from && block_start < to) {
if (!buffer_uptodate(bh)) {
- ret = 0;
+ ret = false;
break;
}
if (block_end >= to)
@@ -3008,7 +3002,7 @@ static void end_bio_bh_io_sync(struct bio *bio)
}
static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
- enum rw_hint write_hint, struct writeback_control *wbc)
+ struct writeback_control *wbc)
{
struct bio *bio;
@@ -3034,7 +3028,6 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
fscrypt_set_bio_crypt_ctx_bh(bio, bh, GFP_NOIO);
bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
- bio->bi_write_hint = write_hint;
bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
BUG_ON(bio->bi_iter.bi_size != bh->b_size);
@@ -3056,7 +3049,7 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
int submit_bh(int op, int op_flags, struct buffer_head *bh)
{
- return submit_bh_wbc(op, op_flags, bh, 0, NULL);
+ return submit_bh_wbc(op, op_flags, bh, NULL);
}
EXPORT_SYMBOL(submit_bh);
@@ -3183,7 +3176,7 @@ EXPORT_SYMBOL(sync_dirty_buffer);
*
* The same applies to regular filesystem pages: if all the buffers are
* clean then we set the page clean and proceed. To do that, we require
- * total exclusion from __set_page_dirty_buffers(). That is obtained with
+ * total exclusion from block_dirty_folio(). That is obtained with
* private_lock.
*
* try_to_free_buffers() is non-blocking.
@@ -3250,7 +3243,7 @@ int try_to_free_buffers(struct page *page)
* the page also.
*
* private_lock must be held over this entire operation in order
- * to synchronise against __set_page_dirty_buffers and prevent the
+ * to synchronise against block_dirty_folio and prevent the
* dirty bit from being lost.
*/
if (ret)
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index 51c968cd00a6..ae93cee9d25d 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -254,7 +254,7 @@ static bool cachefiles_shorten_object(struct cachefiles_object *object,
ret = cachefiles_inject_write_error();
if (ret == 0)
ret = vfs_fallocate(file, FALLOC_FL_ZERO_RANGE,
- new_size, dio_size);
+ new_size, dio_size - new_size);
if (ret < 0) {
trace_cachefiles_io_error(object, file_inode(file), ret,
cachefiles_trace_fallocate_error);
diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c
index 04eb52736990..bc7c7a7d9260 100644
--- a/fs/cachefiles/io.c
+++ b/fs/cachefiles/io.c
@@ -138,7 +138,6 @@ static int cachefiles_read(struct netfs_cache_resources *cres,
ki->iocb.ki_filp = file;
ki->iocb.ki_pos = start_pos + skipped;
ki->iocb.ki_flags = IOCB_DIRECT;
- ki->iocb.ki_hint = ki_hint_validate(file_write_hint(file));
ki->iocb.ki_ioprio = get_current_ioprio();
ki->skipped = skipped;
ki->object = object;
@@ -192,6 +191,64 @@ presubmission_error:
}
/*
+ * Query the occupancy of the cache in a region, returning where the next chunk
+ * of data starts and how long it is.
+ */
+static int cachefiles_query_occupancy(struct netfs_cache_resources *cres,
+ loff_t start, size_t len, size_t granularity,
+ loff_t *_data_start, size_t *_data_len)
+{
+ struct cachefiles_object *object;
+ struct file *file;
+ loff_t off, off2;
+
+ *_data_start = -1;
+ *_data_len = 0;
+
+ if (!fscache_wait_for_operation(cres, FSCACHE_WANT_READ))
+ return -ENOBUFS;
+
+ object = cachefiles_cres_object(cres);
+ file = cachefiles_cres_file(cres);
+ granularity = max_t(size_t, object->volume->cache->bsize, granularity);
+
+ _enter("%pD,%li,%llx,%zx/%llx",
+ file, file_inode(file)->i_ino, start, len,
+ i_size_read(file_inode(file)));
+
+ off = cachefiles_inject_read_error();
+ if (off == 0)
+ off = vfs_llseek(file, start, SEEK_DATA);
+ if (off == -ENXIO)
+ return -ENODATA; /* Beyond EOF */
+ if (off < 0 && off >= (loff_t)-MAX_ERRNO)
+ return -ENOBUFS; /* Error. */
+ if (round_up(off, granularity) >= start + len)
+ return -ENODATA; /* No data in range */
+
+ off2 = cachefiles_inject_read_error();
+ if (off2 == 0)
+ off2 = vfs_llseek(file, off, SEEK_HOLE);
+ if (off2 == -ENXIO)
+ return -ENODATA; /* Beyond EOF */
+ if (off2 < 0 && off2 >= (loff_t)-MAX_ERRNO)
+ return -ENOBUFS; /* Error. */
+
+ /* Round away partial blocks */
+ off = round_up(off, granularity);
+ off2 = round_down(off2, granularity);
+ if (off2 <= off)
+ return -ENODATA;
+
+ *_data_start = off;
+ if (off2 > start + len)
+ *_data_len = len;
+ else
+ *_data_len = off2 - off;
+ return 0;
+}
+
+/*
* Handle completion of a write to the cache.
*/
static void cachefiles_write_complete(struct kiocb *iocb, long ret)
@@ -255,7 +312,6 @@ static int cachefiles_write(struct netfs_cache_resources *cres,
ki->iocb.ki_filp = file;
ki->iocb.ki_pos = start_pos;
ki->iocb.ki_flags = IOCB_DIRECT | IOCB_WRITE;
- ki->iocb.ki_hint = ki_hint_validate(file_write_hint(file));
ki->iocb.ki_ioprio = get_current_ioprio();
ki->object = object;
ki->inval_counter = cres->inval_counter;
@@ -545,6 +601,7 @@ static const struct netfs_cache_ops cachefiles_netfs_cache_ops = {
.write = cachefiles_write,
.prepare_read = cachefiles_prepare_read,
.prepare_write = cachefiles_prepare_write,
+ .query_occupancy = cachefiles_query_occupancy,
};
/*
diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c
index 83f41bd0c3a9..35465109d9c4 100644
--- a/fs/cachefiles/xattr.c
+++ b/fs/cachefiles/xattr.c
@@ -28,6 +28,11 @@ struct cachefiles_xattr {
static const char cachefiles_xattr_cache[] =
XATTR_USER_PREFIX "CacheFiles.cache";
+struct cachefiles_vol_xattr {
+ __be32 reserved; /* Reserved, should be 0 */
+ __u8 data[]; /* netfs volume coherency data */
+} __packed;
+
/*
* set the state xattr on a cache file
*/
@@ -185,6 +190,7 @@ void cachefiles_prepare_to_write(struct fscache_cookie *cookie)
*/
bool cachefiles_set_volume_xattr(struct cachefiles_volume *volume)
{
+ struct cachefiles_vol_xattr *buf;
unsigned int len = volume->vcookie->coherency_len;
const void *p = volume->vcookie->coherency;
struct dentry *dentry = volume->dentry;
@@ -192,10 +198,17 @@ bool cachefiles_set_volume_xattr(struct cachefiles_volume *volume)
_enter("%x,#%d", volume->vcookie->debug_id, len);
+ len += sizeof(*buf);
+ buf = kmalloc(len, GFP_KERNEL);
+ if (!buf)
+ return false;
+ buf->reserved = cpu_to_be32(0);
+ memcpy(buf->data, p, len);
+
ret = cachefiles_inject_write_error();
if (ret == 0)
ret = vfs_setxattr(&init_user_ns, dentry, cachefiles_xattr_cache,
- p, len, 0);
+ buf, len, 0);
if (ret < 0) {
trace_cachefiles_vfs_error(NULL, d_inode(dentry), ret,
cachefiles_trace_setxattr_error);
@@ -209,6 +222,7 @@ bool cachefiles_set_volume_xattr(struct cachefiles_volume *volume)
cachefiles_coherency_vol_set_ok);
}
+ kfree(buf);
_leave(" = %d", ret);
return ret == 0;
}
@@ -218,7 +232,7 @@ bool cachefiles_set_volume_xattr(struct cachefiles_volume *volume)
*/
int cachefiles_check_volume_xattr(struct cachefiles_volume *volume)
{
- struct cachefiles_xattr *buf;
+ struct cachefiles_vol_xattr *buf;
struct dentry *dentry = volume->dentry;
unsigned int len = volume->vcookie->coherency_len;
const void *p = volume->vcookie->coherency;
@@ -228,6 +242,7 @@ int cachefiles_check_volume_xattr(struct cachefiles_volume *volume)
_enter("");
+ len += sizeof(*buf);
buf = kmalloc(len, GFP_KERNEL);
if (!buf)
return -ENOMEM;
@@ -245,7 +260,9 @@ int cachefiles_check_volume_xattr(struct cachefiles_volume *volume)
"Failed to read xattr with error %zd", xlen);
}
why = cachefiles_coherency_vol_check_xattr;
- } else if (memcmp(buf->data, p, len) != 0) {
+ } else if (buf->reserved != cpu_to_be32(0)) {
+ why = cachefiles_coherency_vol_check_resv;
+ } else if (memcmp(buf->data, p, len - sizeof(*buf)) != 0) {
why = cachefiles_coherency_vol_check_cmp;
} else {
why = cachefiles_coherency_vol_check_ok;
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index c98e5238a1b6..c7a0ab0d298b 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -76,18 +76,17 @@ static inline struct ceph_snap_context *page_snap_context(struct page *page)
* Dirty a page. Optimistically adjust accounting, on the assumption
* that we won't race with invalidate. If we do, readjust.
*/
-static int ceph_set_page_dirty(struct page *page)
+static bool ceph_dirty_folio(struct address_space *mapping, struct folio *folio)
{
- struct address_space *mapping = page->mapping;
struct inode *inode;
struct ceph_inode_info *ci;
struct ceph_snap_context *snapc;
- if (PageDirty(page)) {
- dout("%p set_page_dirty %p idx %lu -- already dirty\n",
- mapping->host, page, page->index);
- BUG_ON(!PagePrivate(page));
- return 0;
+ if (folio_test_dirty(folio)) {
+ dout("%p dirty_folio %p idx %lu -- already dirty\n",
+ mapping->host, folio, folio->index);
+ BUG_ON(!folio_get_private(folio));
+ return false;
}
inode = mapping->host;
@@ -111,56 +110,56 @@ static int ceph_set_page_dirty(struct page *page)
if (ci->i_wrbuffer_ref == 0)
ihold(inode);
++ci->i_wrbuffer_ref;
- dout("%p set_page_dirty %p idx %lu head %d/%d -> %d/%d "
+ dout("%p dirty_folio %p idx %lu head %d/%d -> %d/%d "
"snapc %p seq %lld (%d snaps)\n",
- mapping->host, page, page->index,
+ mapping->host, folio, folio->index,
ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref_head-1,
ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
snapc, snapc->seq, snapc->num_snaps);
spin_unlock(&ci->i_ceph_lock);
/*
- * Reference snap context in page->private. Also set
- * PagePrivate so that we get invalidatepage callback.
+ * Reference snap context in folio->private. Also set
+ * PagePrivate so that we get invalidate_folio callback.
*/
- BUG_ON(PagePrivate(page));
- attach_page_private(page, snapc);
+ BUG_ON(folio_get_private(folio));
+ folio_attach_private(folio, snapc);
- return ceph_fscache_set_page_dirty(page);
+ return ceph_fscache_dirty_folio(mapping, folio);
}
/*
- * If we are truncating the full page (i.e. offset == 0), adjust the
- * dirty page counters appropriately. Only called if there is private
- * data on the page.
+ * If we are truncating the full folio (i.e. offset == 0), adjust the
+ * dirty folio counters appropriately. Only called if there is private
+ * data on the folio.
*/
-static void ceph_invalidatepage(struct page *page, unsigned int offset,
- unsigned int length)
+static void ceph_invalidate_folio(struct folio *folio, size_t offset,
+ size_t length)
{
struct inode *inode;
struct ceph_inode_info *ci;
struct ceph_snap_context *snapc;
- inode = page->mapping->host;
+ inode = folio->mapping->host;
ci = ceph_inode(inode);
- if (offset != 0 || length != thp_size(page)) {
- dout("%p invalidatepage %p idx %lu partial dirty page %u~%u\n",
- inode, page, page->index, offset, length);
+ if (offset != 0 || length != folio_size(folio)) {
+ dout("%p invalidate_folio idx %lu partial dirty page %zu~%zu\n",
+ inode, folio->index, offset, length);
return;
}
- WARN_ON(!PageLocked(page));
- if (PagePrivate(page)) {
- dout("%p invalidatepage %p idx %lu full dirty page\n",
- inode, page, page->index);
+ WARN_ON(!folio_test_locked(folio));
+ if (folio_get_private(folio)) {
+ dout("%p invalidate_folio idx %lu full dirty page\n",
+ inode, folio->index);
- snapc = detach_page_private(page);
+ snapc = folio_detach_private(folio);
ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
ceph_put_snap_context(snapc);
}
- wait_on_page_fscache(page);
+ folio_wait_fscache(folio);
}
static int ceph_releasepage(struct page *page, gfp_t gfp)
@@ -185,7 +184,7 @@ static int ceph_releasepage(struct page *page, gfp_t gfp)
static void ceph_netfs_expand_readahead(struct netfs_read_request *rreq)
{
- struct inode *inode = rreq->mapping->host;
+ struct inode *inode = rreq->inode;
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_file_layout *lo = &ci->i_layout;
u32 blockoff;
@@ -202,7 +201,7 @@ static void ceph_netfs_expand_readahead(struct netfs_read_request *rreq)
static bool ceph_netfs_clamp_length(struct netfs_read_subrequest *subreq)
{
- struct inode *inode = subreq->rreq->mapping->host;
+ struct inode *inode = subreq->rreq->inode;
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
u64 objno, objoff;
@@ -245,10 +244,63 @@ static void finish_netfs_read(struct ceph_osd_request *req)
iput(req->r_inode);
}
+static bool ceph_netfs_issue_op_inline(struct netfs_read_subrequest *subreq)
+{
+ struct netfs_read_request *rreq = subreq->rreq;
+ struct inode *inode = rreq->inode;
+ struct ceph_mds_reply_info_parsed *rinfo;
+ struct ceph_mds_reply_info_in *iinfo;
+ struct ceph_mds_request *req;
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct iov_iter iter;
+ ssize_t err = 0;
+ size_t len;
+
+ __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
+ __clear_bit(NETFS_SREQ_WRITE_TO_CACHE, &subreq->flags);
+
+ if (subreq->start >= inode->i_size)
+ goto out;
+
+ /* We need to fetch the inline data. */
+ req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS);
+ if (IS_ERR(req)) {
+ err = PTR_ERR(req);
+ goto out;
+ }
+ req->r_ino1 = ci->i_vino;
+ req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INLINE_DATA);
+ req->r_num_caps = 2;
+
+ err = ceph_mdsc_do_request(mdsc, NULL, req);
+ if (err < 0)
+ goto out;
+
+ rinfo = &req->r_reply_info;
+ iinfo = &rinfo->targeti;
+ if (iinfo->inline_version == CEPH_INLINE_NONE) {
+ /* The data got uninlined */
+ ceph_mdsc_put_request(req);
+ return false;
+ }
+
+ len = min_t(size_t, iinfo->inline_len - subreq->start, subreq->len);
+ iov_iter_xarray(&iter, READ, &rreq->mapping->i_pages, subreq->start, len);
+ err = copy_to_iter(iinfo->inline_data + subreq->start, len, &iter);
+ if (err == 0)
+ err = -EFAULT;
+
+ ceph_mdsc_put_request(req);
+out:
+ netfs_subreq_terminated(subreq, err, false);
+ return true;
+}
+
static void ceph_netfs_issue_op(struct netfs_read_subrequest *subreq)
{
struct netfs_read_request *rreq = subreq->rreq;
- struct inode *inode = rreq->mapping->host;
+ struct inode *inode = rreq->inode;
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
struct ceph_osd_request *req;
@@ -259,6 +311,10 @@ static void ceph_netfs_issue_op(struct netfs_read_subrequest *subreq)
int err = 0;
u64 len = subreq->len;
+ if (ci->i_inline_version != CEPH_INLINE_NONE &&
+ ceph_netfs_issue_op_inline(subreq))
+ return;
+
req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, subreq->start, &len,
0, 1, CEPH_OSD_OP_READ,
CEPH_OSD_FLAG_READ | fsc->client->osdc.client->options->read_from_replica,
@@ -327,23 +383,9 @@ static int ceph_readpage(struct file *file, struct page *subpage)
size_t len = folio_size(folio);
u64 off = folio_file_pos(folio);
- if (ci->i_inline_version != CEPH_INLINE_NONE) {
- /*
- * Uptodate inline data should have been added
- * into page cache while getting Fcr caps.
- */
- if (off == 0) {
- folio_unlock(folio);
- return -EINVAL;
- }
- zero_user_segment(&folio->page, 0, folio_size(folio));
- folio_mark_uptodate(folio);
- folio_unlock(folio);
- return 0;
- }
-
- dout("readpage ino %llx.%llx file %p off %llu len %zu folio %p index %lu\n",
- vino.ino, vino.snap, file, off, len, folio, folio_index(folio));
+ dout("readpage ino %llx.%llx file %p off %llu len %zu folio %p index %lu\n inline %d",
+ vino.ino, vino.snap, file, off, len, folio, folio_index(folio),
+ ci->i_inline_version != CEPH_INLINE_NONE);
return netfs_readpage(file, folio, &ceph_netfs_read_ops, NULL);
}
@@ -516,6 +558,7 @@ static u64 get_writepages_data_length(struct inode *inode,
*/
static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
{
+ struct folio *folio = page_folio(page);
struct inode *inode = page->mapping->host;
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
@@ -550,8 +593,9 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
/* is this a partial page at end of file? */
if (page_off >= ceph_wbc.i_size) {
- dout("%p page eof %llu\n", page, ceph_wbc.i_size);
- page->mapping->a_ops->invalidatepage(page, 0, thp_size(page));
+ dout("folio at %lu beyond eof %llu\n", folio->index,
+ ceph_wbc.i_size);
+ folio_invalidate(folio, 0, folio_size(folio));
return 0;
}
@@ -563,7 +607,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
if (atomic_long_inc_return(&fsc->writeback_count) >
CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb))
- set_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC);
+ fsc->write_congested = true;
req = ceph_osdc_new_request(osdc, &ci->i_layout, ceph_vino(inode), page_off, &len, 0, 1,
CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, snapc,
@@ -623,7 +667,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
if (atomic_long_dec_return(&fsc->writeback_count) <
CONGESTION_OFF_THRESH(fsc->mount_options->congestion_kb))
- clear_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC);
+ fsc->write_congested = false;
return err;
}
@@ -635,6 +679,10 @@ static int ceph_writepage(struct page *page, struct writeback_control *wbc)
BUG_ON(!inode);
ihold(inode);
+ if (wbc->sync_mode == WB_SYNC_NONE &&
+ ceph_inode_to_client(inode)->write_congested)
+ return AOP_WRITEPAGE_ACTIVATE;
+
wait_on_page_fscache(page);
err = writepage_nounlock(page, wbc);
@@ -707,8 +755,7 @@ static void writepages_finish(struct ceph_osd_request *req)
if (atomic_long_dec_return(&fsc->writeback_count) <
CONGESTION_OFF_THRESH(
fsc->mount_options->congestion_kb))
- clear_bdi_congested(inode_to_bdi(inode),
- BLK_RW_ASYNC);
+ fsc->write_congested = false;
ceph_put_snap_context(detach_page_private(page));
end_page_writeback(page);
@@ -760,6 +807,10 @@ static int ceph_writepages_start(struct address_space *mapping,
bool done = false;
bool caching = ceph_is_cache_enabled(inode);
+ if (wbc->sync_mode == WB_SYNC_NONE &&
+ fsc->write_congested)
+ return 0;
+
dout("writepages_start %p (mode=%s)\n", inode,
wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
(wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
@@ -867,14 +918,16 @@ get_more_pages:
continue;
}
if (page_offset(page) >= ceph_wbc.i_size) {
- dout("%p page eof %llu\n",
- page, ceph_wbc.i_size);
+ struct folio *folio = page_folio(page);
+
+ dout("folio at %lu beyond eof %llu\n",
+ folio->index, ceph_wbc.i_size);
if ((ceph_wbc.size_stable ||
- page_offset(page) >= i_size_read(inode)) &&
- clear_page_dirty_for_io(page))
- mapping->a_ops->invalidatepage(page,
- 0, thp_size(page));
- unlock_page(page);
+ folio_pos(folio) >= i_size_read(inode)) &&
+ folio_clear_dirty_for_io(folio))
+ folio_invalidate(folio, 0,
+ folio_size(folio));
+ folio_unlock(folio);
continue;
}
if (strip_unit_end && (page->index > strip_unit_end)) {
@@ -954,11 +1007,8 @@ get_more_pages:
if (atomic_long_inc_return(&fsc->writeback_count) >
CONGESTION_ON_THRESH(
- fsc->mount_options->congestion_kb)) {
- set_bdi_congested(inode_to_bdi(inode),
- BLK_RW_ASYNC);
- }
-
+ fsc->mount_options->congestion_kb))
+ fsc->write_congested = true;
pages[locked_pages++] = page;
pvec.pages[i] = NULL;
@@ -1274,45 +1324,11 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping,
struct page **pagep, void **fsdata)
{
struct inode *inode = file_inode(file);
- struct ceph_inode_info *ci = ceph_inode(inode);
struct folio *folio = NULL;
- pgoff_t index = pos >> PAGE_SHIFT;
int r;
- /*
- * Uninlining should have already been done and everything updated, EXCEPT
- * for inline_version sent to the MDS.
- */
- if (ci->i_inline_version != CEPH_INLINE_NONE) {
- unsigned int fgp_flags = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE;
- if (aop_flags & AOP_FLAG_NOFS)
- fgp_flags |= FGP_NOFS;
- folio = __filemap_get_folio(mapping, index, fgp_flags,
- mapping_gfp_mask(mapping));
- if (!folio)
- return -ENOMEM;
-
- /*
- * The inline_version on a new inode is set to 1. If that's the
- * case, then the folio is brand new and isn't yet Uptodate.
- */
- r = 0;
- if (index == 0 && ci->i_inline_version != 1) {
- if (!folio_test_uptodate(folio)) {
- WARN_ONCE(1, "ceph: write_begin called on still-inlined inode (inline_version %llu)!\n",
- ci->i_inline_version);
- r = -EINVAL;
- }
- goto out;
- }
- zero_user_segment(&folio->page, 0, folio_size(folio));
- folio_mark_uptodate(folio);
- goto out;
- }
-
r = netfs_write_begin(file, inode->i_mapping, pos, len, 0, &folio, NULL,
&ceph_netfs_read_ops, NULL);
-out:
if (r == 0)
folio_wait_fscache(folio);
if (r < 0) {
@@ -1372,8 +1388,8 @@ const struct address_space_operations ceph_aops = {
.writepages = ceph_writepages_start,
.write_begin = ceph_write_begin,
.write_end = ceph_write_end,
- .set_page_dirty = ceph_set_page_dirty,
- .invalidatepage = ceph_invalidatepage,
+ .dirty_folio = ceph_dirty_folio,
+ .invalidate_folio = ceph_invalidate_folio,
.releasepage = ceph_releasepage,
.direct_IO = noop_direct_IO,
};
@@ -1508,19 +1524,6 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
sb_start_pagefault(inode->i_sb);
ceph_block_sigs(&oldset);
- if (ci->i_inline_version != CEPH_INLINE_NONE) {
- struct page *locked_page = NULL;
- if (off == 0) {
- lock_page(page);
- locked_page = page;
- }
- err = ceph_uninline_data(vma->vm_file, locked_page);
- if (locked_page)
- unlock_page(locked_page);
- if (err < 0)
- goto out_free;
- }
-
if (off + thp_size(page) <= size)
len = thp_size(page);
else
@@ -1577,11 +1580,9 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
ceph_put_snap_context(snapc);
} while (err == 0);
- if (ret == VM_FAULT_LOCKED ||
- ci->i_inline_version != CEPH_INLINE_NONE) {
+ if (ret == VM_FAULT_LOCKED) {
int dirty;
spin_lock(&ci->i_ceph_lock);
- ci->i_inline_version = CEPH_INLINE_NONE;
dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
&prealloc_cf);
spin_unlock(&ci->i_ceph_lock);
@@ -1645,16 +1646,30 @@ void ceph_fill_inline_data(struct inode *inode, struct page *locked_page,
}
}
-int ceph_uninline_data(struct file *filp, struct page *locked_page)
+int ceph_uninline_data(struct file *file)
{
- struct inode *inode = file_inode(filp);
+ struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
struct ceph_osd_request *req;
- struct page *page = NULL;
- u64 len, inline_version;
+ struct ceph_cap_flush *prealloc_cf;
+ struct folio *folio = NULL;
+ u64 inline_version = CEPH_INLINE_NONE;
+ struct page *pages[1];
int err = 0;
- bool from_pagecache = false;
+ u64 len;
+
+ prealloc_cf = ceph_alloc_cap_flush();
+ if (!prealloc_cf)
+ return -ENOMEM;
+
+ folio = read_mapping_folio(inode->i_mapping, 0, file);
+ if (IS_ERR(folio)) {
+ err = PTR_ERR(folio);
+ goto out;
+ }
+
+ folio_lock(folio);
spin_lock(&ci->i_ceph_lock);
inline_version = ci->i_inline_version;
@@ -1665,45 +1680,11 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
if (inline_version == 1 || /* initial version, no data */
inline_version == CEPH_INLINE_NONE)
- goto out;
-
- if (locked_page) {
- page = locked_page;
- WARN_ON(!PageUptodate(page));
- } else if (ceph_caps_issued(ci) &
- (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) {
- page = find_get_page(inode->i_mapping, 0);
- if (page) {
- if (PageUptodate(page)) {
- from_pagecache = true;
- lock_page(page);
- } else {
- put_page(page);
- page = NULL;
- }
- }
- }
+ goto out_unlock;
- if (page) {
- len = i_size_read(inode);
- if (len > PAGE_SIZE)
- len = PAGE_SIZE;
- } else {
- page = __page_cache_alloc(GFP_NOFS);
- if (!page) {
- err = -ENOMEM;
- goto out;
- }
- err = __ceph_do_getattr(inode, page,
- CEPH_STAT_CAP_INLINE_DATA, true);
- if (err < 0) {
- /* no inline data */
- if (err == -ENODATA)
- err = 0;
- goto out;
- }
- len = err;
- }
+ len = i_size_read(inode);
+ if (len > folio_size(folio))
+ len = folio_size(folio);
req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
ceph_vino(inode), 0, &len, 0, 1,
@@ -1711,7 +1692,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
NULL, 0, 0, false);
if (IS_ERR(req)) {
err = PTR_ERR(req);
- goto out;
+ goto out_unlock;
}
req->r_mtime = inode->i_mtime;
@@ -1720,7 +1701,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
err = ceph_osdc_wait_request(&fsc->client->osdc, req);
ceph_osdc_put_request(req);
if (err < 0)
- goto out;
+ goto out_unlock;
req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
ceph_vino(inode), 0, &len, 1, 3,
@@ -1729,10 +1710,11 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
ci->i_truncate_size, false);
if (IS_ERR(req)) {
err = PTR_ERR(req);
- goto out;
+ goto out_unlock;
}
- osd_req_op_extent_osd_data_pages(req, 1, &page, len, 0, false, false);
+ pages[0] = folio_page(folio, 0);
+ osd_req_op_extent_osd_data_pages(req, 1, pages, len, 0, false, false);
{
__le64 xattr_buf = cpu_to_le64(inline_version);
@@ -1742,7 +1724,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
CEPH_OSD_CMPXATTR_OP_GT,
CEPH_OSD_CMPXATTR_MODE_U64);
if (err)
- goto out_put;
+ goto out_put_req;
}
{
@@ -1753,7 +1735,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
"inline_version",
xattr_buf, xattr_len, 0, 0);
if (err)
- goto out_put;
+ goto out_put_req;
}
req->r_mtime = inode->i_mtime;
@@ -1764,19 +1746,28 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency,
req->r_end_latency, len, err);
-out_put:
+ if (!err) {
+ int dirty;
+
+ /* Set to CAP_INLINE_NONE and dirty the caps */
+ down_read(&fsc->mdsc->snap_rwsem);
+ spin_lock(&ci->i_ceph_lock);
+ ci->i_inline_version = CEPH_INLINE_NONE;
+ dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, &prealloc_cf);
+ spin_unlock(&ci->i_ceph_lock);
+ up_read(&fsc->mdsc->snap_rwsem);
+ if (dirty)
+ __mark_inode_dirty(inode, dirty);
+ }
+out_put_req:
ceph_osdc_put_request(req);
if (err == -ECANCELED)
err = 0;
+out_unlock:
+ folio_unlock(folio);
+ folio_put(folio);
out:
- if (page && page != locked_page) {
- if (from_pagecache) {
- unlock_page(page);
- put_page(page);
- } else
- __free_pages(page, 0);
- }
-
+ ceph_free_cap_flush(prealloc_cf);
dout("uninline_data %p %llx.%llx inline_version %llu = %d\n",
inode, ceph_vinop(inode), inline_version, err);
return err;
diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h
index 09164389fa66..b90f3016994d 100644
--- a/fs/ceph/cache.h
+++ b/fs/ceph/cache.h
@@ -54,12 +54,12 @@ static inline void ceph_fscache_unpin_writeback(struct inode *inode,
fscache_unpin_writeback(wbc, ceph_fscache_cookie(ceph_inode(inode)));
}
-static inline int ceph_fscache_set_page_dirty(struct page *page)
+static inline int ceph_fscache_dirty_folio(struct address_space *mapping,
+ struct folio *folio)
{
- struct inode *inode = page->mapping->host;
- struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_inode_info *ci = ceph_inode(mapping->host);
- return fscache_set_page_dirty(page, ceph_fscache_cookie(ci));
+ return fscache_dirty_folio(mapping, folio, ceph_fscache_cookie(ci));
}
static inline int ceph_begin_cache_operation(struct netfs_read_request *rreq)
@@ -133,9 +133,10 @@ static inline void ceph_fscache_unpin_writeback(struct inode *inode,
{
}
-static inline int ceph_fscache_set_page_dirty(struct page *page)
+static inline int ceph_fscache_dirty_folio(struct address_space *mapping,
+ struct folio *folio)
{
- return __set_page_dirty_nobuffers(page);
+ return filemap_dirty_folio(mapping, folio);
}
static inline bool ceph_is_cache_enabled(struct inode *inode)
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index b472cd066d1c..f1ad6884d4da 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1915,6 +1915,13 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
ceph_get_mds_session(session);
spin_lock(&ci->i_ceph_lock);
+ if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) {
+ /* Don't send messages until we get async create reply */
+ spin_unlock(&ci->i_ceph_lock);
+ ceph_put_mds_session(session);
+ return;
+ }
+
if (ci->i_ceph_flags & CEPH_I_FLUSH)
flags |= CHECK_CAPS_FLUSH;
retry:
@@ -2409,6 +2416,9 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
dout("write_inode %p wait=%d\n", inode, wait);
ceph_fscache_unpin_writeback(inode, wbc);
if (wait) {
+ err = ceph_wait_on_async_create(inode);
+ if (err)
+ return err;
dirty = try_flush_caps(inode, &flush_tid);
if (dirty)
err = wait_event_interruptible(ci->i_cap_wq,
@@ -2439,6 +2449,10 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
u64 first_tid = 0;
u64 last_snap_flush = 0;
+ /* Don't do anything until create reply comes in */
+ if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE)
+ return;
+
ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
list_for_each_entry_reverse(cf, &ci->i_cap_flush_list, i_list) {
@@ -4152,7 +4166,6 @@ void ceph_handle_caps(struct ceph_mds_session *session,
/* lookup ino */
inode = ceph_find_inode(mdsc->fsc->sb, vino);
- ci = ceph_inode(inode);
dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino,
vino.snap, inode);
@@ -4178,6 +4191,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
}
goto flush_cap_releases;
}
+ ci = ceph_inode(inode);
/* these will work even if we don't have a cap yet */
switch (op) {
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 3cf7c9c1085b..bec3c4549c07 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -175,7 +175,7 @@ static int metrics_latency_show(struct seq_file *s, void *p)
struct ceph_fs_client *fsc = s->private;
struct ceph_client_metric *cm = &fsc->mdsc->metric;
struct ceph_metric *m;
- s64 total, sum, avg, min, max, sq;
+ s64 total, avg, min, max, sq;
int i;
seq_printf(s, "item total avg_lat(us) min_lat(us) max_lat(us) stdev(us)\n");
@@ -185,8 +185,7 @@ static int metrics_latency_show(struct seq_file *s, void *p)
m = &cm->metric[i];
spin_lock(&m->lock);
total = m->total;
- sum = m->latency_sum;
- avg = total > 0 ? DIV64_U64_ROUND_CLOSEST(sum, total) : 0;
+ avg = m->latency_avg;
min = m->latency_min;
max = m->latency_max;
sq = m->latency_sq_sum;
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 133dbd9338e7..eae417d71136 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -145,7 +145,7 @@ __dcache_find_get_entry(struct dentry *parent, u64 idx,
return ERR_PTR(-EAGAIN);
}
/* reading/filling the cache are serialized by
- i_mutex, no need to use page lock */
+ i_rwsem, no need to use page lock */
unlock_page(cache_ctl->page);
cache_ctl->dentries = kmap(cache_ctl->page);
}
@@ -155,7 +155,7 @@ __dcache_find_get_entry(struct dentry *parent, u64 idx,
rcu_read_lock();
spin_lock(&parent->d_lock);
/* check i_size again here, because empty directory can be
- * marked as complete while not holding the i_mutex. */
+ * marked as complete while not holding the i_rwsem. */
if (ceph_dir_is_complete_ordered(dir) && ptr_pos < i_size_read(dir))
dentry = cache_ctl->dentries[cache_ctl->index];
else
@@ -478,8 +478,11 @@ more:
2 : (fpos_off(rde->offset) + 1);
err = note_last_dentry(dfi, rde->name, rde->name_len,
next_offset);
- if (err)
+ if (err) {
+ ceph_mdsc_put_request(dfi->last_readdir);
+ dfi->last_readdir = NULL;
return err;
+ }
} else if (req->r_reply_info.dir_end) {
dfi->next_offset = 2;
/* keep last name */
@@ -520,6 +523,12 @@ more:
if (!dir_emit(ctx, rde->name, rde->name_len,
ceph_present_ino(inode->i_sb, le64_to_cpu(rde->inode.in->ino)),
le32_to_cpu(rde->inode.in->mode) >> 12)) {
+ /*
+ * NOTE: Here no need to put the 'dfi->last_readdir',
+ * because when dir_emit stops us it's most likely
+ * doesn't have enough memory, etc. So for next readdir
+ * it will continue.
+ */
dout("filldir stopping us...\n");
return 0;
}
@@ -671,7 +680,7 @@ struct dentry *ceph_handle_snapdir(struct ceph_mds_request *req,
struct dentry *dentry)
{
struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
- struct inode *parent = d_inode(dentry->d_parent); /* we hold i_mutex */
+ struct inode *parent = d_inode(dentry->d_parent); /* we hold i_rwsem */
/* .snap dir? */
if (ceph_snap(parent) == CEPH_NOSNAP &&
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index bbed3224ad68..feb75eb1cd82 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -207,6 +207,7 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
struct ceph_mount_options *opt =
ceph_inode_to_client(&ci->vfs_inode)->mount_options;
struct ceph_file_info *fi;
+ int ret;
dout("%s %p %p 0%o (%s)\n", __func__, inode, file,
inode->i_mode, isdir ? "dir" : "regular");
@@ -240,7 +241,22 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
INIT_LIST_HEAD(&fi->rw_contexts);
fi->filp_gen = READ_ONCE(ceph_inode_to_client(inode)->filp_gen);
+ if ((file->f_mode & FMODE_WRITE) &&
+ ci->i_inline_version != CEPH_INLINE_NONE) {
+ ret = ceph_uninline_data(file);
+ if (ret < 0)
+ goto error;
+ }
+
return 0;
+
+error:
+ ceph_fscache_unuse_cookie(inode, file->f_mode & FMODE_WRITE);
+ ceph_put_fmode(ci, fi->fmode, 1);
+ kmem_cache_free(ceph_file_cachep, fi);
+ /* wake up anyone waiting for caps on this inode */
+ wake_up_all(&ci->i_cap_wq);
+ return ret;
}
/*
@@ -516,52 +532,67 @@ static void restore_deleg_ino(struct inode *dir, u64 ino)
}
}
+static void wake_async_create_waiters(struct inode *inode,
+ struct ceph_mds_session *session)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+
+ spin_lock(&ci->i_ceph_lock);
+ if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) {
+ ci->i_ceph_flags &= ~CEPH_I_ASYNC_CREATE;
+ wake_up_bit(&ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT);
+ }
+ ceph_kick_flushing_inode_caps(session, ci);
+ spin_unlock(&ci->i_ceph_lock);
+}
+
static void ceph_async_create_cb(struct ceph_mds_client *mdsc,
struct ceph_mds_request *req)
{
+ struct dentry *dentry = req->r_dentry;
+ struct inode *dinode = d_inode(dentry);
+ struct inode *tinode = req->r_target_inode;
int result = req->r_err ? req->r_err :
le32_to_cpu(req->r_reply_info.head->result);
+ WARN_ON_ONCE(dinode && tinode && dinode != tinode);
+
+ /* MDS changed -- caller must resubmit */
if (result == -EJUKEBOX)
goto out;
mapping_set_error(req->r_parent->i_mapping, result);
if (result) {
- struct dentry *dentry = req->r_dentry;
- struct inode *inode = d_inode(dentry);
int pathlen = 0;
u64 base = 0;
char *path = ceph_mdsc_build_path(req->r_dentry, &pathlen,
&base, 0);
+ pr_warn("ceph: async create failure path=(%llx)%s result=%d!\n",
+ base, IS_ERR(path) ? "<<bad>>" : path, result);
+ ceph_mdsc_free_path(path, pathlen);
+
ceph_dir_clear_complete(req->r_parent);
if (!d_unhashed(dentry))
d_drop(dentry);
- ceph_inode_shutdown(inode);
-
- pr_warn("ceph: async create failure path=(%llx)%s result=%d!\n",
- base, IS_ERR(path) ? "<<bad>>" : path, result);
- ceph_mdsc_free_path(path, pathlen);
+ if (dinode) {
+ mapping_set_error(dinode->i_mapping, result);
+ ceph_inode_shutdown(dinode);
+ wake_async_create_waiters(dinode, req->r_session);
+ }
}
- if (req->r_target_inode) {
- struct ceph_inode_info *ci = ceph_inode(req->r_target_inode);
- u64 ino = ceph_vino(req->r_target_inode).ino;
+ if (tinode) {
+ u64 ino = ceph_vino(tinode).ino;
if (req->r_deleg_ino != ino)
pr_warn("%s: inode number mismatch! err=%d deleg_ino=0x%llx target=0x%llx\n",
__func__, req->r_err, req->r_deleg_ino, ino);
- mapping_set_error(req->r_target_inode->i_mapping, result);
- spin_lock(&ci->i_ceph_lock);
- if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) {
- ci->i_ceph_flags &= ~CEPH_I_ASYNC_CREATE;
- wake_up_bit(&ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT);
- }
- ceph_kick_flushing_inode_caps(req->r_session, ci);
- spin_unlock(&ci->i_ceph_lock);
+ mapping_set_error(tinode->i_mapping, result);
+ wake_async_create_waiters(tinode, req->r_session);
} else if (!result) {
pr_warn("%s: no req->r_target_inode for 0x%llx\n", __func__,
req->r_deleg_ino);
@@ -1041,7 +1072,6 @@ static void ceph_aio_complete(struct inode *inode,
}
spin_lock(&ci->i_ceph_lock);
- ci->i_inline_version = CEPH_INLINE_NONE;
dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
&aio_req->prealloc_cf);
spin_unlock(&ci->i_ceph_lock);
@@ -1778,12 +1808,6 @@ retry_snap:
if (err)
goto out;
- if (ci->i_inline_version != CEPH_INLINE_NONE) {
- err = ceph_uninline_data(file, NULL);
- if (err < 0)
- goto out;
- }
-
dout("aio_write %p %llx.%llx %llu~%zd getting caps. i_size %llu\n",
inode, ceph_vinop(inode), pos, count, i_size_read(inode));
if (!(fi->flags & CEPH_F_SYNC) && !direct_lock)
@@ -1855,7 +1879,6 @@ retry_snap:
int dirty;
spin_lock(&ci->i_ceph_lock);
- ci->i_inline_version = CEPH_INLINE_NONE;
dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
&prealloc_cf);
spin_unlock(&ci->i_ceph_lock);
@@ -2109,12 +2132,6 @@ static long ceph_fallocate(struct file *file, int mode,
goto unlock;
}
- if (ci->i_inline_version != CEPH_INLINE_NONE) {
- ret = ceph_uninline_data(file, NULL);
- if (ret < 0)
- goto unlock;
- }
-
size = i_size_read(inode);
/* Are we punching a hole beyond EOF? */
@@ -2139,7 +2156,6 @@ static long ceph_fallocate(struct file *file, int mode,
if (!ret) {
spin_lock(&ci->i_ceph_lock);
- ci->i_inline_version = CEPH_INLINE_NONE;
dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
&prealloc_cf);
spin_unlock(&ci->i_ceph_lock);
@@ -2532,7 +2548,6 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
}
/* Mark Fw dirty */
spin_lock(&dst_ci->i_ceph_lock);
- dst_ci->i_inline_version = CEPH_INLINE_NONE;
dirty = __ceph_mark_dirty_caps(dst_ci, CEPH_CAP_FILE_WR, &prealloc_cf);
spin_unlock(&dst_ci->i_ceph_lock);
if (dirty)
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index ef4a980a7bf3..d80911dc91c2 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -87,13 +87,13 @@ struct inode *ceph_get_snapdir(struct inode *parent)
if (!S_ISDIR(parent->i_mode)) {
pr_warn_once("bad snapdir parent type (mode=0%o)\n",
parent->i_mode);
- return ERR_PTR(-ENOTDIR);
+ goto err;
}
if (!(inode->i_state & I_NEW) && !S_ISDIR(inode->i_mode)) {
pr_warn_once("bad snapdir inode type (mode=0%o)\n",
inode->i_mode);
- return ERR_PTR(-ENOTDIR);
+ goto err;
}
inode->i_mode = parent->i_mode;
@@ -113,6 +113,12 @@ struct inode *ceph_get_snapdir(struct inode *parent)
}
return inode;
+err:
+ if ((inode->i_state & I_NEW))
+ discard_new_inode(inode);
+ else
+ iput(inode);
+ return ERR_PTR(-ENOTDIR);
}
const struct inode_operations ceph_file_iops = {
@@ -447,7 +453,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
struct ceph_inode_info *ci;
int i;
- ci = kmem_cache_alloc(ceph_inode_cachep, GFP_NOFS);
+ ci = alloc_inode_sb(sb, ceph_inode_cachep, GFP_NOFS);
if (!ci)
return NULL;
@@ -1201,7 +1207,7 @@ out_unlock:
/*
* splice a dentry to an inode.
- * caller must hold directory i_mutex for this to be safe.
+ * caller must hold directory i_rwsem for this to be safe.
*/
static int splice_dentry(struct dentry **pdn, struct inode *in)
{
@@ -1598,7 +1604,7 @@ static int fill_readdir_cache(struct inode *dir, struct dentry *dn,
return idx == 0 ? -ENOMEM : 0;
}
/* reading/filling the cache are serialized by
- * i_mutex, no need to use page lock */
+ * i_rwsem, no need to use page lock */
unlock_page(ctl->page);
ctl->dentries = kmap(ctl->page);
if (idx == 0)
@@ -2301,6 +2307,57 @@ int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
return err;
}
+int ceph_do_getvxattr(struct inode *inode, const char *name, void *value,
+ size_t size)
+{
+ struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
+ struct ceph_mds_client *mdsc = fsc->mdsc;
+ struct ceph_mds_request *req;
+ int mode = USE_AUTH_MDS;
+ int err;
+ char *xattr_value;
+ size_t xattr_value_len;
+
+ req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETVXATTR, mode);
+ if (IS_ERR(req)) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ req->r_path2 = kstrdup(name, GFP_NOFS);
+ if (!req->r_path2) {
+ err = -ENOMEM;
+ goto put;
+ }
+
+ ihold(inode);
+ req->r_inode = inode;
+ err = ceph_mdsc_do_request(mdsc, NULL, req);
+ if (err < 0)
+ goto put;
+
+ xattr_value = req->r_reply_info.xattr_info.xattr_value;
+ xattr_value_len = req->r_reply_info.xattr_info.xattr_value_len;
+
+ dout("do_getvxattr xattr_value_len:%zu, size:%zu\n", xattr_value_len, size);
+
+ err = (int)xattr_value_len;
+ if (size == 0)
+ goto put;
+
+ if (xattr_value_len > size) {
+ err = -ERANGE;
+ goto put;
+ }
+
+ memcpy(value, xattr_value, xattr_value_len);
+put:
+ ceph_mdsc_put_request(req);
+out:
+ dout("do_getvxattr result=%d\n", err);
+ return err;
+}
+
/*
* Check inode permissions. We verify we have a valid value for
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index d1f154aec249..3e2843e86e27 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -111,10 +111,10 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode,
req->r_args.filelock_change.length = cpu_to_le64(length);
req->r_args.filelock_change.wait = wait;
- if (wait)
- req->r_wait_for_completion = ceph_lock_wait_for_completion;
-
- err = ceph_mdsc_do_request(mdsc, inode, req);
+ err = ceph_mdsc_submit_request(mdsc, inode, req);
+ if (!err)
+ err = ceph_mdsc_wait_request(mdsc, req, wait ?
+ ceph_lock_wait_for_completion : NULL);
if (!err && operation == CEPH_MDS_OP_GETFILELOCK) {
fl->fl_pid = -le64_to_cpu(req->r_reply_info.filelock_reply->pid);
if (CEPH_LOCK_SHARED == req->r_reply_info.filelock_reply->type)
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index c30eefc0ac19..fa38c013126d 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -555,6 +555,28 @@ bad:
return -EIO;
}
+static int parse_reply_info_getvxattr(void **p, void *end,
+ struct ceph_mds_reply_info_parsed *info,
+ u64 features)
+{
+ u32 value_len;
+
+ ceph_decode_skip_8(p, end, bad); /* skip current version: 1 */
+ ceph_decode_skip_8(p, end, bad); /* skip first version: 1 */
+ ceph_decode_skip_32(p, end, bad); /* skip payload length */
+
+ ceph_decode_32_safe(p, end, value_len, bad);
+
+ if (value_len == end - *p) {
+ info->xattr_info.xattr_value = *p;
+ info->xattr_info.xattr_value_len = value_len;
+ *p = end;
+ return value_len;
+ }
+bad:
+ return -EIO;
+}
+
/*
* parse extra results
*/
@@ -570,6 +592,8 @@ static int parse_reply_info_extra(void **p, void *end,
return parse_reply_info_readdir(p, end, info, features);
else if (op == CEPH_MDS_OP_CREATE)
return parse_reply_info_create(p, end, info, features, s);
+ else if (op == CEPH_MDS_OP_GETVXATTR)
+ return parse_reply_info_getvxattr(p, end, info, features);
else
return -EIO;
}
@@ -2178,7 +2202,8 @@ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
order = get_order(size * num_entries);
while (order >= 0) {
rinfo->dir_entries = (void*)__get_free_pages(GFP_KERNEL |
- __GFP_NOWARN,
+ __GFP_NOWARN |
+ __GFP_ZERO,
order);
if (rinfo->dir_entries)
break;
@@ -2946,15 +2971,16 @@ int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, struct inode *dir,
return err;
}
-static int ceph_mdsc_wait_request(struct ceph_mds_client *mdsc,
- struct ceph_mds_request *req)
+int ceph_mdsc_wait_request(struct ceph_mds_client *mdsc,
+ struct ceph_mds_request *req,
+ ceph_mds_request_wait_callback_t wait_func)
{
int err;
/* wait */
dout("do_request waiting\n");
- if (!req->r_timeout && req->r_wait_for_completion) {
- err = req->r_wait_for_completion(mdsc, req);
+ if (wait_func) {
+ err = wait_func(mdsc, req);
} else {
long timeleft = wait_for_completion_killable_timeout(
&req->r_completion,
@@ -3011,7 +3037,7 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
/* issue */
err = ceph_mdsc_submit_request(mdsc, dir, req);
if (!err)
- err = ceph_mdsc_wait_request(mdsc, req);
+ err = ceph_mdsc_wait_request(mdsc, req, NULL);
dout("do_request %p done, result %d\n", req, err);
return err;
}
@@ -3097,35 +3123,6 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
result = le32_to_cpu(head->result);
- /*
- * Handle an ESTALE
- * if we're not talking to the authority, send to them
- * if the authority has changed while we weren't looking,
- * send to new authority
- * Otherwise we just have to return an ESTALE
- */
- if (result == -ESTALE) {
- dout("got ESTALE on request %llu\n", req->r_tid);
- req->r_resend_mds = -1;
- if (req->r_direct_mode != USE_AUTH_MDS) {
- dout("not using auth, setting for that now\n");
- req->r_direct_mode = USE_AUTH_MDS;
- __do_request(mdsc, req);
- mutex_unlock(&mdsc->mutex);
- goto out;
- } else {
- int mds = __choose_mds(mdsc, req, NULL);
- if (mds >= 0 && mds != req->r_session->s_mds) {
- dout("but auth changed, so resending\n");
- __do_request(mdsc, req);
- mutex_unlock(&mdsc->mutex);
- goto out;
- }
- }
- dout("have to return ESTALE on request %llu\n", req->r_tid);
- }
-
-
if (head->safe) {
set_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags);
__unregister_request(mdsc, req);
@@ -4841,7 +4838,7 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
mutex_unlock(&mdsc->mutex);
ceph_cleanup_snapid_map(mdsc);
- ceph_cleanup_empty_realms(mdsc);
+ ceph_cleanup_global_and_empty_realms(mdsc);
cancel_work_sync(&mdsc->cap_reclaim_work);
cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 97c7f7bfa55f..33497846e47e 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -100,6 +100,11 @@ struct ceph_mds_reply_dir_entry {
loff_t offset;
};
+struct ceph_mds_reply_xattr {
+ char *xattr_value;
+ size_t xattr_value_len;
+};
+
/*
* parsed info about an mds reply, including information about
* either: 1) the target inode and/or its parent directory and dentry,
@@ -115,6 +120,7 @@ struct ceph_mds_reply_info_parsed {
char *dname;
u32 dname_len;
struct ceph_mds_reply_lease *dlease;
+ struct ceph_mds_reply_xattr xattr_info;
/* extra */
union {
@@ -274,8 +280,8 @@ struct ceph_mds_request {
union ceph_mds_request_args r_args;
int r_fmode; /* file mode, if expecting cap */
- const struct cred *r_cred;
int r_request_release_offset;
+ const struct cred *r_cred;
struct timespec64 r_stamp;
/* for choosing which mds to send this request to */
@@ -296,12 +302,11 @@ struct ceph_mds_request {
struct ceph_msg *r_reply;
struct ceph_mds_reply_info_parsed r_reply_info;
int r_err;
-
+ u32 r_readdir_offset;
struct page *r_locked_page;
int r_dir_caps;
int r_num_caps;
- u32 r_readdir_offset;
unsigned long r_timeout; /* optional. jiffies, 0 is "wait forever" */
unsigned long r_started; /* start time to measure timeout against */
@@ -329,7 +334,6 @@ struct ceph_mds_request {
struct completion r_completion;
struct completion r_safe_completion;
ceph_mds_request_callback_t r_callback;
- ceph_mds_request_wait_callback_t r_wait_for_completion;
struct list_head r_unsafe_item; /* per-session unsafe list item */
long long r_dir_release_cnt;
@@ -507,6 +511,9 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode);
extern int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
struct inode *dir,
struct ceph_mds_request *req);
+int ceph_mdsc_wait_request(struct ceph_mds_client *mdsc,
+ struct ceph_mds_request *req,
+ ceph_mds_request_wait_callback_t wait_func);
extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
struct inode *dir,
struct ceph_mds_request *req);
diff --git a/fs/ceph/metric.c b/fs/ceph/metric.c
index 0fcba68f9a99..c47347d2e84e 100644
--- a/fs/ceph/metric.c
+++ b/fs/ceph/metric.c
@@ -8,6 +8,12 @@
#include "metric.h"
#include "mds_client.h"
+static void ktime_to_ceph_timespec(struct ceph_timespec *ts, ktime_t val)
+{
+ struct timespec64 t = ktime_to_timespec64(val);
+ ceph_encode_timespec64(ts, &t);
+}
+
static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc,
struct ceph_mds_session *s)
{
@@ -26,7 +32,6 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc,
u64 nr_caps = atomic64_read(&m->total_caps);
u32 header_len = sizeof(struct ceph_metric_header);
struct ceph_msg *msg;
- struct timespec64 ts;
s64 sum;
s32 items = 0;
s32 len;
@@ -59,37 +64,40 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc,
/* encode the read latency metric */
read = (struct ceph_metric_read_latency *)(cap + 1);
read->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_READ_LATENCY);
- read->header.ver = 1;
+ read->header.ver = 2;
read->header.compat = 1;
read->header.data_len = cpu_to_le32(sizeof(*read) - header_len);
sum = m->metric[METRIC_READ].latency_sum;
- jiffies_to_timespec64(sum, &ts);
- read->sec = cpu_to_le32(ts.tv_sec);
- read->nsec = cpu_to_le32(ts.tv_nsec);
+ ktime_to_ceph_timespec(&read->lat, sum);
+ ktime_to_ceph_timespec(&read->avg, m->metric[METRIC_READ].latency_avg);
+ read->sq_sum = cpu_to_le64(m->metric[METRIC_READ].latency_sq_sum);
+ read->count = cpu_to_le64(m->metric[METRIC_READ].total);
items++;
/* encode the write latency metric */
write = (struct ceph_metric_write_latency *)(read + 1);
write->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_WRITE_LATENCY);
- write->header.ver = 1;
+ write->header.ver = 2;
write->header.compat = 1;
write->header.data_len = cpu_to_le32(sizeof(*write) - header_len);
sum = m->metric[METRIC_WRITE].latency_sum;
- jiffies_to_timespec64(sum, &ts);
- write->sec = cpu_to_le32(ts.tv_sec);
- write->nsec = cpu_to_le32(ts.tv_nsec);
+ ktime_to_ceph_timespec(&write->lat, sum);
+ ktime_to_ceph_timespec(&write->avg, m->metric[METRIC_WRITE].latency_avg);
+ write->sq_sum = cpu_to_le64(m->metric[METRIC_WRITE].latency_sq_sum);
+ write->count = cpu_to_le64(m->metric[METRIC_WRITE].total);
items++;
/* encode the metadata latency metric */
meta = (struct ceph_metric_metadata_latency *)(write + 1);
meta->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_METADATA_LATENCY);
- meta->header.ver = 1;
+ meta->header.ver = 2;
meta->header.compat = 1;
meta->header.data_len = cpu_to_le32(sizeof(*meta) - header_len);
sum = m->metric[METRIC_METADATA].latency_sum;
- jiffies_to_timespec64(sum, &ts);
- meta->sec = cpu_to_le32(ts.tv_sec);
- meta->nsec = cpu_to_le32(ts.tv_nsec);
+ ktime_to_ceph_timespec(&meta->lat, sum);
+ ktime_to_ceph_timespec(&meta->avg, m->metric[METRIC_METADATA].latency_avg);
+ meta->sq_sum = cpu_to_le64(m->metric[METRIC_METADATA].latency_sq_sum);
+ meta->count = cpu_to_le64(m->metric[METRIC_METADATA].total);
items++;
/* encode the dentry lease metric */
@@ -250,6 +258,7 @@ int ceph_metric_init(struct ceph_client_metric *m)
metric->size_max = 0;
metric->total = 0;
metric->latency_sum = 0;
+ metric->latency_avg = 0;
metric->latency_sq_sum = 0;
metric->latency_min = KTIME_MAX;
metric->latency_max = 0;
@@ -307,20 +316,19 @@ void ceph_metric_destroy(struct ceph_client_metric *m)
max = new; \
}
-static inline void __update_stdev(ktime_t total, ktime_t lsum,
- ktime_t *sq_sump, ktime_t lat)
+static inline void __update_mean_and_stdev(ktime_t total, ktime_t *lavg,
+ ktime_t *sq_sump, ktime_t lat)
{
- ktime_t avg, sq;
-
- if (unlikely(total == 1))
- return;
-
- /* the sq is (lat - old_avg) * (lat - new_avg) */
- avg = DIV64_U64_ROUND_CLOSEST((lsum - lat), (total - 1));
- sq = lat - avg;
- avg = DIV64_U64_ROUND_CLOSEST(lsum, total);
- sq = sq * (lat - avg);
- *sq_sump += sq;
+ ktime_t avg;
+
+ if (unlikely(total == 1)) {
+ *lavg = lat;
+ } else {
+ /* the sq is (lat - old_avg) * (lat - new_avg) */
+ avg = *lavg + div64_s64(lat - *lavg, total);
+ *sq_sump += (lat - *lavg)*(lat - avg);
+ *lavg = avg;
+ }
}
void ceph_update_metrics(struct ceph_metric *m,
@@ -339,6 +347,7 @@ void ceph_update_metrics(struct ceph_metric *m,
METRIC_UPDATE_MIN_MAX(m->size_min, m->size_max, size);
m->latency_sum += lat;
METRIC_UPDATE_MIN_MAX(m->latency_min, m->latency_max, lat);
- __update_stdev(total, m->latency_sum, &m->latency_sq_sum, lat);
+ __update_mean_and_stdev(total, &m->latency_avg, &m->latency_sq_sum,
+ lat);
spin_unlock(&m->lock);
}
diff --git a/fs/ceph/metric.h b/fs/ceph/metric.h
index bb45608181e7..0d0c44bd3332 100644
--- a/fs/ceph/metric.h
+++ b/fs/ceph/metric.h
@@ -2,7 +2,7 @@
#ifndef _FS_CEPH_MDS_METRIC_H
#define _FS_CEPH_MDS_METRIC_H
-#include <linux/types.h>
+#include <linux/ceph/types.h>
#include <linux/percpu_counter.h>
#include <linux/ktime.h>
@@ -19,27 +19,39 @@ enum ceph_metric_type {
CLIENT_METRIC_TYPE_OPENED_INODES,
CLIENT_METRIC_TYPE_READ_IO_SIZES,
CLIENT_METRIC_TYPE_WRITE_IO_SIZES,
-
- CLIENT_METRIC_TYPE_MAX = CLIENT_METRIC_TYPE_WRITE_IO_SIZES,
+ CLIENT_METRIC_TYPE_AVG_READ_LATENCY,
+ CLIENT_METRIC_TYPE_STDEV_READ_LATENCY,
+ CLIENT_METRIC_TYPE_AVG_WRITE_LATENCY,
+ CLIENT_METRIC_TYPE_STDEV_WRITE_LATENCY,
+ CLIENT_METRIC_TYPE_AVG_METADATA_LATENCY,
+ CLIENT_METRIC_TYPE_STDEV_METADATA_LATENCY,
+
+ CLIENT_METRIC_TYPE_MAX = CLIENT_METRIC_TYPE_STDEV_METADATA_LATENCY,
};
/*
* This will always have the highest metric bit value
* as the last element of the array.
*/
-#define CEPHFS_METRIC_SPEC_CLIENT_SUPPORTED { \
- CLIENT_METRIC_TYPE_CAP_INFO, \
- CLIENT_METRIC_TYPE_READ_LATENCY, \
- CLIENT_METRIC_TYPE_WRITE_LATENCY, \
- CLIENT_METRIC_TYPE_METADATA_LATENCY, \
- CLIENT_METRIC_TYPE_DENTRY_LEASE, \
- CLIENT_METRIC_TYPE_OPENED_FILES, \
- CLIENT_METRIC_TYPE_PINNED_ICAPS, \
- CLIENT_METRIC_TYPE_OPENED_INODES, \
- CLIENT_METRIC_TYPE_READ_IO_SIZES, \
- CLIENT_METRIC_TYPE_WRITE_IO_SIZES, \
- \
- CLIENT_METRIC_TYPE_MAX, \
+#define CEPHFS_METRIC_SPEC_CLIENT_SUPPORTED { \
+ CLIENT_METRIC_TYPE_CAP_INFO, \
+ CLIENT_METRIC_TYPE_READ_LATENCY, \
+ CLIENT_METRIC_TYPE_WRITE_LATENCY, \
+ CLIENT_METRIC_TYPE_METADATA_LATENCY, \
+ CLIENT_METRIC_TYPE_DENTRY_LEASE, \
+ CLIENT_METRIC_TYPE_OPENED_FILES, \
+ CLIENT_METRIC_TYPE_PINNED_ICAPS, \
+ CLIENT_METRIC_TYPE_OPENED_INODES, \
+ CLIENT_METRIC_TYPE_READ_IO_SIZES, \
+ CLIENT_METRIC_TYPE_WRITE_IO_SIZES, \
+ CLIENT_METRIC_TYPE_AVG_READ_LATENCY, \
+ CLIENT_METRIC_TYPE_STDEV_READ_LATENCY, \
+ CLIENT_METRIC_TYPE_AVG_WRITE_LATENCY, \
+ CLIENT_METRIC_TYPE_STDEV_WRITE_LATENCY, \
+ CLIENT_METRIC_TYPE_AVG_METADATA_LATENCY, \
+ CLIENT_METRIC_TYPE_STDEV_METADATA_LATENCY, \
+ \
+ CLIENT_METRIC_TYPE_MAX, \
}
struct ceph_metric_header {
@@ -60,22 +72,28 @@ struct ceph_metric_cap {
/* metric read latency header */
struct ceph_metric_read_latency {
struct ceph_metric_header header;
- __le32 sec;
- __le32 nsec;
+ struct ceph_timespec lat;
+ struct ceph_timespec avg;
+ __le64 sq_sum;
+ __le64 count;
} __packed;
/* metric write latency header */
struct ceph_metric_write_latency {
struct ceph_metric_header header;
- __le32 sec;
- __le32 nsec;
+ struct ceph_timespec lat;
+ struct ceph_timespec avg;
+ __le64 sq_sum;
+ __le64 count;
} __packed;
/* metric metadata latency header */
struct ceph_metric_metadata_latency {
struct ceph_metric_header header;
- __le32 sec;
- __le32 nsec;
+ struct ceph_timespec lat;
+ struct ceph_timespec avg;
+ __le64 sq_sum;
+ __le64 count;
} __packed;
/* metric dentry lease header */
@@ -140,6 +158,7 @@ struct ceph_metric {
u64 size_min;
u64 size_max;
ktime_t latency_sum;
+ ktime_t latency_avg;
ktime_t latency_sq_sum;
ktime_t latency_min;
ktime_t latency_max;
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index b41e6724c591..322ee5add942 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -121,18 +121,23 @@ static struct ceph_snap_realm *ceph_create_snap_realm(
if (!realm)
return ERR_PTR(-ENOMEM);
- atomic_set(&realm->nref, 1); /* for caller */
+ /* Do not release the global dummy snaprealm until unmouting */
+ if (ino == CEPH_INO_GLOBAL_SNAPREALM)
+ atomic_set(&realm->nref, 2);
+ else
+ atomic_set(&realm->nref, 1);
realm->ino = ino;
INIT_LIST_HEAD(&realm->children);
INIT_LIST_HEAD(&realm->child_item);
INIT_LIST_HEAD(&realm->empty_item);
INIT_LIST_HEAD(&realm->dirty_item);
+ INIT_LIST_HEAD(&realm->rebuild_item);
INIT_LIST_HEAD(&realm->inodes_with_caps);
spin_lock_init(&realm->inodes_with_caps_lock);
__insert_snap_realm(&mdsc->snap_realms, realm);
mdsc->num_snap_realms++;
- dout("create_snap_realm %llx %p\n", realm->ino, realm);
+ dout("%s %llx %p\n", __func__, realm->ino, realm);
return realm;
}
@@ -156,7 +161,7 @@ static struct ceph_snap_realm *__lookup_snap_realm(struct ceph_mds_client *mdsc,
else if (ino > r->ino)
n = n->rb_right;
else {
- dout("lookup_snap_realm %llx %p\n", r->ino, r);
+ dout("%s %llx %p\n", __func__, r->ino, r);
return r;
}
}
@@ -184,7 +189,7 @@ static void __destroy_snap_realm(struct ceph_mds_client *mdsc,
{
lockdep_assert_held_write(&mdsc->snap_rwsem);
- dout("__destroy_snap_realm %p %llx\n", realm, realm->ino);
+ dout("%s %p %llx\n", __func__, realm, realm->ino);
rb_erase(&realm->node, &mdsc->snap_realms);
mdsc->num_snap_realms--;
@@ -260,9 +265,14 @@ static void __cleanup_empty_realms(struct ceph_mds_client *mdsc)
spin_unlock(&mdsc->snap_empty_lock);
}
-void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc)
+void ceph_cleanup_global_and_empty_realms(struct ceph_mds_client *mdsc)
{
+ struct ceph_snap_realm *global_realm;
+
down_write(&mdsc->snap_rwsem);
+ global_realm = __lookup_snap_realm(mdsc, CEPH_INO_GLOBAL_SNAPREALM);
+ if (global_realm)
+ ceph_put_snap_realm(mdsc, global_realm);
__cleanup_empty_realms(mdsc);
up_write(&mdsc->snap_rwsem);
}
@@ -292,9 +302,8 @@ static int adjust_snap_realm_parent(struct ceph_mds_client *mdsc,
if (IS_ERR(parent))
return PTR_ERR(parent);
}
- dout("adjust_snap_realm_parent %llx %p: %llx %p -> %llx %p\n",
- realm->ino, realm, realm->parent_ino, realm->parent,
- parentino, parent);
+ dout("%s %llx %p: %llx %p -> %llx %p\n", __func__, realm->ino,
+ realm, realm->parent_ino, realm->parent, parentino, parent);
if (realm->parent) {
list_del_init(&realm->child_item);
ceph_put_snap_realm(mdsc, realm->parent);
@@ -320,7 +329,8 @@ static int cmpu64_rev(const void *a, const void *b)
* build the snap context for a given realm.
*/
static int build_snap_context(struct ceph_snap_realm *realm,
- struct list_head* dirty_realms)
+ struct list_head *realm_queue,
+ struct list_head *dirty_realms)
{
struct ceph_snap_realm *parent = realm->parent;
struct ceph_snap_context *snapc;
@@ -334,9 +344,9 @@ static int build_snap_context(struct ceph_snap_realm *realm,
*/
if (parent) {
if (!parent->cached_context) {
- err = build_snap_context(parent, dirty_realms);
- if (err)
- goto fail;
+ /* add to the queue head */
+ list_add(&parent->rebuild_item, realm_queue);
+ return 1;
}
num += parent->cached_context->num_snaps;
}
@@ -349,9 +359,8 @@ static int build_snap_context(struct ceph_snap_realm *realm,
realm->cached_context->seq == realm->seq &&
(!parent ||
realm->cached_context->seq >= parent->cached_context->seq)) {
- dout("build_snap_context %llx %p: %p seq %lld (%u snaps)"
- " (unchanged)\n",
- realm->ino, realm, realm->cached_context,
+ dout("%s %llx %p: %p seq %lld (%u snaps) (unchanged)\n",
+ __func__, realm->ino, realm, realm->cached_context,
realm->cached_context->seq,
(unsigned int)realm->cached_context->num_snaps);
return 0;
@@ -390,9 +399,8 @@ static int build_snap_context(struct ceph_snap_realm *realm,
sort(snapc->snaps, num, sizeof(u64), cmpu64_rev, NULL);
snapc->num_snaps = num;
- dout("build_snap_context %llx %p: %p seq %lld (%u snaps)\n",
- realm->ino, realm, snapc, snapc->seq,
- (unsigned int) snapc->num_snaps);
+ dout("%s %llx %p: %p seq %lld (%u snaps)\n", __func__, realm->ino,
+ realm, snapc, snapc->seq, (unsigned int) snapc->num_snaps);
ceph_put_snap_context(realm->cached_context);
realm->cached_context = snapc;
@@ -409,8 +417,7 @@ fail:
ceph_put_snap_context(realm->cached_context);
realm->cached_context = NULL;
}
- pr_err("build_snap_context %llx %p fail %d\n", realm->ino,
- realm, err);
+ pr_err("%s %llx %p fail %d\n", __func__, realm->ino, realm, err);
return err;
}
@@ -420,13 +427,50 @@ fail:
static void rebuild_snap_realms(struct ceph_snap_realm *realm,
struct list_head *dirty_realms)
{
- struct ceph_snap_realm *child;
+ LIST_HEAD(realm_queue);
+ int last = 0;
+ bool skip = false;
+
+ list_add_tail(&realm->rebuild_item, &realm_queue);
+
+ while (!list_empty(&realm_queue)) {
+ struct ceph_snap_realm *_realm, *child;
+
+ _realm = list_first_entry(&realm_queue,
+ struct ceph_snap_realm,
+ rebuild_item);
+
+ /*
+ * If the last building failed dues to memory
+ * issue, just empty the realm_queue and return
+ * to avoid infinite loop.
+ */
+ if (last < 0) {
+ list_del_init(&_realm->rebuild_item);
+ continue;
+ }
+
+ last = build_snap_context(_realm, &realm_queue, dirty_realms);
+ dout("%s %llx %p, %s\n", __func__, _realm->ino, _realm,
+ last > 0 ? "is deferred" : !last ? "succeeded" : "failed");
+
+ /* is any child in the list ? */
+ list_for_each_entry(child, &_realm->children, child_item) {
+ if (!list_empty(&child->rebuild_item)) {
+ skip = true;
+ break;
+ }
+ }
- dout("rebuild_snap_realms %llx %p\n", realm->ino, realm);
- build_snap_context(realm, dirty_realms);
+ if (!skip) {
+ list_for_each_entry(child, &_realm->children, child_item)
+ list_add_tail(&child->rebuild_item, &realm_queue);
+ }
- list_for_each_entry(child, &realm->children, child_item)
- rebuild_snap_realms(child, dirty_realms);
+ /* last == 1 means need to build parent first */
+ if (last <= 0)
+ list_del_init(&_realm->rebuild_item);
+ }
}
@@ -474,23 +518,15 @@ static bool has_new_snaps(struct ceph_snap_context *o,
* Caller must hold snap_rwsem for read (i.e., the realm topology won't
* change).
*/
-static void ceph_queue_cap_snap(struct ceph_inode_info *ci)
+static void ceph_queue_cap_snap(struct ceph_inode_info *ci,
+ struct ceph_cap_snap **pcapsnap)
{
struct inode *inode = &ci->vfs_inode;
- struct ceph_cap_snap *capsnap;
struct ceph_snap_context *old_snapc, *new_snapc;
+ struct ceph_cap_snap *capsnap = *pcapsnap;
struct ceph_buffer *old_blob = NULL;
int used, dirty;
- capsnap = kzalloc(sizeof(*capsnap), GFP_NOFS);
- if (!capsnap) {
- pr_err("ENOMEM allocating ceph_cap_snap on %p\n", inode);
- return;
- }
- capsnap->cap_flush.is_capsnap = true;
- INIT_LIST_HEAD(&capsnap->cap_flush.i_list);
- INIT_LIST_HEAD(&capsnap->cap_flush.g_list);
-
spin_lock(&ci->i_ceph_lock);
used = __ceph_caps_used(ci);
dirty = __ceph_caps_dirty(ci);
@@ -511,12 +547,14 @@ static void ceph_queue_cap_snap(struct ceph_inode_info *ci)
as no new writes are allowed to start when pending, so any
writes in progress now were started before the previous
cap_snap. lucky us. */
- dout("queue_cap_snap %p already pending\n", inode);
+ dout("%s %p %llx.%llx already pending\n",
+ __func__, inode, ceph_vinop(inode));
goto update_snapc;
}
if (ci->i_wrbuffer_ref_head == 0 &&
!(dirty & (CEPH_CAP_ANY_EXCL|CEPH_CAP_FILE_WR))) {
- dout("queue_cap_snap %p nothing dirty|writing\n", inode);
+ dout("%s %p %llx.%llx nothing dirty|writing\n",
+ __func__, inode, ceph_vinop(inode));
goto update_snapc;
}
@@ -536,20 +574,17 @@ static void ceph_queue_cap_snap(struct ceph_inode_info *ci)
} else {
if (!(used & CEPH_CAP_FILE_WR) &&
ci->i_wrbuffer_ref_head == 0) {
- dout("queue_cap_snap %p "
- "no new_snap|dirty_page|writing\n", inode);
+ dout("%s %p %llx.%llx no new_snap|dirty_page|writing\n",
+ __func__, inode, ceph_vinop(inode));
goto update_snapc;
}
}
- dout("queue_cap_snap %p cap_snap %p queuing under %p %s %s\n",
- inode, capsnap, old_snapc, ceph_cap_string(dirty),
- capsnap->need_flush ? "" : "no_flush");
+ dout("%s %p %llx.%llx cap_snap %p queuing under %p %s %s\n",
+ __func__, inode, ceph_vinop(inode), capsnap, old_snapc,
+ ceph_cap_string(dirty), capsnap->need_flush ? "" : "no_flush");
ihold(inode);
- refcount_set(&capsnap->nref, 1);
- INIT_LIST_HEAD(&capsnap->ci_item);
-
capsnap->follows = old_snapc->seq;
capsnap->issued = __ceph_caps_issued(ci, NULL);
capsnap->dirty = dirty;
@@ -579,31 +614,30 @@ static void ceph_queue_cap_snap(struct ceph_inode_info *ci)
list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps);
if (used & CEPH_CAP_FILE_WR) {
- dout("queue_cap_snap %p cap_snap %p snapc %p"
- " seq %llu used WR, now pending\n", inode,
+ dout("%s %p %llx.%llx cap_snap %p snapc %p seq %llu used WR,"
+ " now pending\n", __func__, inode, ceph_vinop(inode),
capsnap, old_snapc, old_snapc->seq);
capsnap->writing = 1;
} else {
/* note mtime, size NOW. */
__ceph_finish_cap_snap(ci, capsnap);
}
- capsnap = NULL;
+ *pcapsnap = NULL;
old_snapc = NULL;
update_snapc:
- if (ci->i_wrbuffer_ref_head == 0 &&
- ci->i_wr_ref == 0 &&
- ci->i_dirty_caps == 0 &&
- ci->i_flushing_caps == 0) {
- ci->i_head_snapc = NULL;
- } else {
+ if (ci->i_wrbuffer_ref_head == 0 &&
+ ci->i_wr_ref == 0 &&
+ ci->i_dirty_caps == 0 &&
+ ci->i_flushing_caps == 0) {
+ ci->i_head_snapc = NULL;
+ } else {
ci->i_head_snapc = ceph_get_snap_context(new_snapc);
dout(" new snapc is %p\n", new_snapc);
}
spin_unlock(&ci->i_ceph_lock);
ceph_buffer_put(old_blob);
- kfree(capsnap);
ceph_put_snap_context(old_snapc);
}
@@ -632,27 +666,28 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
capsnap->truncate_size = ci->i_truncate_size;
capsnap->truncate_seq = ci->i_truncate_seq;
if (capsnap->dirty_pages) {
- dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu "
- "still has %d dirty pages\n", inode, capsnap,
- capsnap->context, capsnap->context->seq,
- ceph_cap_string(capsnap->dirty), capsnap->size,
- capsnap->dirty_pages);
+ dout("%s %p %llx.%llx cap_snap %p snapc %p %llu %s s=%llu "
+ "still has %d dirty pages\n", __func__, inode,
+ ceph_vinop(inode), capsnap, capsnap->context,
+ capsnap->context->seq, ceph_cap_string(capsnap->dirty),
+ capsnap->size, capsnap->dirty_pages);
return 0;
}
/* Fb cap still in use, delay it */
if (ci->i_wb_ref) {
- dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu "
- "used WRBUFFER, delaying\n", inode, capsnap,
- capsnap->context, capsnap->context->seq,
- ceph_cap_string(capsnap->dirty), capsnap->size);
+ dout("%s %p %llx.%llx cap_snap %p snapc %p %llu %s s=%llu "
+ "used WRBUFFER, delaying\n", __func__, inode,
+ ceph_vinop(inode), capsnap, capsnap->context,
+ capsnap->context->seq, ceph_cap_string(capsnap->dirty),
+ capsnap->size);
capsnap->writing = 1;
return 0;
}
ci->i_ceph_flags |= CEPH_I_FLUSH_SNAPS;
- dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu\n",
- inode, capsnap, capsnap->context,
+ dout("%s %p %llx.%llx cap_snap %p snapc %p %llu %s s=%llu\n",
+ __func__, inode, ceph_vinop(inode), capsnap, capsnap->context,
capsnap->context->seq, ceph_cap_string(capsnap->dirty),
capsnap->size);
@@ -671,8 +706,9 @@ static void queue_realm_cap_snaps(struct ceph_snap_realm *realm)
{
struct ceph_inode_info *ci;
struct inode *lastinode = NULL;
+ struct ceph_cap_snap *capsnap = NULL;
- dout("queue_realm_cap_snaps %p %llx inodes\n", realm, realm->ino);
+ dout("%s %p %llx inode\n", __func__, realm, realm->ino);
spin_lock(&realm->inodes_with_caps_lock);
list_for_each_entry(ci, &realm->inodes_with_caps, i_snap_realm_item) {
@@ -682,13 +718,35 @@ static void queue_realm_cap_snaps(struct ceph_snap_realm *realm)
spin_unlock(&realm->inodes_with_caps_lock);
iput(lastinode);
lastinode = inode;
- ceph_queue_cap_snap(ci);
+
+ /*
+ * Allocate the capsnap memory outside of ceph_queue_cap_snap()
+ * to reduce very possible but unnecessary frequently memory
+ * allocate/free in this loop.
+ */
+ if (!capsnap) {
+ capsnap = kmem_cache_zalloc(ceph_cap_snap_cachep, GFP_NOFS);
+ if (!capsnap) {
+ pr_err("ENOMEM allocating ceph_cap_snap on %p\n",
+ inode);
+ return;
+ }
+ }
+ capsnap->cap_flush.is_capsnap = true;
+ refcount_set(&capsnap->nref, 1);
+ INIT_LIST_HEAD(&capsnap->cap_flush.i_list);
+ INIT_LIST_HEAD(&capsnap->cap_flush.g_list);
+ INIT_LIST_HEAD(&capsnap->ci_item);
+
+ ceph_queue_cap_snap(ci, &capsnap);
spin_lock(&realm->inodes_with_caps_lock);
}
spin_unlock(&realm->inodes_with_caps_lock);
iput(lastinode);
- dout("queue_realm_cap_snaps %p %llx done\n", realm, realm->ino);
+ if (capsnap)
+ kmem_cache_free(ceph_cap_snap_cachep, capsnap);
+ dout("%s %p %llx done\n", __func__, realm, realm->ino);
}
/*
@@ -707,14 +765,16 @@ int ceph_update_snap_trace(struct ceph_mds_client *mdsc,
__le64 *prior_parent_snaps; /* encoded */
struct ceph_snap_realm *realm = NULL;
struct ceph_snap_realm *first_realm = NULL;
- int invalidate = 0;
+ struct ceph_snap_realm *realm_to_rebuild = NULL;
+ int rebuild_snapcs;
int err = -ENOMEM;
LIST_HEAD(dirty_realms);
lockdep_assert_held_write(&mdsc->snap_rwsem);
- dout("update_snap_trace deletion=%d\n", deletion);
+ dout("%s deletion=%d\n", __func__, deletion);
more:
+ rebuild_snapcs = 0;
ceph_decode_need(&p, e, sizeof(*ri), bad);
ri = p;
p += sizeof(*ri);
@@ -738,10 +798,10 @@ more:
err = adjust_snap_realm_parent(mdsc, realm, le64_to_cpu(ri->parent));
if (err < 0)
goto fail;
- invalidate += err;
+ rebuild_snapcs += err;
if (le64_to_cpu(ri->seq) > realm->seq) {
- dout("update_snap_trace updating %llx %p %lld -> %lld\n",
+ dout("%s updating %llx %p %lld -> %lld\n", __func__,
realm->ino, realm, realm->seq, le64_to_cpu(ri->seq));
/* update realm parameters, snap lists */
realm->seq = le64_to_cpu(ri->seq);
@@ -763,22 +823,30 @@ more:
if (realm->seq > mdsc->last_snap_seq)
mdsc->last_snap_seq = realm->seq;
- invalidate = 1;
+ rebuild_snapcs = 1;
} else if (!realm->cached_context) {
- dout("update_snap_trace %llx %p seq %lld new\n",
+ dout("%s %llx %p seq %lld new\n", __func__,
realm->ino, realm, realm->seq);
- invalidate = 1;
+ rebuild_snapcs = 1;
} else {
- dout("update_snap_trace %llx %p seq %lld unchanged\n",
+ dout("%s %llx %p seq %lld unchanged\n", __func__,
realm->ino, realm, realm->seq);
}
- dout("done with %llx %p, invalidated=%d, %p %p\n", realm->ino,
- realm, invalidate, p, e);
+ dout("done with %llx %p, rebuild_snapcs=%d, %p %p\n", realm->ino,
+ realm, rebuild_snapcs, p, e);
+
+ /*
+ * this will always track the uppest parent realm from which
+ * we need to rebuild the snapshot contexts _downward_ in
+ * hierarchy.
+ */
+ if (rebuild_snapcs)
+ realm_to_rebuild = realm;
- /* invalidate when we reach the _end_ (root) of the trace */
- if (invalidate && p >= e)
- rebuild_snap_realms(realm, &dirty_realms);
+ /* rebuild_snapcs when we reach the _end_ (root) of the trace */
+ if (realm_to_rebuild && p >= e)
+ rebuild_snap_realms(realm_to_rebuild, &dirty_realms);
if (!first_realm)
first_realm = realm;
@@ -814,7 +882,7 @@ fail:
ceph_put_snap_realm(mdsc, realm);
if (first_realm)
ceph_put_snap_realm(mdsc, first_realm);
- pr_err("update_snap_trace error %d\n", err);
+ pr_err("%s error %d\n", __func__, err);
return err;
}
@@ -831,7 +899,7 @@ static void flush_snaps(struct ceph_mds_client *mdsc)
struct inode *inode;
struct ceph_mds_session *session = NULL;
- dout("flush_snaps\n");
+ dout("%s\n", __func__);
spin_lock(&mdsc->snap_flush_lock);
while (!list_empty(&mdsc->snap_flush_list)) {
ci = list_first_entry(&mdsc->snap_flush_list,
@@ -846,7 +914,7 @@ static void flush_snaps(struct ceph_mds_client *mdsc)
spin_unlock(&mdsc->snap_flush_lock);
ceph_put_mds_session(session);
- dout("flush_snaps done\n");
+ dout("%s done\n", __func__);
}
/**
@@ -928,8 +996,8 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
trace_len = le32_to_cpu(h->trace_len);
p += sizeof(*h);
- dout("handle_snap from mds%d op %s split %llx tracelen %d\n", mds,
- ceph_snap_op_name(op), split, trace_len);
+ dout("%s from mds%d op %s split %llx tracelen %d\n", __func__,
+ mds, ceph_snap_op_name(op), split, trace_len);
mutex_lock(&session->s_mutex);
inc_session_sequence(session);
@@ -989,13 +1057,13 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
*/
if (ci->i_snap_realm->created >
le64_to_cpu(ri->created)) {
- dout(" leaving %p in newer realm %llx %p\n",
- inode, ci->i_snap_realm->ino,
+ dout(" leaving %p %llx.%llx in newer realm %llx %p\n",
+ inode, ceph_vinop(inode), ci->i_snap_realm->ino,
ci->i_snap_realm);
goto skip_inode;
}
- dout(" will move %p to split realm %llx %p\n",
- inode, realm->ino, realm);
+ dout(" will move %p %llx.%llx to split realm %llx %p\n",
+ inode, ceph_vinop(inode), realm->ino, realm);
ceph_get_snap_realm(mdsc, realm);
ceph_change_snap_realm(inode, realm);
@@ -1038,7 +1106,7 @@ skip_inode:
return;
bad:
- pr_err("corrupt snap message from mds%d\n", mds);
+ pr_err("%s corrupt snap message from mds%d\n", __func__, mds);
ceph_msg_dump(msg);
out:
if (locked_rwsem)
@@ -1071,7 +1139,8 @@ struct ceph_snapid_map* ceph_get_snapid_map(struct ceph_mds_client *mdsc,
}
spin_unlock(&mdsc->snapid_map_lock);
if (exist) {
- dout("found snapid map %llx -> %x\n", exist->snap, exist->dev);
+ dout("%s found snapid map %llx -> %x\n", __func__,
+ exist->snap, exist->dev);
return exist;
}
@@ -1115,11 +1184,13 @@ struct ceph_snapid_map* ceph_get_snapid_map(struct ceph_mds_client *mdsc,
if (exist) {
free_anon_bdev(sm->dev);
kfree(sm);
- dout("found snapid map %llx -> %x\n", exist->snap, exist->dev);
+ dout("%s found snapid map %llx -> %x\n", __func__,
+ exist->snap, exist->dev);
return exist;
}
- dout("create snapid map %llx -> %x\n", sm->snap, sm->dev);
+ dout("%s create snapid map %llx -> %x\n", __func__,
+ sm->snap, sm->dev);
return sm;
}
diff --git a/fs/ceph/strings.c b/fs/ceph/strings.c
index 573bb9556fb5..e36e8948e728 100644
--- a/fs/ceph/strings.c
+++ b/fs/ceph/strings.c
@@ -60,6 +60,7 @@ const char *ceph_mds_op_name(int op)
case CEPH_MDS_OP_LOOKUPINO: return "lookupino";
case CEPH_MDS_OP_LOOKUPNAME: return "lookupname";
case CEPH_MDS_OP_GETATTR: return "getattr";
+ case CEPH_MDS_OP_GETVXATTR: return "getvxattr";
case CEPH_MDS_OP_SETXATTR: return "setxattr";
case CEPH_MDS_OP_SETATTR: return "setattr";
case CEPH_MDS_OP_RMXATTR: return "rmxattr";
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index bf79f369aec6..e6987d295079 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -802,6 +802,7 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
fsc->have_copy_from2 = true;
atomic_long_set(&fsc->writeback_count, 0);
+ fsc->write_congested = false;
err = -ENOMEM;
/*
@@ -864,6 +865,7 @@ static void destroy_fs_client(struct ceph_fs_client *fsc)
*/
struct kmem_cache *ceph_inode_cachep;
struct kmem_cache *ceph_cap_cachep;
+struct kmem_cache *ceph_cap_snap_cachep;
struct kmem_cache *ceph_cap_flush_cachep;
struct kmem_cache *ceph_dentry_cachep;
struct kmem_cache *ceph_file_cachep;
@@ -892,6 +894,9 @@ static int __init init_caches(void)
ceph_cap_cachep = KMEM_CACHE(ceph_cap, SLAB_MEM_SPREAD);
if (!ceph_cap_cachep)
goto bad_cap;
+ ceph_cap_snap_cachep = KMEM_CACHE(ceph_cap_snap, SLAB_MEM_SPREAD);
+ if (!ceph_cap_snap_cachep)
+ goto bad_cap_snap;
ceph_cap_flush_cachep = KMEM_CACHE(ceph_cap_flush,
SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
if (!ceph_cap_flush_cachep)
@@ -931,6 +936,8 @@ bad_file:
bad_dentry:
kmem_cache_destroy(ceph_cap_flush_cachep);
bad_cap_flush:
+ kmem_cache_destroy(ceph_cap_snap_cachep);
+bad_cap_snap:
kmem_cache_destroy(ceph_cap_cachep);
bad_cap:
kmem_cache_destroy(ceph_inode_cachep);
@@ -947,6 +954,7 @@ static void destroy_caches(void)
kmem_cache_destroy(ceph_inode_cachep);
kmem_cache_destroy(ceph_cap_cachep);
+ kmem_cache_destroy(ceph_cap_snap_cachep);
kmem_cache_destroy(ceph_cap_flush_cachep);
kmem_cache_destroy(ceph_dentry_cachep);
kmem_cache_destroy(ceph_file_cachep);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 67f145e1ae7a..a1ecc410a495 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -121,6 +121,7 @@ struct ceph_fs_client {
struct ceph_mds_client *mdsc;
atomic_long_t writeback_count;
+ bool write_congested;
struct workqueue_struct *inode_wq;
struct workqueue_struct *cap_wq;
@@ -230,7 +231,7 @@ static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap)
if (refcount_dec_and_test(&capsnap->nref)) {
if (capsnap->xattr_blob)
ceph_buffer_put(capsnap->xattr_blob);
- kfree(capsnap);
+ kmem_cache_free(ceph_cap_snap_cachep, capsnap);
}
}
@@ -883,6 +884,8 @@ struct ceph_snap_realm {
struct list_head dirty_item; /* if realm needs new context */
+ struct list_head rebuild_item; /* rebuild snap realms _downward_ in hierarchy */
+
/* the current set of snaps for this realm */
struct ceph_snap_context *cached_context;
@@ -938,7 +941,7 @@ extern void ceph_handle_snap(struct ceph_mds_client *mdsc,
struct ceph_msg *msg);
extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
struct ceph_cap_snap *capsnap);
-extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc);
+extern void ceph_cleanup_global_and_empty_realms(struct ceph_mds_client *mdsc);
extern struct ceph_snapid_map *ceph_get_snapid_map(struct ceph_mds_client *mdsc,
u64 snap);
@@ -1048,6 +1051,7 @@ static inline bool ceph_inode_is_shutdown(struct inode *inode)
/* xattr.c */
int __ceph_setxattr(struct inode *, const char *, const void *, size_t, int);
+int ceph_do_getvxattr(struct inode *inode, const char *name, void *value, size_t size);
ssize_t __ceph_getxattr(struct inode *, const char *, void *, size_t);
extern ssize_t ceph_listxattr(struct dentry *, char *, size_t);
extern struct ceph_buffer *__ceph_build_xattrs_blob(struct ceph_inode_info *ci);
@@ -1213,7 +1217,7 @@ extern void __ceph_touch_fmode(struct ceph_inode_info *ci,
/* addr.c */
extern const struct address_space_operations ceph_aops;
extern int ceph_mmap(struct file *file, struct vm_area_struct *vma);
-extern int ceph_uninline_data(struct file *filp, struct page *locked_page);
+extern int ceph_uninline_data(struct file *file);
extern int ceph_pool_perm_check(struct inode *inode, int need);
extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc);
int ceph_purge_inode_cap(struct inode *inode, struct ceph_cap *cap, bool *invalidate);
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index fcf7dfdecf96..afec84088471 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -923,10 +923,13 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value,
{
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_inode_xattr *xattr;
- struct ceph_vxattr *vxattr = NULL;
+ struct ceph_vxattr *vxattr;
int req_mask;
ssize_t err;
+ if (strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN))
+ goto handle_non_vxattrs;
+
/* let's see if a virtual xattr was requested */
vxattr = ceph_match_vxattr(inode, name);
if (vxattr) {
@@ -945,8 +948,14 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value,
err = -ERANGE;
}
return err;
+ } else {
+ err = ceph_do_getvxattr(inode, name, value, size);
+ /* this would happen with a new client and old server combo */
+ if (err == -EOPNOTSUPP)
+ err = -ENODATA;
+ return err;
}
-
+handle_non_vxattrs:
req_mask = __get_request_mask(inode);
spin_lock(&ci->i_ceph_lock);
diff --git a/fs/cifs/cifs_swn.c b/fs/cifs/cifs_swn.c
index 463ebe34892b..180c234c2f46 100644
--- a/fs/cifs/cifs_swn.c
+++ b/fs/cifs/cifs_swn.c
@@ -396,11 +396,11 @@ static int cifs_swn_resource_state_changed(struct cifs_swn_reg *swnreg, const ch
switch (state) {
case CIFS_SWN_RESOURCE_STATE_UNAVAILABLE:
cifs_dbg(FYI, "%s: resource name '%s' become unavailable\n", __func__, name);
- cifs_reconnect(swnreg->tcon->ses->server, true);
+ cifs_signal_cifsd_for_reconnect(swnreg->tcon->ses->server, true);
break;
case CIFS_SWN_RESOURCE_STATE_AVAILABLE:
cifs_dbg(FYI, "%s: resource name '%s' become available\n", __func__, name);
- cifs_reconnect(swnreg->tcon->ses->server, true);
+ cifs_signal_cifsd_for_reconnect(swnreg->tcon->ses->server, true);
break;
case CIFS_SWN_RESOURCE_STATE_UNKNOWN:
cifs_dbg(FYI, "%s: resource name '%s' changed to unknown state\n", __func__, name);
@@ -498,7 +498,7 @@ static int cifs_swn_reconnect(struct cifs_tcon *tcon, struct sockaddr_storage *a
goto unlock;
}
- cifs_reconnect(tcon->ses->server, false);
+ cifs_signal_cifsd_for_reconnect(tcon->ses->server, false);
unlock:
mutex_unlock(&tcon->ses->server->srv_mutex);
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index ee3aab3dd4ac..bf861fef2f0c 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -949,6 +949,9 @@ static void populate_new_aces(char *nacl_base,
pnntace = (struct cifs_ace *) (nacl_base + nsize);
nsize += setup_special_mode_ACE(pnntace, nmode);
num_aces++;
+ pnntace = (struct cifs_ace *) (nacl_base + nsize);
+ nsize += setup_authusers_ACE(pnntace);
+ num_aces++;
goto set_size;
}
@@ -1297,7 +1300,7 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd,
if (uid_valid(uid)) { /* chown */
uid_t id;
- nowner_sid_ptr = kmalloc(sizeof(struct cifs_sid),
+ nowner_sid_ptr = kzalloc(sizeof(struct cifs_sid),
GFP_KERNEL);
if (!nowner_sid_ptr) {
rc = -ENOMEM;
@@ -1326,7 +1329,7 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd,
}
if (gid_valid(gid)) { /* chgrp */
gid_t id;
- ngroup_sid_ptr = kmalloc(sizeof(struct cifs_sid),
+ ngroup_sid_ptr = kzalloc(sizeof(struct cifs_sid),
GFP_KERNEL);
if (!ngroup_sid_ptr) {
rc = -ENOMEM;
@@ -1613,7 +1616,7 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 *pnmode,
nsecdesclen = secdesclen;
if (pnmode && *pnmode != NO_CHANGE_64) { /* chmod */
if (mode_from_sid)
- nsecdesclen += sizeof(struct cifs_ace);
+ nsecdesclen += 2 * sizeof(struct cifs_ace);
else /* cifsacl */
nsecdesclen += 5 * sizeof(struct cifs_ace);
} else { /* chown */
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 199edac0cb59..d1211ad4e85b 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -210,6 +210,9 @@ cifs_read_super(struct super_block *sb)
if (rc)
goto out_no_root;
/* tune readahead according to rsize if readahead size not set on mount */
+ if (cifs_sb->ctx->rsize == 0)
+ cifs_sb->ctx->rsize =
+ tcon->ses->server->ops->negotiate_rsize(tcon, cifs_sb->ctx);
if (cifs_sb->ctx->rasize)
sb->s_bdi->ra_pages = cifs_sb->ctx->rasize / PAGE_SIZE;
else
@@ -254,6 +257,9 @@ static void cifs_kill_sb(struct super_block *sb)
struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
struct cifs_tcon *tcon;
struct cached_fid *cfid;
+ struct rb_root *root = &cifs_sb->tlink_tree;
+ struct rb_node *node;
+ struct tcon_link *tlink;
/*
* We ned to release all dentries for the cached directories
@@ -263,16 +269,18 @@ static void cifs_kill_sb(struct super_block *sb)
dput(cifs_sb->root);
cifs_sb->root = NULL;
}
- tcon = cifs_sb_master_tcon(cifs_sb);
- if (tcon) {
+ node = rb_first(root);
+ while (node != NULL) {
+ tlink = rb_entry(node, struct tcon_link, tl_rbnode);
+ tcon = tlink_tcon(tlink);
cfid = &tcon->crfid;
mutex_lock(&cfid->fid_mutex);
if (cfid->dentry) {
-
dput(cfid->dentry);
cfid->dentry = NULL;
}
mutex_unlock(&cfid->fid_mutex);
+ node = rb_next(node);
}
kill_anon_super(sb);
@@ -354,7 +362,7 @@ static struct inode *
cifs_alloc_inode(struct super_block *sb)
{
struct cifsInodeInfo *cifs_inode;
- cifs_inode = kmem_cache_alloc(cifs_inode_cachep, GFP_KERNEL);
+ cifs_inode = alloc_inode_sb(sb, cifs_inode_cachep, GFP_KERNEL);
if (!cifs_inode)
return NULL;
cifs_inode->cifsAttrs = 0x20; /* default */
@@ -919,6 +927,7 @@ cifs_smb3_do_mount(struct file_system_type *fs_type,
out_super:
deactivate_locked_super(sb);
+ return root;
out:
if (cifs_sb) {
kfree(cifs_sb->prepath);
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index d3701295402d..0df3b24a0bf4 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -132,6 +132,9 @@ extern int SendReceiveBlockingLock(const unsigned int xid,
struct smb_hdr *out_buf,
int *bytes_returned);
void
+cifs_signal_cifsd_for_reconnect(struct TCP_Server_Info *server,
+ bool all_channels);
+void
cifs_mark_tcp_ses_conns_for_reconnect(struct TCP_Server_Info *server,
bool mark_smb_session);
extern int cifs_reconnect(struct TCP_Server_Info *server,
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 11a22a30ee14..9964c3634322 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -162,11 +162,51 @@ static void cifs_resolve_server(struct work_struct *work)
mutex_unlock(&server->srv_mutex);
}
-/**
+/*
+ * Update the tcpStatus for the server.
+ * This is used to signal the cifsd thread to call cifs_reconnect
+ * ONLY cifsd thread should call cifs_reconnect. For any other
+ * thread, use this function
+ *
+ * @server: the tcp ses for which reconnect is needed
+ * @all_channels: if this needs to be done for all channels
+ */
+void
+cifs_signal_cifsd_for_reconnect(struct TCP_Server_Info *server,
+ bool all_channels)
+{
+ struct TCP_Server_Info *pserver;
+ struct cifs_ses *ses;
+ int i;
+
+ /* If server is a channel, select the primary channel */
+ pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server;
+
+ spin_lock(&cifs_tcp_ses_lock);
+ if (!all_channels) {
+ pserver->tcpStatus = CifsNeedReconnect;
+ spin_unlock(&cifs_tcp_ses_lock);
+ return;
+ }
+
+ list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) {
+ spin_lock(&ses->chan_lock);
+ for (i = 0; i < ses->chan_count; i++)
+ ses->chans[i].server->tcpStatus = CifsNeedReconnect;
+ spin_unlock(&ses->chan_lock);
+ }
+ spin_unlock(&cifs_tcp_ses_lock);
+}
+
+/*
* Mark all sessions and tcons for reconnect.
+ * IMPORTANT: make sure that this gets called only from
+ * cifsd thread. For any other thread, use
+ * cifs_signal_cifsd_for_reconnect
*
+ * @server: the tcp ses for which reconnect is needed
* @server needs to be previously set to CifsNeedReconnect.
- *
+ * @mark_smb_session: whether even sessions need to be marked
*/
void
cifs_mark_tcp_ses_conns_for_reconnect(struct TCP_Server_Info *server,
@@ -175,11 +215,6 @@ cifs_mark_tcp_ses_conns_for_reconnect(struct TCP_Server_Info *server,
struct TCP_Server_Info *pserver;
struct cifs_ses *ses;
struct cifs_tcon *tcon;
- struct mid_q_entry *mid, *nmid;
- struct list_head retry_list;
-
- server->maxBuf = 0;
- server->max_read = 0;
/*
* before reconnecting the tcp session, mark the smb session (uid) and the tid bad so they
@@ -219,6 +254,16 @@ next_session:
spin_unlock(&ses->chan_lock);
}
spin_unlock(&cifs_tcp_ses_lock);
+}
+
+static void
+cifs_abort_connection(struct TCP_Server_Info *server)
+{
+ struct mid_q_entry *mid, *nmid;
+ struct list_head retry_list;
+
+ server->maxBuf = 0;
+ server->max_read = 0;
/* do not want to be sending data on a socket we are freeing */
cifs_dbg(FYI, "%s: tearing down socket\n", __func__);
@@ -310,6 +355,8 @@ static int __cifs_reconnect(struct TCP_Server_Info *server,
cifs_mark_tcp_ses_conns_for_reconnect(server, mark_smb_session);
+ cifs_abort_connection(server);
+
do {
try_to_freeze();
mutex_lock(&server->srv_mutex);
@@ -434,6 +481,8 @@ reconnect_dfs_server(struct TCP_Server_Info *server,
cifs_mark_tcp_ses_conns_for_reconnect(server, mark_smb_session);
+ cifs_abort_connection(server);
+
do {
try_to_freeze();
mutex_lock(&server->srv_mutex);
@@ -639,6 +688,7 @@ cifs_readv_from_socket(struct TCP_Server_Info *server, struct msghdr *smb_msg)
if (server->tcpStatus == CifsNeedReconnect) {
spin_unlock(&cifs_tcp_ses_lock);
+ cifs_reconnect(server, false);
return -ECONNABORTED;
}
spin_unlock(&cifs_tcp_ses_lock);
@@ -1831,13 +1881,9 @@ void cifs_put_smb_ses(struct cifs_ses *ses)
int i;
for (i = 1; i < chan_count; i++) {
- /*
- * note: for now, we're okay accessing ses->chans
- * without chan_lock. But when chans can go away, we'll
- * need to introduce ref counting to make sure that chan
- * is not freed from under us.
- */
+ spin_unlock(&ses->chan_lock);
cifs_put_tcp_session(ses->chans[i].server, 0);
+ spin_lock(&ses->chan_lock);
ses->chans[i].server = NULL;
}
}
@@ -1981,6 +2027,19 @@ cifs_set_cifscreds(struct smb3_fs_context *ctx, struct cifs_ses *ses)
}
}
+ ctx->workstation_name = kstrdup(ses->workstation_name, GFP_KERNEL);
+ if (!ctx->workstation_name) {
+ cifs_dbg(FYI, "Unable to allocate memory for workstation_name\n");
+ rc = -ENOMEM;
+ kfree(ctx->username);
+ ctx->username = NULL;
+ kfree_sensitive(ctx->password);
+ ctx->password = NULL;
+ kfree(ctx->domainname);
+ ctx->domainname = NULL;
+ goto out_key_put;
+ }
+
out_key_put:
up_read(&key->sem);
key_put(key);
@@ -2331,10 +2390,19 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb3_fs_context *ctx)
if (ses->server->posix_ext_supported) {
tcon->posix_extensions = true;
pr_warn_once("SMB3.11 POSIX Extensions are experimental\n");
- } else {
+ } else if ((ses->server->vals->protocol_id == SMB311_PROT_ID) ||
+ (strcmp(ses->server->vals->version_string,
+ SMB3ANY_VERSION_STRING) == 0) ||
+ (strcmp(ses->server->vals->version_string,
+ SMBDEFAULT_VERSION_STRING) == 0)) {
cifs_dbg(VFS, "Server does not support mounting with posix SMB3.11 extensions\n");
rc = -EOPNOTSUPP;
goto out_fail;
+ } else {
+ cifs_dbg(VFS, "Check vers= mount option. SMB3.11 "
+ "disabled but required for POSIX extensions\n");
+ rc = -EOPNOTSUPP;
+ goto out_fail;
}
}
@@ -3896,7 +3964,8 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses,
/* only send once per connect */
spin_lock(&cifs_tcp_ses_lock);
- if (server->tcpStatus != CifsNeedSessSetup) {
+ if ((server->tcpStatus != CifsNeedSessSetup) &&
+ (ses->status == CifsGood)) {
spin_unlock(&cifs_tcp_ses_lock);
return 0;
}
diff --git a/fs/cifs/dfs_cache.c b/fs/cifs/dfs_cache.c
index dd9643751671..30e040da4f09 100644
--- a/fs/cifs/dfs_cache.c
+++ b/fs/cifs/dfs_cache.c
@@ -1355,7 +1355,7 @@ static void mark_for_reconnect_if_needed(struct cifs_tcon *tcon, struct dfs_cach
}
cifs_dbg(FYI, "%s: no cached or matched targets. mark dfs share for reconnect.\n", __func__);
- cifs_reconnect(tcon->ses->server, true);
+ cifs_signal_cifsd_for_reconnect(tcon->ses->server, true);
}
/* Refresh dfs referral of tcon and mark it for reconnect if needed */
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 59334be9ed3b..60f43bff7ccb 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -3740,6 +3740,11 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file,
break;
}
+ if (cifs_sb->ctx->rsize == 0)
+ cifs_sb->ctx->rsize =
+ server->ops->negotiate_rsize(tlink_tcon(open_file->tlink),
+ cifs_sb->ctx);
+
rc = server->ops->wait_mtu_credits(server, cifs_sb->ctx->rsize,
&rsize, credits);
if (rc)
@@ -4269,8 +4274,6 @@ cifs_readv_complete(struct work_struct *work)
for (i = 0; i < rdata->nr_pages; i++) {
struct page *page = rdata->pages[i];
- lru_cache_add(page);
-
if (rdata->result == 0 ||
(rdata->result == -EAGAIN && got_bytes)) {
flush_dcache_page(page);
@@ -4278,12 +4281,12 @@ cifs_readv_complete(struct work_struct *work)
} else
SetPageError(page);
- unlock_page(page);
-
if (rdata->result == 0 ||
(rdata->result == -EAGAIN && got_bytes))
cifs_readpage_to_fscache(rdata->mapping->host, page);
+ unlock_page(page);
+
got_bytes -= min_t(unsigned int, PAGE_SIZE, got_bytes);
put_page(page);
@@ -4340,7 +4343,6 @@ readpages_fill_pages(struct TCP_Server_Info *server,
* fill them until the writes are flushed.
*/
zero_user(page, 0, PAGE_SIZE);
- lru_cache_add(page);
flush_dcache_page(page);
SetPageUptodate(page);
unlock_page(page);
@@ -4350,7 +4352,6 @@ readpages_fill_pages(struct TCP_Server_Info *server,
continue;
} else {
/* no need to hold page hostage */
- lru_cache_add(page);
unlock_page(page);
put_page(page);
rdata->pages[i] = NULL;
@@ -4393,92 +4394,20 @@ cifs_readpages_copy_into_pages(struct TCP_Server_Info *server,
return readpages_fill_pages(server, rdata, iter, iter->count);
}
-static int
-readpages_get_pages(struct address_space *mapping, struct list_head *page_list,
- unsigned int rsize, struct list_head *tmplist,
- unsigned int *nr_pages, loff_t *offset, unsigned int *bytes)
+static void cifs_readahead(struct readahead_control *ractl)
{
- struct page *page, *tpage;
- unsigned int expected_index;
int rc;
- gfp_t gfp = readahead_gfp_mask(mapping);
-
- INIT_LIST_HEAD(tmplist);
-
- page = lru_to_page(page_list);
-
- /*
- * Lock the page and put it in the cache. Since no one else
- * should have access to this page, we're safe to simply set
- * PG_locked without checking it first.
- */
- __SetPageLocked(page);
- rc = add_to_page_cache_locked(page, mapping,
- page->index, gfp);
-
- /* give up if we can't stick it in the cache */
- if (rc) {
- __ClearPageLocked(page);
- return rc;
- }
-
- /* move first page to the tmplist */
- *offset = (loff_t)page->index << PAGE_SHIFT;
- *bytes = PAGE_SIZE;
- *nr_pages = 1;
- list_move_tail(&page->lru, tmplist);
-
- /* now try and add more pages onto the request */
- expected_index = page->index + 1;
- list_for_each_entry_safe_reverse(page, tpage, page_list, lru) {
- /* discontinuity ? */
- if (page->index != expected_index)
- break;
-
- /* would this page push the read over the rsize? */
- if (*bytes + PAGE_SIZE > rsize)
- break;
-
- __SetPageLocked(page);
- rc = add_to_page_cache_locked(page, mapping, page->index, gfp);
- if (rc) {
- __ClearPageLocked(page);
- break;
- }
- list_move_tail(&page->lru, tmplist);
- (*bytes) += PAGE_SIZE;
- expected_index++;
- (*nr_pages)++;
- }
- return rc;
-}
-
-static int cifs_readpages(struct file *file, struct address_space *mapping,
- struct list_head *page_list, unsigned num_pages)
-{
- int rc;
- int err = 0;
- struct list_head tmplist;
- struct cifsFileInfo *open_file = file->private_data;
- struct cifs_sb_info *cifs_sb = CIFS_FILE_SB(file);
+ struct cifsFileInfo *open_file = ractl->file->private_data;
+ struct cifs_sb_info *cifs_sb = CIFS_FILE_SB(ractl->file);
struct TCP_Server_Info *server;
pid_t pid;
- unsigned int xid;
+ unsigned int xid, nr_pages, last_batch_size = 0, cache_nr_pages = 0;
+ pgoff_t next_cached = ULONG_MAX;
+ bool caching = fscache_cookie_enabled(cifs_inode_cookie(ractl->mapping->host)) &&
+ cifs_inode_cookie(ractl->mapping->host)->cache_priv;
+ bool check_cache = caching;
xid = get_xid();
- /*
- * Reads as many pages as possible from fscache. Returns -ENOBUFS
- * immediately if the cookie is negative
- *
- * After this point, every page in the list might have PG_fscache set,
- * so we will need to clean that up off of every page we don't use.
- */
- rc = cifs_readpages_from_fscache(mapping->host, mapping, page_list,
- &num_pages);
- if (rc == 0) {
- free_xid(xid);
- return rc;
- }
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
pid = open_file->pid;
@@ -4489,39 +4418,78 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
server = cifs_pick_channel(tlink_tcon(open_file->tlink)->ses);
cifs_dbg(FYI, "%s: file=%p mapping=%p num_pages=%u\n",
- __func__, file, mapping, num_pages);
+ __func__, ractl->file, ractl->mapping, readahead_count(ractl));
/*
- * Start with the page at end of list and move it to private
- * list. Do the same with any following pages until we hit
- * the rsize limit, hit an index discontinuity, or run out of
- * pages. Issue the async read and then start the loop again
- * until the list is empty.
- *
- * Note that list order is important. The page_list is in
- * the order of declining indexes. When we put the pages in
- * the rdata->pages, then we want them in increasing order.
+ * Chop the readahead request up into rsize-sized read requests.
*/
- while (!list_empty(page_list) && !err) {
- unsigned int i, nr_pages, bytes, rsize;
- loff_t offset;
- struct page *page, *tpage;
+ while ((nr_pages = readahead_count(ractl) - last_batch_size)) {
+ unsigned int i, got, rsize;
+ struct page *page;
struct cifs_readdata *rdata;
struct cifs_credits credits_on_stack;
struct cifs_credits *credits = &credits_on_stack;
+ pgoff_t index = readahead_index(ractl) + last_batch_size;
+
+ /*
+ * Find out if we have anything cached in the range of
+ * interest, and if so, where the next chunk of cached data is.
+ */
+ if (caching) {
+ if (check_cache) {
+ rc = cifs_fscache_query_occupancy(
+ ractl->mapping->host, index, nr_pages,
+ &next_cached, &cache_nr_pages);
+ if (rc < 0)
+ caching = false;
+ check_cache = false;
+ }
+
+ if (index == next_cached) {
+ /*
+ * TODO: Send a whole batch of pages to be read
+ * by the cache.
+ */
+ page = readahead_page(ractl);
+ last_batch_size = 1 << thp_order(page);
+ if (cifs_readpage_from_fscache(ractl->mapping->host,
+ page) < 0) {
+ /*
+ * TODO: Deal with cache read failure
+ * here, but for the moment, delegate
+ * that to readpage.
+ */
+ caching = false;
+ }
+ unlock_page(page);
+ next_cached++;
+ cache_nr_pages--;
+ if (cache_nr_pages == 0)
+ check_cache = true;
+ continue;
+ }
+ }
if (open_file->invalidHandle) {
rc = cifs_reopen_file(open_file, true);
- if (rc == -EAGAIN)
- continue;
- else if (rc)
+ if (rc) {
+ if (rc == -EAGAIN)
+ continue;
break;
+ }
}
+ if (cifs_sb->ctx->rsize == 0)
+ cifs_sb->ctx->rsize =
+ server->ops->negotiate_rsize(tlink_tcon(open_file->tlink),
+ cifs_sb->ctx);
+
rc = server->ops->wait_mtu_credits(server, cifs_sb->ctx->rsize,
&rsize, credits);
if (rc)
break;
+ nr_pages = min_t(size_t, rsize / PAGE_SIZE, readahead_count(ractl));
+ nr_pages = min_t(size_t, nr_pages, next_cached - index);
/*
* Give up immediately if rsize is too small to read an entire
@@ -4529,16 +4497,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
* reach this point however since we set ra_pages to 0 when the
* rsize is smaller than a cache page.
*/
- if (unlikely(rsize < PAGE_SIZE)) {
- add_credits_and_wake_if(server, credits, 0);
- free_xid(xid);
- return 0;
- }
-
- nr_pages = 0;
- err = readpages_get_pages(mapping, page_list, rsize, &tmplist,
- &nr_pages, &offset, &bytes);
- if (!nr_pages) {
+ if (unlikely(!nr_pages)) {
add_credits_and_wake_if(server, credits, 0);
break;
}
@@ -4546,36 +4505,31 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
rdata = cifs_readdata_alloc(nr_pages, cifs_readv_complete);
if (!rdata) {
/* best to give up if we're out of mem */
- list_for_each_entry_safe(page, tpage, &tmplist, lru) {
- list_del(&page->lru);
- lru_cache_add(page);
- unlock_page(page);
- put_page(page);
- }
- rc = -ENOMEM;
add_credits_and_wake_if(server, credits, 0);
break;
}
- rdata->cfile = cifsFileInfo_get(open_file);
- rdata->server = server;
- rdata->mapping = mapping;
- rdata->offset = offset;
- rdata->bytes = bytes;
- rdata->pid = pid;
- rdata->pagesz = PAGE_SIZE;
- rdata->tailsz = PAGE_SIZE;
+ got = __readahead_batch(ractl, rdata->pages, nr_pages);
+ if (got != nr_pages) {
+ pr_warn("__readahead_batch() returned %u/%u\n",
+ got, nr_pages);
+ nr_pages = got;
+ }
+
+ rdata->nr_pages = nr_pages;
+ rdata->bytes = readahead_batch_length(ractl);
+ rdata->cfile = cifsFileInfo_get(open_file);
+ rdata->server = server;
+ rdata->mapping = ractl->mapping;
+ rdata->offset = readahead_pos(ractl);
+ rdata->pid = pid;
+ rdata->pagesz = PAGE_SIZE;
+ rdata->tailsz = PAGE_SIZE;
rdata->read_into_pages = cifs_readpages_read_into_pages;
rdata->copy_into_pages = cifs_readpages_copy_into_pages;
- rdata->credits = credits_on_stack;
-
- list_for_each_entry_safe(page, tpage, &tmplist, lru) {
- list_del(&page->lru);
- rdata->pages[rdata->nr_pages++] = page;
- }
+ rdata->credits = credits_on_stack;
rc = adjust_credits(server, &rdata->credits, rdata->bytes);
-
if (!rc) {
if (rdata->cfile->invalidHandle)
rc = -EAGAIN;
@@ -4587,7 +4541,6 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
add_credits_and_wake_if(server, &rdata->credits, 0);
for (i = 0; i < rdata->nr_pages; i++) {
page = rdata->pages[i];
- lru_cache_add(page);
unlock_page(page);
put_page(page);
}
@@ -4597,10 +4550,10 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
}
kref_put(&rdata->refcount, cifs_readdata_release);
+ last_batch_size = nr_pages;
}
free_xid(xid);
- return rc;
}
/*
@@ -4811,17 +4764,17 @@ static int cifs_release_page(struct page *page, gfp_t gfp)
return true;
}
-static void cifs_invalidate_page(struct page *page, unsigned int offset,
- unsigned int length)
+static void cifs_invalidate_folio(struct folio *folio, size_t offset,
+ size_t length)
{
- wait_on_page_fscache(page);
+ folio_wait_fscache(folio);
}
-static int cifs_launder_page(struct page *page)
+static int cifs_launder_folio(struct folio *folio)
{
int rc = 0;
- loff_t range_start = page_offset(page);
- loff_t range_end = range_start + (loff_t)(PAGE_SIZE - 1);
+ loff_t range_start = folio_pos(folio);
+ loff_t range_end = range_start + folio_size(folio);
struct writeback_control wbc = {
.sync_mode = WB_SYNC_ALL,
.nr_to_write = 0,
@@ -4829,12 +4782,12 @@ static int cifs_launder_page(struct page *page)
.range_end = range_end,
};
- cifs_dbg(FYI, "Launder page: %p\n", page);
+ cifs_dbg(FYI, "Launder page: %lu\n", folio->index);
- if (clear_page_dirty_for_io(page))
- rc = cifs_writepage_locked(page, &wbc);
+ if (folio_clear_dirty_for_io(folio))
+ rc = cifs_writepage_locked(&folio->page, &wbc);
- wait_on_page_fscache(page);
+ folio_wait_fscache(folio);
return rc;
}
@@ -4924,7 +4877,7 @@ oplock_break_done:
* In the non-cached mode (mount with cache=none), we shunt off direct read and write requests
* so this method should never be called.
*
- * Direct IO is not yet supported in the cached mode.
+ * Direct IO is not yet supported in the cached mode.
*/
static ssize_t
cifs_direct_io(struct kiocb *iocb, struct iov_iter *iter)
@@ -4996,26 +4949,27 @@ static void cifs_swap_deactivate(struct file *file)
* need to pin the cache object to write back to.
*/
#ifdef CONFIG_CIFS_FSCACHE
-static int cifs_set_page_dirty(struct page *page)
+static bool cifs_dirty_folio(struct address_space *mapping, struct folio *folio)
{
- return fscache_set_page_dirty(page, cifs_inode_cookie(page->mapping->host));
+ return fscache_dirty_folio(mapping, folio,
+ cifs_inode_cookie(mapping->host));
}
#else
-#define cifs_set_page_dirty __set_page_dirty_nobuffers
+#define cifs_dirty_folio filemap_dirty_folio
#endif
const struct address_space_operations cifs_addr_ops = {
.readpage = cifs_readpage,
- .readpages = cifs_readpages,
+ .readahead = cifs_readahead,
.writepage = cifs_writepage,
.writepages = cifs_writepages,
.write_begin = cifs_write_begin,
.write_end = cifs_write_end,
- .set_page_dirty = cifs_set_page_dirty,
+ .dirty_folio = cifs_dirty_folio,
.releasepage = cifs_release_page,
.direct_IO = cifs_direct_io,
- .invalidatepage = cifs_invalidate_page,
- .launder_page = cifs_launder_page,
+ .invalidate_folio = cifs_invalidate_folio,
+ .launder_folio = cifs_launder_folio,
/*
* TODO: investigate and if useful we could add an cifs_migratePage
* helper (under an CONFIG_MIGRATION) in the future, and also
@@ -5036,8 +4990,8 @@ const struct address_space_operations cifs_addr_ops_smallbuf = {
.writepages = cifs_writepages,
.write_begin = cifs_write_begin,
.write_end = cifs_write_end,
- .set_page_dirty = cifs_set_page_dirty,
+ .dirty_folio = cifs_dirty_folio,
.releasepage = cifs_release_page,
- .invalidatepage = cifs_invalidate_page,
- .launder_page = cifs_launder_page,
+ .invalidate_folio = cifs_invalidate_folio,
+ .launder_folio = cifs_launder_folio,
};
diff --git a/fs/cifs/fs_context.c b/fs/cifs/fs_context.c
index 7ec35f3f0a5f..a92e9eec521f 100644
--- a/fs/cifs/fs_context.c
+++ b/fs/cifs/fs_context.c
@@ -149,7 +149,7 @@ const struct fs_parameter_spec smb3_fs_parameters[] = {
fsparam_u32("echo_interval", Opt_echo_interval),
fsparam_u32("max_credits", Opt_max_credits),
fsparam_u32("handletimeout", Opt_handletimeout),
- fsparam_u32("snapshot", Opt_snapshot),
+ fsparam_u64("snapshot", Opt_snapshot),
fsparam_u32("max_channels", Opt_max_channels),
/* Mount options which take string value */
@@ -1078,7 +1078,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
ctx->echo_interval = result.uint_32;
break;
case Opt_snapshot:
- ctx->snapshot_time = result.uint_32;
+ ctx->snapshot_time = result.uint_64;
break;
case Opt_max_credits:
if (result.uint_32 < 20 || result.uint_32 > 60000) {
diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c
index efaac4d5ff55..33af72e0ac0c 100644
--- a/fs/cifs/fscache.c
+++ b/fs/cifs/fscache.c
@@ -134,37 +134,127 @@ void cifs_fscache_release_inode_cookie(struct inode *inode)
}
}
+static inline void fscache_end_operation(struct netfs_cache_resources *cres)
+{
+ const struct netfs_cache_ops *ops = fscache_operation_valid(cres);
+
+ if (ops)
+ ops->end_operation(cres);
+}
+
/*
- * Retrieve a page from FS-Cache
+ * Fallback page reading interface.
*/
-int __cifs_readpage_from_fscache(struct inode *inode, struct page *page)
+static int fscache_fallback_read_page(struct inode *inode, struct page *page)
{
- cifs_dbg(FYI, "%s: (fsc:%p, p:%p, i:0x%p\n",
- __func__, CIFS_I(inode)->fscache, page, inode);
- return -ENOBUFS; // Needs conversion to using netfslib
+ struct netfs_cache_resources cres;
+ struct fscache_cookie *cookie = cifs_inode_cookie(inode);
+ struct iov_iter iter;
+ struct bio_vec bvec[1];
+ int ret;
+
+ memset(&cres, 0, sizeof(cres));
+ bvec[0].bv_page = page;
+ bvec[0].bv_offset = 0;
+ bvec[0].bv_len = PAGE_SIZE;
+ iov_iter_bvec(&iter, READ, bvec, ARRAY_SIZE(bvec), PAGE_SIZE);
+
+ ret = fscache_begin_read_operation(&cres, cookie);
+ if (ret < 0)
+ return ret;
+
+ ret = fscache_read(&cres, page_offset(page), &iter, NETFS_READ_HOLE_FAIL,
+ NULL, NULL);
+ fscache_end_operation(&cres);
+ return ret;
}
/*
- * Retrieve a set of pages from FS-Cache
+ * Fallback page writing interface.
*/
-int __cifs_readpages_from_fscache(struct inode *inode,
- struct address_space *mapping,
- struct list_head *pages,
- unsigned *nr_pages)
+static int fscache_fallback_write_page(struct inode *inode, struct page *page,
+ bool no_space_allocated_yet)
{
- cifs_dbg(FYI, "%s: (0x%p/%u/0x%p)\n",
- __func__, CIFS_I(inode)->fscache, *nr_pages, inode);
- return -ENOBUFS; // Needs conversion to using netfslib
+ struct netfs_cache_resources cres;
+ struct fscache_cookie *cookie = cifs_inode_cookie(inode);
+ struct iov_iter iter;
+ struct bio_vec bvec[1];
+ loff_t start = page_offset(page);
+ size_t len = PAGE_SIZE;
+ int ret;
+
+ memset(&cres, 0, sizeof(cres));
+ bvec[0].bv_page = page;
+ bvec[0].bv_offset = 0;
+ bvec[0].bv_len = PAGE_SIZE;
+ iov_iter_bvec(&iter, WRITE, bvec, ARRAY_SIZE(bvec), PAGE_SIZE);
+
+ ret = fscache_begin_write_operation(&cres, cookie);
+ if (ret < 0)
+ return ret;
+
+ ret = cres.ops->prepare_write(&cres, &start, &len, i_size_read(inode),
+ no_space_allocated_yet);
+ if (ret == 0)
+ ret = fscache_write(&cres, page_offset(page), &iter, NULL, NULL);
+ fscache_end_operation(&cres);
+ return ret;
}
-void __cifs_readpage_to_fscache(struct inode *inode, struct page *page)
+/*
+ * Retrieve a page from FS-Cache
+ */
+int __cifs_readpage_from_fscache(struct inode *inode, struct page *page)
{
- struct cifsInodeInfo *cifsi = CIFS_I(inode);
+ int ret;
- WARN_ON(!cifsi->fscache);
+ cifs_dbg(FYI, "%s: (fsc:%p, p:%p, i:0x%p\n",
+ __func__, cifs_inode_cookie(inode), page, inode);
+ ret = fscache_fallback_read_page(inode, page);
+ if (ret < 0)
+ return ret;
+
+ /* Read completed synchronously */
+ SetPageUptodate(page);
+ return 0;
+}
+
+void __cifs_readpage_to_fscache(struct inode *inode, struct page *page)
+{
cifs_dbg(FYI, "%s: (fsc: %p, p: %p, i: %p)\n",
- __func__, cifsi->fscache, page, inode);
+ __func__, cifs_inode_cookie(inode), page, inode);
+
+ fscache_fallback_write_page(inode, page, true);
+}
+
+/*
+ * Query the cache occupancy.
+ */
+int __cifs_fscache_query_occupancy(struct inode *inode,
+ pgoff_t first, unsigned int nr_pages,
+ pgoff_t *_data_first,
+ unsigned int *_data_nr_pages)
+{
+ struct netfs_cache_resources cres;
+ struct fscache_cookie *cookie = cifs_inode_cookie(inode);
+ loff_t start, data_start;
+ size_t len, data_len;
+ int ret;
- // Needs conversion to using netfslib
+ ret = fscache_begin_read_operation(&cres, cookie);
+ if (ret < 0)
+ return ret;
+
+ start = first * PAGE_SIZE;
+ len = nr_pages * PAGE_SIZE;
+ ret = cres.ops->query_occupancy(&cres, start, len, PAGE_SIZE,
+ &data_start, &data_len);
+ if (ret == 0) {
+ *_data_first = data_start / PAGE_SIZE;
+ *_data_nr_pages = len / PAGE_SIZE;
+ }
+
+ fscache_end_operation(&cres);
+ return ret;
}
diff --git a/fs/cifs/fscache.h b/fs/cifs/fscache.h
index c6ca49ac33d4..55129908e2c1 100644
--- a/fs/cifs/fscache.h
+++ b/fs/cifs/fscache.h
@@ -9,6 +9,7 @@
#ifndef _CIFS_FSCACHE_H
#define _CIFS_FSCACHE_H
+#include <linux/swap.h>
#include <linux/fscache.h>
#include "cifsglob.h"
@@ -58,14 +59,6 @@ void cifs_fscache_fill_coherency(struct inode *inode,
}
-extern int cifs_fscache_release_page(struct page *page, gfp_t gfp);
-extern int __cifs_readpage_from_fscache(struct inode *, struct page *);
-extern int __cifs_readpages_from_fscache(struct inode *,
- struct address_space *,
- struct list_head *,
- unsigned *);
-extern void __cifs_readpage_to_fscache(struct inode *, struct page *);
-
static inline struct fscache_cookie *cifs_inode_cookie(struct inode *inode)
{
return CIFS_I(inode)->fscache;
@@ -80,33 +73,52 @@ static inline void cifs_invalidate_cache(struct inode *inode, unsigned int flags
i_size_read(inode), flags);
}
-static inline int cifs_readpage_from_fscache(struct inode *inode,
- struct page *page)
-{
- if (CIFS_I(inode)->fscache)
- return __cifs_readpage_from_fscache(inode, page);
+extern int __cifs_fscache_query_occupancy(struct inode *inode,
+ pgoff_t first, unsigned int nr_pages,
+ pgoff_t *_data_first,
+ unsigned int *_data_nr_pages);
- return -ENOBUFS;
+static inline int cifs_fscache_query_occupancy(struct inode *inode,
+ pgoff_t first, unsigned int nr_pages,
+ pgoff_t *_data_first,
+ unsigned int *_data_nr_pages)
+{
+ if (!cifs_inode_cookie(inode))
+ return -ENOBUFS;
+ return __cifs_fscache_query_occupancy(inode, first, nr_pages,
+ _data_first, _data_nr_pages);
}
-static inline int cifs_readpages_from_fscache(struct inode *inode,
- struct address_space *mapping,
- struct list_head *pages,
- unsigned *nr_pages)
+extern int __cifs_readpage_from_fscache(struct inode *pinode, struct page *ppage);
+extern void __cifs_readpage_to_fscache(struct inode *pinode, struct page *ppage);
+
+
+static inline int cifs_readpage_from_fscache(struct inode *inode,
+ struct page *page)
{
- if (CIFS_I(inode)->fscache)
- return __cifs_readpages_from_fscache(inode, mapping, pages,
- nr_pages);
+ if (cifs_inode_cookie(inode))
+ return __cifs_readpage_from_fscache(inode, page);
return -ENOBUFS;
}
static inline void cifs_readpage_to_fscache(struct inode *inode,
struct page *page)
{
- if (PageFsCache(page))
+ if (cifs_inode_cookie(inode))
__cifs_readpage_to_fscache(inode, page);
}
+static inline int cifs_fscache_release_page(struct page *page, gfp_t gfp)
+{
+ if (PageFsCache(page)) {
+ if (current_is_kswapd() || !(gfp & __GFP_FS))
+ return false;
+ wait_on_page_fscache(page);
+ fscache_note_page_release(cifs_inode_cookie(page->mapping->host));
+ }
+ return true;
+}
+
#else /* CONFIG_CIFS_FSCACHE */
static inline
void cifs_fscache_fill_coherency(struct inode *inode,
@@ -123,22 +135,29 @@ static inline void cifs_fscache_unuse_inode_cookie(struct inode *inode, bool upd
static inline struct fscache_cookie *cifs_inode_cookie(struct inode *inode) { return NULL; }
static inline void cifs_invalidate_cache(struct inode *inode, unsigned int flags) {}
-static inline int
-cifs_readpage_from_fscache(struct inode *inode, struct page *page)
+static inline int cifs_fscache_query_occupancy(struct inode *inode,
+ pgoff_t first, unsigned int nr_pages,
+ pgoff_t *_data_first,
+ unsigned int *_data_nr_pages)
{
+ *_data_first = ULONG_MAX;
+ *_data_nr_pages = 0;
return -ENOBUFS;
}
-static inline int cifs_readpages_from_fscache(struct inode *inode,
- struct address_space *mapping,
- struct list_head *pages,
- unsigned *nr_pages)
+static inline int
+cifs_readpage_from_fscache(struct inode *inode, struct page *page)
{
return -ENOBUFS;
}
-static inline void cifs_readpage_to_fscache(struct inode *inode,
- struct page *page) {}
+static inline
+void cifs_readpage_to_fscache(struct inode *inode, struct page *page) {}
+
+static inline int nfs_fscache_release_page(struct page *page, gfp_t gfp)
+{
+ return true; /* May release page */
+}
#endif /* CONFIG_CIFS_FSCACHE */
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 7d8b3ceb2af3..60d853c92f6a 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -83,6 +83,7 @@ static void cifs_set_ops(struct inode *inode)
static void
cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr)
{
+ struct cifs_fscache_inode_coherency_data cd;
struct cifsInodeInfo *cifs_i = CIFS_I(inode);
cifs_dbg(FYI, "%s: revalidating inode %llu\n",
@@ -113,6 +114,9 @@ cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr)
cifs_dbg(FYI, "%s: invalidating inode %llu mapping\n",
__func__, cifs_i->uniqueid);
set_bit(CIFS_INO_INVALID_MAPPING, &cifs_i->flags);
+ /* Invalidate fscache cookie */
+ cifs_fscache_fill_coherency(&cifs_i->vfs_inode, &cd);
+ fscache_invalidate(cifs_inode_cookie(inode), &cd, i_size_read(inode), 0);
}
/*
@@ -2261,8 +2265,6 @@ cifs_dentry_needs_reval(struct dentry *dentry)
int
cifs_invalidate_mapping(struct inode *inode)
{
- struct cifs_fscache_inode_coherency_data cd;
- struct cifsInodeInfo *cifsi = CIFS_I(inode);
int rc = 0;
if (inode->i_mapping && inode->i_mapping->nrpages != 0) {
@@ -2272,8 +2274,6 @@ cifs_invalidate_mapping(struct inode *inode)
__func__, inode);
}
- cifs_fscache_fill_coherency(&cifsi->vfs_inode, &cd);
- fscache_invalidate(cifs_inode_cookie(inode), &cd, i_size_read(inode), 0);
return rc;
}
diff --git a/fs/cifs/ntlmssp.h b/fs/cifs/ntlmssp.h
index 298458404252..55758b9ec877 100644
--- a/fs/cifs/ntlmssp.h
+++ b/fs/cifs/ntlmssp.h
@@ -107,7 +107,7 @@ struct negotiate_message {
SECURITY_BUFFER WorkstationName; /* RFC 1001 and ASCII */
struct ntlmssp_version Version;
/* SECURITY_BUFFER */
- char DomainString[0];
+ char DomainString[];
/* followed by WorkstationString */
} __packed;
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index dc3b16d1be09..32f478c7a66d 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -127,11 +127,6 @@ int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses)
struct cifs_server_iface *ifaces = NULL;
size_t iface_count;
- if (ses->server->dialect < SMB30_PROT_ID) {
- cifs_dbg(VFS, "multichannel is not supported on this protocol version, use 3.0 or above\n");
- return 0;
- }
-
spin_lock(&ses->chan_lock);
new_chan_count = old_chan_count = ses->chan_count;
@@ -145,6 +140,12 @@ int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses)
return 0;
}
+ if (ses->server->dialect < SMB30_PROT_ID) {
+ spin_unlock(&ses->chan_lock);
+ cifs_dbg(VFS, "multichannel is not supported on this protocol version, use 3.0 or above\n");
+ return 0;
+ }
+
if (!(ses->server->capabilities & SMB2_GLOBAL_CAP_MULTI_CHANNEL)) {
ses->chan_max = 1;
spin_unlock(&ses->chan_lock);
@@ -713,7 +714,11 @@ static int size_of_ntlmssp_blob(struct cifs_ses *ses, int base_size)
else
sz += sizeof(__le16);
- sz += sizeof(__le16) * strnlen(ses->workstation_name, CIFS_MAX_WORKSTATION_LEN);
+ if (ses->workstation_name)
+ sz += sizeof(__le16) * strnlen(ses->workstation_name,
+ CIFS_MAX_WORKSTATION_LEN);
+ else
+ sz += sizeof(__le16);
return sz;
}
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
index 8272c91e15ef..c71c9a44bef4 100644
--- a/fs/cifs/smb1ops.c
+++ b/fs/cifs/smb1ops.c
@@ -228,9 +228,7 @@ cifs_get_next_mid(struct TCP_Server_Info *server)
spin_unlock(&GlobalMid_Lock);
if (reconnect) {
- spin_lock(&cifs_tcp_ses_lock);
- server->tcpStatus = CifsNeedReconnect;
- spin_unlock(&cifs_tcp_ses_lock);
+ cifs_signal_cifsd_for_reconnect(server, false);
}
return mid;
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index af5d0830bc8a..891b11576e55 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -25,6 +25,7 @@
#include "smb2glob.h"
#include "cifs_ioctl.h"
#include "smbdirect.h"
+#include "fscache.h"
#include "fs_context.h"
/* Change credits for different ops and return the total number of credits */
@@ -3887,29 +3888,38 @@ static long smb3_collapse_range(struct file *file, struct cifs_tcon *tcon,
{
int rc;
unsigned int xid;
+ struct inode *inode;
struct cifsFileInfo *cfile = file->private_data;
+ struct cifsInodeInfo *cifsi;
__le64 eof;
xid = get_xid();
- if (off >= i_size_read(file->f_inode) ||
- off + len >= i_size_read(file->f_inode)) {
+ inode = d_inode(cfile->dentry);
+ cifsi = CIFS_I(inode);
+
+ if (off >= i_size_read(inode) ||
+ off + len >= i_size_read(inode)) {
rc = -EINVAL;
goto out;
}
rc = smb2_copychunk_range(xid, cfile, cfile, off + len,
- i_size_read(file->f_inode) - off - len, off);
+ i_size_read(inode) - off - len, off);
if (rc < 0)
goto out;
- eof = cpu_to_le64(i_size_read(file->f_inode) - len);
+ eof = cpu_to_le64(i_size_read(inode) - len);
rc = SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid,
cfile->fid.volatile_fid, cfile->pid, &eof);
if (rc < 0)
goto out;
rc = 0;
+
+ cifsi->server_eof = i_size_read(inode) - len;
+ truncate_setsize(inode, cifsi->server_eof);
+ fscache_resize_cookie(cifs_inode_cookie(inode), cifsi->server_eof);
out:
free_xid(xid);
return rc;
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 8540f7c13eae..eeb1a699bd6f 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -430,10 +430,7 @@ unmask:
* be taken as the remainder of this one. We need to kill the
* socket so the server throws away the partial SMB
*/
- spin_lock(&cifs_tcp_ses_lock);
- if (server->tcpStatus != CifsExiting)
- server->tcpStatus = CifsNeedReconnect;
- spin_unlock(&cifs_tcp_ses_lock);
+ cifs_signal_cifsd_for_reconnect(server, false);
trace_smb3_partial_send_reconnect(server->CurrentMid,
server->conn_id, server->hostname);
}
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index 7d8b72d67c80..9d486fbbfbbd 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -175,11 +175,13 @@ static int cifs_xattr_set(const struct xattr_handler *handler,
switch (handler->flags) {
case XATTR_CIFS_NTSD_FULL:
aclflags = (CIFS_ACL_OWNER |
+ CIFS_ACL_GROUP |
CIFS_ACL_DACL |
CIFS_ACL_SACL);
break;
case XATTR_CIFS_NTSD:
aclflags = (CIFS_ACL_OWNER |
+ CIFS_ACL_GROUP |
CIFS_ACL_DACL);
break;
case XATTR_CIFS_ACL:
diff --git a/fs/coda/file.c b/fs/coda/file.c
index 29dd87be2fb8..3f3c81e6b1ab 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -14,6 +14,7 @@
#include <linux/time.h>
#include <linux/file.h>
#include <linux/fs.h>
+#include <linux/pagemap.h>
#include <linux/stat.h>
#include <linux/cred.h>
#include <linux/errno.h>
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index d9f1bd7153df..2185328b65c7 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -43,7 +43,7 @@ static struct kmem_cache * coda_inode_cachep;
static struct inode *coda_alloc_inode(struct super_block *sb)
{
struct coda_inode_info *ei;
- ei = kmem_cache_alloc(coda_inode_cachep, GFP_KERNEL);
+ ei = alloc_inode_sb(sb, coda_inode_cachep, GFP_KERNEL);
if (!ei)
return NULL;
memset(&ei->c_fid, 0, sizeof(struct CodaFid));
diff --git a/fs/compat_binfmt_elf.c b/fs/compat_binfmt_elf.c
index 95e72d271b95..8f0af4f62631 100644
--- a/fs/compat_binfmt_elf.c
+++ b/fs/compat_binfmt_elf.c
@@ -135,6 +135,8 @@
#define elf_format compat_elf_format
#define init_elf_binfmt init_compat_elf_binfmt
#define exit_elf_binfmt exit_compat_elf_binfmt
+#define binfmt_elf_test_cases compat_binfmt_elf_test_cases
+#define binfmt_elf_test_suite compat_binfmt_elf_test_suite
/*
* We share all the actual code with the native (64-bit) version.
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index d3cd2a94d1e8..d1f9d2632202 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -34,6 +34,14 @@
*/
DEFINE_SPINLOCK(configfs_dirent_lock);
+/*
+ * All of link_obj/unlink_obj/link_group/unlink_group require that
+ * subsys->su_mutex is held.
+ * But parent configfs_subsystem is NULL when config_item is root.
+ * Use this mutex when config_item is root.
+ */
+static DEFINE_MUTEX(configfs_subsystem_mutex);
+
static void configfs_d_iput(struct dentry * dentry,
struct inode * inode)
{
@@ -1859,7 +1867,9 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys)
group->cg_item.ci_name = group->cg_item.ci_namebuf;
sd = root->d_fsdata;
+ mutex_lock(&configfs_subsystem_mutex);
link_group(to_config_group(sd->s_element), group);
+ mutex_unlock(&configfs_subsystem_mutex);
inode_lock_nested(d_inode(root), I_MUTEX_PARENT);
@@ -1884,7 +1894,9 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys)
inode_unlock(d_inode(root));
if (err) {
+ mutex_lock(&configfs_subsystem_mutex);
unlink_group(group);
+ mutex_unlock(&configfs_subsystem_mutex);
configfs_release_fs();
}
put_fragment(frag);
@@ -1931,7 +1943,9 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys)
dput(dentry);
+ mutex_lock(&configfs_subsystem_mutex);
unlink_group(group);
+ mutex_unlock(&configfs_subsystem_mutex);
configfs_release_fs();
}
diff --git a/fs/coredump.c b/fs/coredump.c
index 1c060c0a2d72..7ed7d601e5e0 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -42,6 +42,7 @@
#include <linux/path.h>
#include <linux/timekeeping.h>
#include <linux/sysctl.h>
+#include <linux/elf.h>
#include <linux/uaccess.h>
#include <asm/mmu_context.h>
@@ -53,6 +54,9 @@
#include <trace/events/sched.h>
+static bool dump_vma_snapshot(struct coredump_params *cprm);
+static void free_vma_snapshot(struct coredump_params *cprm);
+
static int core_uses_pid;
static unsigned int core_pipe_limit;
static char core_pattern[CORENAME_MAX_SIZE] = "core";
@@ -531,6 +535,7 @@ void do_coredump(const kernel_siginfo_t *siginfo)
* by any locks.
*/
.mm_flags = mm->flags,
+ .vma_meta = NULL,
};
audit_core_dumps(siginfo->si_signo);
@@ -745,6 +750,9 @@ void do_coredump(const kernel_siginfo_t *siginfo)
pr_info("Core dump to |%s disabled\n", cn.corename);
goto close_fail;
}
+ if (!dump_vma_snapshot(&cprm))
+ goto close_fail;
+
file_start_write(cprm.file);
core_dumped = binfmt->core_dump(&cprm);
/*
@@ -758,6 +766,7 @@ void do_coredump(const kernel_siginfo_t *siginfo)
dump_emit(&cprm, "", 1);
}
file_end_write(cprm.file);
+ free_vma_snapshot(&cprm);
}
if (ispipe && core_pipe_limit)
wait_for_dump_helpers(cprm.file);
@@ -980,6 +989,8 @@ static bool always_dump_vma(struct vm_area_struct *vma)
return false;
}
+#define DUMP_SIZE_MAYBE_ELFHDR_PLACEHOLDER 1
+
/*
* Decide how much of @vma's contents should be included in a core dump.
*/
@@ -1039,9 +1050,20 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma,
* dump the first page to aid in determining what was mapped here.
*/
if (FILTER(ELF_HEADERS) &&
- vma->vm_pgoff == 0 && (vma->vm_flags & VM_READ) &&
- (READ_ONCE(file_inode(vma->vm_file)->i_mode) & 0111) != 0)
- return PAGE_SIZE;
+ vma->vm_pgoff == 0 && (vma->vm_flags & VM_READ)) {
+ if ((READ_ONCE(file_inode(vma->vm_file)->i_mode) & 0111) != 0)
+ return PAGE_SIZE;
+
+ /*
+ * ELF libraries aren't always executable.
+ * We'll want to check whether the mapping starts with the ELF
+ * magic, but not now - we're holding the mmap lock,
+ * so copy_from_user() doesn't work here.
+ * Use a placeholder instead, and fix it up later in
+ * dump_vma_snapshot().
+ */
+ return DUMP_SIZE_MAYBE_ELFHDR_PLACEHOLDER;
+ }
#undef FILTER
@@ -1078,18 +1100,29 @@ static struct vm_area_struct *next_vma(struct vm_area_struct *this_vma,
return gate_vma;
}
+static void free_vma_snapshot(struct coredump_params *cprm)
+{
+ if (cprm->vma_meta) {
+ int i;
+ for (i = 0; i < cprm->vma_count; i++) {
+ struct file *file = cprm->vma_meta[i].file;
+ if (file)
+ fput(file);
+ }
+ kvfree(cprm->vma_meta);
+ cprm->vma_meta = NULL;
+ }
+}
+
/*
* Under the mmap_lock, take a snapshot of relevant information about the task's
* VMAs.
*/
-int dump_vma_snapshot(struct coredump_params *cprm, int *vma_count,
- struct core_vma_metadata **vma_meta,
- size_t *vma_data_size_ptr)
+static bool dump_vma_snapshot(struct coredump_params *cprm)
{
struct vm_area_struct *vma, *gate_vma;
struct mm_struct *mm = current->mm;
int i;
- size_t vma_data_size = 0;
/*
* Once the stack expansion code is fixed to not change VMA bounds
@@ -1097,36 +1130,51 @@ int dump_vma_snapshot(struct coredump_params *cprm, int *vma_count,
* mmap_lock in read mode.
*/
if (mmap_write_lock_killable(mm))
- return -EINTR;
+ return false;
+ cprm->vma_data_size = 0;
gate_vma = get_gate_vma(mm);
- *vma_count = mm->map_count + (gate_vma ? 1 : 0);
+ cprm->vma_count = mm->map_count + (gate_vma ? 1 : 0);
- *vma_meta = kvmalloc_array(*vma_count, sizeof(**vma_meta), GFP_KERNEL);
- if (!*vma_meta) {
+ cprm->vma_meta = kvmalloc_array(cprm->vma_count, sizeof(*cprm->vma_meta), GFP_KERNEL);
+ if (!cprm->vma_meta) {
mmap_write_unlock(mm);
- return -ENOMEM;
+ return false;
}
for (i = 0, vma = first_vma(current, gate_vma); vma != NULL;
vma = next_vma(vma, gate_vma), i++) {
- struct core_vma_metadata *m = (*vma_meta) + i;
+ struct core_vma_metadata *m = cprm->vma_meta + i;
m->start = vma->vm_start;
m->end = vma->vm_end;
m->flags = vma->vm_flags;
m->dump_size = vma_dump_size(vma, cprm->mm_flags);
+ m->pgoff = vma->vm_pgoff;
- vma_data_size += m->dump_size;
+ m->file = vma->vm_file;
+ if (m->file)
+ get_file(m->file);
}
mmap_write_unlock(mm);
- if (WARN_ON(i != *vma_count)) {
- kvfree(*vma_meta);
- return -EFAULT;
+ for (i = 0; i < cprm->vma_count; i++) {
+ struct core_vma_metadata *m = cprm->vma_meta + i;
+
+ if (m->dump_size == DUMP_SIZE_MAYBE_ELFHDR_PLACEHOLDER) {
+ char elfmag[SELFMAG];
+
+ if (copy_from_user(elfmag, (void __user *)m->start, SELFMAG) ||
+ memcmp(elfmag, ELFMAG, SELFMAG) != 0) {
+ m->dump_size = 0;
+ } else {
+ m->dump_size = PAGE_SIZE;
+ }
+ }
+
+ cprm->vma_data_size += m->dump_size;
}
- *vma_data_size_ptr = vma_data_size;
- return 0;
+ return true;
}
diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c
index 4ef3f714046a..4fcca79f39ae 100644
--- a/fs/crypto/crypto.c
+++ b/fs/crypto/crypto.c
@@ -69,6 +69,14 @@ void fscrypt_free_bounce_page(struct page *bounce_page)
}
EXPORT_SYMBOL(fscrypt_free_bounce_page);
+/*
+ * Generate the IV for the given logical block number within the given file.
+ * For filenames encryption, lblk_num == 0.
+ *
+ * Keep this in sync with fscrypt_limit_io_blocks(). fscrypt_limit_io_blocks()
+ * needs to know about any IV generation methods where the low bits of IV don't
+ * simply contain the lblk_num (e.g., IV_INO_LBLK_32).
+ */
void fscrypt_generate_iv(union fscrypt_iv *iv, u64 lblk_num,
const struct fscrypt_info *ci)
{
diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c
index c57bebfa48fe..93c2ca858092 100644
--- a/fs/crypto/inline_crypt.c
+++ b/fs/crypto/inline_crypt.c
@@ -17,6 +17,7 @@
#include <linux/buffer_head.h>
#include <linux/sched/mm.h>
#include <linux/slab.h>
+#include <linux/uio.h>
#include "fscrypt_private.h"
@@ -315,6 +316,10 @@ EXPORT_SYMBOL_GPL(fscrypt_set_bio_crypt_ctx_bh);
*
* fscrypt_set_bio_crypt_ctx() must have already been called on the bio.
*
+ * This function isn't required in cases where crypto-mergeability is ensured in
+ * another way, such as I/O targeting only a single file (and thus a single key)
+ * combined with fscrypt_limit_io_blocks() to ensure DUN contiguity.
+ *
* Return: true iff the I/O is mergeable
*/
bool fscrypt_mergeable_bio(struct bio *bio, const struct inode *inode,
@@ -363,3 +368,91 @@ bool fscrypt_mergeable_bio_bh(struct bio *bio,
return fscrypt_mergeable_bio(bio, inode, next_lblk);
}
EXPORT_SYMBOL_GPL(fscrypt_mergeable_bio_bh);
+
+/**
+ * fscrypt_dio_supported() - check whether a DIO (direct I/O) request is
+ * supported as far as encryption is concerned
+ * @iocb: the file and position the I/O is targeting
+ * @iter: the I/O data segment(s)
+ *
+ * Return: %true if there are no encryption constraints that prevent DIO from
+ * being supported; %false if DIO is unsupported. (Note that in the
+ * %true case, the filesystem might have other, non-encryption-related
+ * constraints that prevent DIO from actually being supported.)
+ */
+bool fscrypt_dio_supported(struct kiocb *iocb, struct iov_iter *iter)
+{
+ const struct inode *inode = file_inode(iocb->ki_filp);
+ const unsigned int blocksize = i_blocksize(inode);
+
+ /* If the file is unencrypted, no veto from us. */
+ if (!fscrypt_needs_contents_encryption(inode))
+ return true;
+
+ /* We only support DIO with inline crypto, not fs-layer crypto. */
+ if (!fscrypt_inode_uses_inline_crypto(inode))
+ return false;
+
+ /*
+ * Since the granularity of encryption is filesystem blocks, the file
+ * position and total I/O length must be aligned to the filesystem block
+ * size -- not just to the block device's logical block size as is
+ * traditionally the case for DIO on many filesystems.
+ *
+ * We require that the user-provided memory buffers be filesystem block
+ * aligned too. It is simpler to have a single alignment value required
+ * for all properties of the I/O, as is normally the case for DIO.
+ * Also, allowing less aligned buffers would imply that data units could
+ * cross bvecs, which would greatly complicate the I/O stack, which
+ * assumes that bios can be split at any bvec boundary.
+ */
+ if (!IS_ALIGNED(iocb->ki_pos | iov_iter_alignment(iter), blocksize))
+ return false;
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(fscrypt_dio_supported);
+
+/**
+ * fscrypt_limit_io_blocks() - limit I/O blocks to avoid discontiguous DUNs
+ * @inode: the file on which I/O is being done
+ * @lblk: the block at which the I/O is being started from
+ * @nr_blocks: the number of blocks we want to submit starting at @lblk
+ *
+ * Determine the limit to the number of blocks that can be submitted in a bio
+ * targeting @lblk without causing a data unit number (DUN) discontiguity.
+ *
+ * This is normally just @nr_blocks, as normally the DUNs just increment along
+ * with the logical blocks. (Or the file is not encrypted.)
+ *
+ * In rare cases, fscrypt can be using an IV generation method that allows the
+ * DUN to wrap around within logically contiguous blocks, and that wraparound
+ * will occur. If this happens, a value less than @nr_blocks will be returned
+ * so that the wraparound doesn't occur in the middle of a bio, which would
+ * cause encryption/decryption to produce wrong results.
+ *
+ * Return: the actual number of blocks that can be submitted
+ */
+u64 fscrypt_limit_io_blocks(const struct inode *inode, u64 lblk, u64 nr_blocks)
+{
+ const struct fscrypt_info *ci;
+ u32 dun;
+
+ if (!fscrypt_inode_uses_inline_crypto(inode))
+ return nr_blocks;
+
+ if (nr_blocks <= 1)
+ return nr_blocks;
+
+ ci = inode->i_crypt_info;
+ if (!(fscrypt_policy_flags(&ci->ci_policy) &
+ FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32))
+ return nr_blocks;
+
+ /* With IV_INO_LBLK_32, the DUN can wrap around from U32_MAX to 0. */
+
+ dun = ci->ci_hashed_ino + lblk;
+
+ return min_t(u64, nr_blocks, (u64)U32_MAX + 1 - dun);
+}
+EXPORT_SYMBOL_GPL(fscrypt_limit_io_blocks);
diff --git a/fs/dax.c b/fs/dax.c
index ab0978739eaa..67a08a32fccb 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -389,7 +389,7 @@ static struct page *dax_busy_page(void *entry)
}
/*
- * dax_lock_mapping_entry - Lock the DAX entry corresponding to a page
+ * dax_lock_page - Lock the DAX entry corresponding to a page
* @page: The page whose entry we want to lock
*
* Context: Process context.
diff --git a/fs/dcache.c b/fs/dcache.c
index c84269c6e8bf..93f4f5ee07bf 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1766,7 +1766,8 @@ static struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
char *dname;
int err;
- dentry = kmem_cache_alloc(dentry_cache, GFP_KERNEL);
+ dentry = kmem_cache_alloc_lru(dentry_cache, &sb->s_dentry_lru,
+ GFP_KERNEL);
if (!dentry)
return NULL;
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 38bca4980a1c..aef06e607b40 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -402,9 +402,6 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio,
bio->bi_end_io = dio_bio_end_aio;
else
bio->bi_end_io = dio_bio_end_io;
-
- bio->bi_write_hint = dio->iocb->ki_hint;
-
sdio->bio = bio;
sdio->logical_offset_in_bio = sdio->cur_page_fs_offset;
}
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index 7d85e64ea62f..9ad61b582f07 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -540,12 +540,13 @@ const struct address_space_operations ecryptfs_aops = {
* XXX: This is pretty broken for multiple reasons: ecryptfs does not
* actually use buffer_heads, and ecryptfs will crash without
* CONFIG_BLOCK. But it matches the behavior before the default for
- * address_space_operations without the ->set_page_dirty method was
+ * address_space_operations without the ->dirty_folio method was
* cleaned up, so this is the best we can do without maintainer
* feedback.
*/
#ifdef CONFIG_BLOCK
- .set_page_dirty = __set_page_dirty_buffers,
+ .dirty_folio = block_dirty_folio,
+ .invalidate_folio = block_invalidate_folio,
#endif
.writepage = ecryptfs_writepage,
.readpage = ecryptfs_readpage,
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index 39116af0390f..0b1c878317ab 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -38,7 +38,7 @@ static struct inode *ecryptfs_alloc_inode(struct super_block *sb)
struct ecryptfs_inode_info *inode_info;
struct inode *inode = NULL;
- inode_info = kmem_cache_alloc(ecryptfs_inode_info_cache, GFP_KERNEL);
+ inode_info = alloc_inode_sb(sb, ecryptfs_inode_info_cache, GFP_KERNEL);
if (unlikely(!inode_info))
goto out;
if (ecryptfs_init_crypt_stat(&inode_info->crypt_stat)) {
diff --git a/fs/efs/super.c b/fs/efs/super.c
index 62b155b9366b..b287f47c165b 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -69,7 +69,7 @@ static struct kmem_cache * efs_inode_cachep;
static struct inode *efs_alloc_inode(struct super_block *sb)
{
struct efs_inode_info *ei;
- ei = kmem_cache_alloc(efs_inode_cachep, GFP_KERNEL);
+ ei = alloc_inode_sb(sb, efs_inode_cachep, GFP_KERNEL);
if (!ei)
return NULL;
return &ei->vfs_inode;
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index fa7ddb7ad980..780db1e5f4b7 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -28,10 +28,10 @@ void erofs_put_metabuf(struct erofs_buf *buf)
buf->page = NULL;
}
-void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb,
- erofs_blk_t blkaddr, enum erofs_kmap_type type)
+void *erofs_bread(struct erofs_buf *buf, struct inode *inode,
+ erofs_blk_t blkaddr, enum erofs_kmap_type type)
{
- struct address_space *const mapping = sb->s_bdev->bd_inode->i_mapping;
+ struct address_space *const mapping = inode->i_mapping;
erofs_off_t offset = blknr_to_addr(blkaddr);
pgoff_t index = offset >> PAGE_SHIFT;
struct page *page = buf->page;
@@ -60,6 +60,12 @@ void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb,
return buf->base + (offset & ~PAGE_MASK);
}
+void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb,
+ erofs_blk_t blkaddr, enum erofs_kmap_type type)
+{
+ return erofs_bread(buf, sb->s_bdev->bd_inode, blkaddr, type);
+}
+
static int erofs_map_blocks_flatmode(struct inode *inode,
struct erofs_map_blocks *map,
int flags)
@@ -252,12 +258,10 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
return ret;
iomap->offset = map.m_la;
- if (flags & IOMAP_DAX) {
+ if (flags & IOMAP_DAX)
iomap->dax_dev = mdev.m_daxdev;
- iomap->offset += mdev.m_dax_part_off;
- } else {
+ else
iomap->bdev = mdev.m_bdev;
- }
iomap->length = map.m_llen;
iomap->flags = 0;
iomap->private = NULL;
@@ -284,6 +288,8 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
} else {
iomap->type = IOMAP_MAPPED;
iomap->addr = mdev.m_pa;
+ if (flags & IOMAP_DAX)
+ iomap->addr += mdev.m_dax_part_off;
}
return 0;
}
diff --git a/fs/erofs/dir.c b/fs/erofs/dir.c
index eee9b0b31b63..18e59821c597 100644
--- a/fs/erofs/dir.c
+++ b/fs/erofs/dir.c
@@ -2,6 +2,7 @@
/*
* Copyright (C) 2017-2018 HUAWEI, Inc.
* https://www.huawei.com/
+ * Copyright (C) 2022, Alibaba Cloud
*/
#include "internal.h"
@@ -67,7 +68,7 @@ static int erofs_fill_dentries(struct inode *dir, struct dir_context *ctx,
static int erofs_readdir(struct file *f, struct dir_context *ctx)
{
struct inode *dir = file_inode(f);
- struct address_space *mapping = dir->i_mapping;
+ struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
const size_t dirsize = i_size_read(dir);
unsigned int i = ctx->pos / EROFS_BLKSIZ;
unsigned int ofs = ctx->pos % EROFS_BLKSIZ;
@@ -75,26 +76,19 @@ static int erofs_readdir(struct file *f, struct dir_context *ctx)
bool initial = true;
while (ctx->pos < dirsize) {
- struct page *dentry_page;
struct erofs_dirent *de;
unsigned int nameoff, maxsize;
- dentry_page = read_mapping_page(mapping, i, NULL);
- if (dentry_page == ERR_PTR(-ENOMEM)) {
- err = -ENOMEM;
- break;
- } else if (IS_ERR(dentry_page)) {
+ de = erofs_bread(&buf, dir, i, EROFS_KMAP);
+ if (IS_ERR(de)) {
erofs_err(dir->i_sb,
"fail to readdir of logical block %u of nid %llu",
i, EROFS_I(dir)->nid);
- err = -EFSCORRUPTED;
+ err = PTR_ERR(de);
break;
}
- de = (struct erofs_dirent *)kmap(dentry_page);
-
nameoff = le16_to_cpu(de->nameoff);
-
if (nameoff < sizeof(struct erofs_dirent) ||
nameoff >= PAGE_SIZE) {
erofs_err(dir->i_sb,
@@ -119,10 +113,6 @@ static int erofs_readdir(struct file *f, struct dir_context *ctx)
err = erofs_fill_dentries(dir, ctx, de, &ofs,
nameoff, maxsize);
skip_this:
- kunmap(dentry_page);
-
- put_page(dentry_page);
-
ctx->pos = blknr_to_addr(i) + ofs;
if (err)
@@ -130,6 +120,7 @@ skip_this:
++i;
ofs = 0;
}
+ erofs_put_metabuf(&buf);
return err < 0 ? err : 0;
}
diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h
index 3ea62c6fb00a..1238ca104f09 100644
--- a/fs/erofs/erofs_fs.h
+++ b/fs/erofs/erofs_fs.h
@@ -12,6 +12,7 @@
#define EROFS_SUPER_OFFSET 1024
#define EROFS_FEATURE_COMPAT_SB_CHKSUM 0x00000001
+#define EROFS_FEATURE_COMPAT_MTIME 0x00000002
/*
* Any bits that aren't in EROFS_ALL_FEATURE_INCOMPAT should
@@ -186,8 +187,8 @@ struct erofs_inode_extended {
__le32 i_uid;
__le32 i_gid;
- __le64 i_ctime;
- __le32 i_ctime_nsec;
+ __le64 i_mtime;
+ __le32 i_mtime_nsec;
__le32 i_nlink;
__u8 i_reserved2[16];
};
diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index ff62f84f47d3..e8b37ba5e9ad 100644
--- a/fs/erofs/inode.c
+++ b/fs/erofs/inode.c
@@ -113,8 +113,8 @@ static void *erofs_read_inode(struct erofs_buf *buf,
set_nlink(inode, le32_to_cpu(die->i_nlink));
/* extended inode has its own timestamp */
- inode->i_ctime.tv_sec = le64_to_cpu(die->i_ctime);
- inode->i_ctime.tv_nsec = le32_to_cpu(die->i_ctime_nsec);
+ inode->i_ctime.tv_sec = le64_to_cpu(die->i_mtime);
+ inode->i_ctime.tv_nsec = le32_to_cpu(die->i_mtime_nsec);
inode->i_size = le64_to_cpu(die->i_size);
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index b8272fb95fd6..5298c4ee277d 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -325,7 +325,7 @@ struct erofs_inode {
unsigned char z_algorithmtype[2];
unsigned char z_logical_clusterbits;
unsigned long z_tailextent_headlcn;
- unsigned int z_idataoff;
+ erofs_off_t z_idataoff;
unsigned short z_idata_size;
};
#endif /* CONFIG_EROFS_FS_ZIP */
@@ -479,6 +479,8 @@ struct erofs_map_dev {
extern const struct file_operations erofs_file_fops;
void erofs_unmap_metabuf(struct erofs_buf *buf);
void erofs_put_metabuf(struct erofs_buf *buf);
+void *erofs_bread(struct erofs_buf *buf, struct inode *inode,
+ erofs_blk_t blkaddr, enum erofs_kmap_type type);
void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb,
erofs_blk_t blkaddr, enum erofs_kmap_type type);
int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *dev);
diff --git a/fs/erofs/namei.c b/fs/erofs/namei.c
index 8629e616028c..554efa363317 100644
--- a/fs/erofs/namei.c
+++ b/fs/erofs/namei.c
@@ -2,6 +2,7 @@
/*
* Copyright (C) 2017-2018 HUAWEI, Inc.
* https://www.huawei.com/
+ * Copyright (C) 2022, Alibaba Cloud
*/
#include "xattr.h"
@@ -86,14 +87,14 @@ static struct erofs_dirent *find_target_dirent(struct erofs_qstr *name,
return ERR_PTR(-ENOENT);
}
-static struct page *find_target_block_classic(struct inode *dir,
- struct erofs_qstr *name,
- int *_ndirents)
+static void *find_target_block_classic(struct erofs_buf *target,
+ struct inode *dir,
+ struct erofs_qstr *name,
+ int *_ndirents)
{
unsigned int startprfx, endprfx;
int head, back;
- struct address_space *const mapping = dir->i_mapping;
- struct page *candidate = ERR_PTR(-ENOENT);
+ void *candidate = ERR_PTR(-ENOENT);
startprfx = endprfx = 0;
head = 0;
@@ -101,10 +102,11 @@ static struct page *find_target_block_classic(struct inode *dir,
while (head <= back) {
const int mid = head + (back - head) / 2;
- struct page *page = read_mapping_page(mapping, mid, NULL);
+ struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
+ struct erofs_dirent *de;
- if (!IS_ERR(page)) {
- struct erofs_dirent *de = kmap_atomic(page);
+ de = erofs_bread(&buf, dir, mid, EROFS_KMAP);
+ if (!IS_ERR(de)) {
const int nameoff = nameoff_from_disk(de->nameoff,
EROFS_BLKSIZ);
const int ndirents = nameoff / sizeof(*de);
@@ -113,13 +115,12 @@ static struct page *find_target_block_classic(struct inode *dir,
struct erofs_qstr dname;
if (!ndirents) {
- kunmap_atomic(de);
- put_page(page);
+ erofs_put_metabuf(&buf);
erofs_err(dir->i_sb,
"corrupted dir block %d @ nid %llu",
mid, EROFS_I(dir)->nid);
DBG_BUGON(1);
- page = ERR_PTR(-EFSCORRUPTED);
+ de = ERR_PTR(-EFSCORRUPTED);
goto out;
}
@@ -135,7 +136,6 @@ static struct page *find_target_block_classic(struct inode *dir,
/* string comparison without already matched prefix */
diff = erofs_dirnamecmp(name, &dname, &matched);
- kunmap_atomic(de);
if (!diff) {
*_ndirents = 0;
@@ -145,11 +145,12 @@ static struct page *find_target_block_classic(struct inode *dir,
startprfx = matched;
if (!IS_ERR(candidate))
- put_page(candidate);
- candidate = page;
+ erofs_put_metabuf(target);
+ *target = buf;
+ candidate = de;
*_ndirents = ndirents;
} else {
- put_page(page);
+ erofs_put_metabuf(&buf);
back = mid - 1;
endprfx = matched;
@@ -158,8 +159,8 @@ static struct page *find_target_block_classic(struct inode *dir,
}
out: /* free if the candidate is valid */
if (!IS_ERR(candidate))
- put_page(candidate);
- return page;
+ erofs_put_metabuf(target);
+ return de;
}
return candidate;
}
@@ -169,8 +170,7 @@ int erofs_namei(struct inode *dir,
erofs_nid_t *nid, unsigned int *d_type)
{
int ndirents;
- struct page *page;
- void *data;
+ struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
struct erofs_dirent *de;
struct erofs_qstr qn;
@@ -181,26 +181,20 @@ int erofs_namei(struct inode *dir,
qn.end = name->name + name->len;
ndirents = 0;
- page = find_target_block_classic(dir, &qn, &ndirents);
- if (IS_ERR(page))
- return PTR_ERR(page);
+ de = find_target_block_classic(&buf, dir, &qn, &ndirents);
+ if (IS_ERR(de))
+ return PTR_ERR(de);
- data = kmap_atomic(page);
/* the target page has been mapped */
if (ndirents)
- de = find_target_dirent(&qn, data, EROFS_BLKSIZ, ndirents);
- else
- de = (struct erofs_dirent *)data;
+ de = find_target_dirent(&qn, (u8 *)de, EROFS_BLKSIZ, ndirents);
if (!IS_ERR(de)) {
*nid = le64_to_cpu(de->nid);
*d_type = de->file_type;
}
-
- kunmap_atomic(data);
- put_page(page);
-
+ erofs_put_metabuf(&buf);
return PTR_ERR_OR_ZERO(de);
}
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 915eefe0d7e2..0c4b41130c2f 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -84,7 +84,7 @@ static void erofs_inode_init_once(void *ptr)
static struct inode *erofs_alloc_inode(struct super_block *sb)
{
struct erofs_inode *vi =
- kmem_cache_alloc(erofs_inode_cachep, GFP_KERNEL);
+ alloc_inode_sb(sb, erofs_inode_cachep, GFP_KERNEL);
if (!vi)
return NULL;
@@ -281,21 +281,19 @@ static int erofs_init_devices(struct super_block *sb,
static int erofs_read_superblock(struct super_block *sb)
{
struct erofs_sb_info *sbi;
- struct page *page;
+ struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
struct erofs_super_block *dsb;
unsigned int blkszbits;
void *data;
int ret;
- page = read_mapping_page(sb->s_bdev->bd_inode->i_mapping, 0, NULL);
- if (IS_ERR(page)) {
+ data = erofs_read_metabuf(&buf, sb, 0, EROFS_KMAP);
+ if (IS_ERR(data)) {
erofs_err(sb, "cannot read erofs superblock");
- return PTR_ERR(page);
+ return PTR_ERR(data);
}
sbi = EROFS_SB(sb);
-
- data = kmap(page);
dsb = (struct erofs_super_block *)(data + EROFS_SUPER_OFFSET);
ret = -EINVAL;
@@ -365,8 +363,7 @@ static int erofs_read_superblock(struct super_block *sb)
if (erofs_sb_has_ztailpacking(sbi))
erofs_info(sb, "EXPERIMENTAL compressed inline data feature in use. Use at your own risk!");
out:
- kunmap(page);
- put_page(page);
+ erofs_put_metabuf(&buf);
return ret;
}
@@ -535,25 +532,29 @@ static int erofs_managed_cache_releasepage(struct page *page, gfp_t gfp_mask)
return ret;
}
-static void erofs_managed_cache_invalidatepage(struct page *page,
- unsigned int offset,
- unsigned int length)
+/*
+ * It will be called only on inode eviction. In case that there are still some
+ * decompression requests in progress, wait with rescheduling for a bit here.
+ * We could introduce an extra locking instead but it seems unnecessary.
+ */
+static void erofs_managed_cache_invalidate_folio(struct folio *folio,
+ size_t offset, size_t length)
{
- const unsigned int stop = length + offset;
+ const size_t stop = length + offset;
- DBG_BUGON(!PageLocked(page));
+ DBG_BUGON(!folio_test_locked(folio));
/* Check for potential overflow in debug mode */
- DBG_BUGON(stop > PAGE_SIZE || stop < length);
+ DBG_BUGON(stop > folio_size(folio) || stop < length);
- if (offset == 0 && stop == PAGE_SIZE)
- while (!erofs_managed_cache_releasepage(page, GFP_NOFS))
+ if (offset == 0 && stop == folio_size(folio))
+ while (!erofs_managed_cache_releasepage(&folio->page, GFP_NOFS))
cond_resched();
}
static const struct address_space_operations managed_cache_aops = {
.releasepage = erofs_managed_cache_releasepage,
- .invalidatepage = erofs_managed_cache_invalidatepage,
+ .invalidate_folio = erofs_managed_cache_invalidate_folio,
};
static int erofs_init_managed_cache(struct super_block *sb)
@@ -568,8 +569,7 @@ static int erofs_init_managed_cache(struct super_block *sb)
inode->i_size = OFFSET_MAX;
inode->i_mapping->a_ops = &managed_cache_aops;
- mapping_set_gfp_mask(inode->i_mapping,
- GFP_NOFS | __GFP_HIGHMEM | __GFP_MOVABLE);
+ mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
sbi->managed_cache = inode;
return 0;
}
diff --git a/fs/erofs/sysfs.c b/fs/erofs/sysfs.c
index dac252bc9228..f3babf1e6608 100644
--- a/fs/erofs/sysfs.c
+++ b/fs/erofs/sysfs.c
@@ -221,9 +221,11 @@ void erofs_unregister_sysfs(struct super_block *sb)
{
struct erofs_sb_info *sbi = EROFS_SB(sb);
- kobject_del(&sbi->s_kobj);
- kobject_put(&sbi->s_kobj);
- wait_for_completion(&sbi->s_kobj_unregister);
+ if (sbi->s_kobj.state_in_sysfs) {
+ kobject_del(&sbi->s_kobj);
+ kobject_put(&sbi->s_kobj);
+ wait_for_completion(&sbi->s_kobj_unregister);
+ }
}
int __init erofs_init_sysfs(void)
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index db7de2dbac73..0ed880f42525 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -192,7 +192,10 @@ enum z_erofs_collectmode {
COLLECT_PRIMARY_FOLLOWED,
};
-struct z_erofs_collector {
+struct z_erofs_decompress_frontend {
+ struct inode *const inode;
+ struct erofs_map_blocks map;
+
struct z_erofs_pagevec_ctor vector;
struct z_erofs_pcluster *pcl, *tailpcl;
@@ -202,13 +205,6 @@ struct z_erofs_collector {
z_erofs_next_pcluster_t owned_head;
enum z_erofs_collectmode mode;
-};
-
-struct z_erofs_decompress_frontend {
- struct inode *const inode;
-
- struct z_erofs_collector clt;
- struct erofs_map_blocks map;
bool readahead;
/* used for applying cache strategy on the fly */
@@ -216,30 +212,30 @@ struct z_erofs_decompress_frontend {
erofs_off_t headoffset;
};
-#define COLLECTOR_INIT() { \
- .owned_head = Z_EROFS_PCLUSTER_TAIL, \
- .mode = COLLECT_PRIMARY_FOLLOWED }
-
#define DECOMPRESS_FRONTEND_INIT(__i) { \
- .inode = __i, .clt = COLLECTOR_INIT(), \
- .backmost = true, }
+ .inode = __i, .owned_head = Z_EROFS_PCLUSTER_TAIL, \
+ .mode = COLLECT_PRIMARY_FOLLOWED }
static struct page *z_pagemap_global[Z_EROFS_VMAP_GLOBAL_PAGES];
static DEFINE_MUTEX(z_pagemap_global_lock);
-static void preload_compressed_pages(struct z_erofs_collector *clt,
- struct address_space *mc,
- enum z_erofs_cache_alloctype type,
- struct page **pagepool)
+static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe,
+ enum z_erofs_cache_alloctype type,
+ struct page **pagepool)
{
- struct z_erofs_pcluster *pcl = clt->pcl;
+ struct address_space *mc = MNGD_MAPPING(EROFS_I_SB(fe->inode));
+ struct z_erofs_pcluster *pcl = fe->pcl;
bool standalone = true;
+ /*
+ * optimistic allocation without direct reclaim since inplace I/O
+ * can be used if low memory otherwise.
+ */
gfp_t gfp = (mapping_gfp_mask(mc) & ~__GFP_DIRECT_RECLAIM) |
__GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;
struct page **pages;
pgoff_t index;
- if (clt->mode < COLLECT_PRIMARY_FOLLOWED)
+ if (fe->mode < COLLECT_PRIMARY_FOLLOWED)
return;
pages = pcl->compressed_pages;
@@ -288,7 +284,7 @@ static void preload_compressed_pages(struct z_erofs_collector *clt,
* managed cache since it can be moved to the bypass queue instead.
*/
if (standalone)
- clt->mode = COLLECT_PRIMARY_FOLLOWED_NOINPLACE;
+ fe->mode = COLLECT_PRIMARY_FOLLOWED_NOINPLACE;
}
/* called by erofs_shrinker to get rid of all compressed_pages */
@@ -350,47 +346,47 @@ int erofs_try_to_free_cached_page(struct page *page)
}
/* page_type must be Z_EROFS_PAGE_TYPE_EXCLUSIVE */
-static bool z_erofs_try_inplace_io(struct z_erofs_collector *clt,
+static bool z_erofs_try_inplace_io(struct z_erofs_decompress_frontend *fe,
struct page *page)
{
- struct z_erofs_pcluster *const pcl = clt->pcl;
+ struct z_erofs_pcluster *const pcl = fe->pcl;
- while (clt->icpage_ptr > pcl->compressed_pages)
- if (!cmpxchg(--clt->icpage_ptr, NULL, page))
+ while (fe->icpage_ptr > pcl->compressed_pages)
+ if (!cmpxchg(--fe->icpage_ptr, NULL, page))
return true;
return false;
}
/* callers must be with collection lock held */
-static int z_erofs_attach_page(struct z_erofs_collector *clt,
+static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe,
struct page *page, enum z_erofs_page_type type,
bool pvec_safereuse)
{
int ret;
/* give priority for inplaceio */
- if (clt->mode >= COLLECT_PRIMARY &&
+ if (fe->mode >= COLLECT_PRIMARY &&
type == Z_EROFS_PAGE_TYPE_EXCLUSIVE &&
- z_erofs_try_inplace_io(clt, page))
+ z_erofs_try_inplace_io(fe, page))
return 0;
- ret = z_erofs_pagevec_enqueue(&clt->vector, page, type,
+ ret = z_erofs_pagevec_enqueue(&fe->vector, page, type,
pvec_safereuse);
- clt->cl->vcnt += (unsigned int)ret;
+ fe->cl->vcnt += (unsigned int)ret;
return ret ? 0 : -EAGAIN;
}
-static void z_erofs_try_to_claim_pcluster(struct z_erofs_collector *clt)
+static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f)
{
- struct z_erofs_pcluster *pcl = clt->pcl;
- z_erofs_next_pcluster_t *owned_head = &clt->owned_head;
+ struct z_erofs_pcluster *pcl = f->pcl;
+ z_erofs_next_pcluster_t *owned_head = &f->owned_head;
/* type 1, nil pcluster (this pcluster doesn't belong to any chain.) */
if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_NIL,
*owned_head) == Z_EROFS_PCLUSTER_NIL) {
*owned_head = &pcl->next;
/* so we can attach this pcluster to our submission chain. */
- clt->mode = COLLECT_PRIMARY_FOLLOWED;
+ f->mode = COLLECT_PRIMARY_FOLLOWED;
return;
}
@@ -401,24 +397,24 @@ static void z_erofs_try_to_claim_pcluster(struct z_erofs_collector *clt)
if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL,
*owned_head) == Z_EROFS_PCLUSTER_TAIL) {
*owned_head = Z_EROFS_PCLUSTER_TAIL;
- clt->mode = COLLECT_PRIMARY_HOOKED;
- clt->tailpcl = NULL;
+ f->mode = COLLECT_PRIMARY_HOOKED;
+ f->tailpcl = NULL;
return;
}
/* type 3, it belongs to a chain, but it isn't the end of the chain */
- clt->mode = COLLECT_PRIMARY;
+ f->mode = COLLECT_PRIMARY;
}
-static int z_erofs_lookup_collection(struct z_erofs_collector *clt,
+static int z_erofs_lookup_collection(struct z_erofs_decompress_frontend *fe,
struct inode *inode,
struct erofs_map_blocks *map)
{
- struct z_erofs_pcluster *pcl = clt->pcl;
+ struct z_erofs_pcluster *pcl = fe->pcl;
struct z_erofs_collection *cl;
unsigned int length;
/* to avoid unexpected loop formed by corrupted images */
- if (clt->owned_head == &pcl->next || pcl == clt->tailpcl) {
+ if (fe->owned_head == &pcl->next || pcl == fe->tailpcl) {
DBG_BUGON(1);
return -EFSCORRUPTED;
}
@@ -449,15 +445,15 @@ static int z_erofs_lookup_collection(struct z_erofs_collector *clt,
}
mutex_lock(&cl->lock);
/* used to check tail merging loop due to corrupted images */
- if (clt->owned_head == Z_EROFS_PCLUSTER_TAIL)
- clt->tailpcl = pcl;
+ if (fe->owned_head == Z_EROFS_PCLUSTER_TAIL)
+ fe->tailpcl = pcl;
- z_erofs_try_to_claim_pcluster(clt);
- clt->cl = cl;
+ z_erofs_try_to_claim_pcluster(fe);
+ fe->cl = cl;
return 0;
}
-static int z_erofs_register_collection(struct z_erofs_collector *clt,
+static int z_erofs_register_collection(struct z_erofs_decompress_frontend *fe,
struct inode *inode,
struct erofs_map_blocks *map)
{
@@ -485,8 +481,8 @@ static int z_erofs_register_collection(struct z_erofs_collector *clt,
Z_EROFS_PCLUSTER_FULL_LENGTH : 0);
/* new pclusters should be claimed as type 1, primary and followed */
- pcl->next = clt->owned_head;
- clt->mode = COLLECT_PRIMARY_FOLLOWED;
+ pcl->next = fe->owned_head;
+ fe->mode = COLLECT_PRIMARY_FOLLOWED;
cl = z_erofs_primarycollection(pcl);
cl->pageofs = map->m_la & ~PAGE_MASK;
@@ -512,18 +508,18 @@ static int z_erofs_register_collection(struct z_erofs_collector *clt,
}
if (grp != &pcl->obj) {
- clt->pcl = container_of(grp,
+ fe->pcl = container_of(grp,
struct z_erofs_pcluster, obj);
err = -EEXIST;
goto err_out;
}
}
/* used to check tail merging loop due to corrupted images */
- if (clt->owned_head == Z_EROFS_PCLUSTER_TAIL)
- clt->tailpcl = pcl;
- clt->owned_head = &pcl->next;
- clt->pcl = pcl;
- clt->cl = cl;
+ if (fe->owned_head == Z_EROFS_PCLUSTER_TAIL)
+ fe->tailpcl = pcl;
+ fe->owned_head = &pcl->next;
+ fe->pcl = pcl;
+ fe->cl = cl;
return 0;
err_out:
@@ -532,18 +528,18 @@ err_out:
return err;
}
-static int z_erofs_collector_begin(struct z_erofs_collector *clt,
+static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe,
struct inode *inode,
struct erofs_map_blocks *map)
{
struct erofs_workgroup *grp;
int ret;
- DBG_BUGON(clt->cl);
+ DBG_BUGON(fe->cl);
/* must be Z_EROFS_PCLUSTER_TAIL or pointed to previous collection */
- DBG_BUGON(clt->owned_head == Z_EROFS_PCLUSTER_NIL);
- DBG_BUGON(clt->owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
+ DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_NIL);
+ DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
if (map->m_flags & EROFS_MAP_META) {
if ((map->m_pa & ~PAGE_MASK) + map->m_plen > PAGE_SIZE) {
@@ -555,28 +551,28 @@ static int z_erofs_collector_begin(struct z_erofs_collector *clt,
grp = erofs_find_workgroup(inode->i_sb, map->m_pa >> PAGE_SHIFT);
if (grp) {
- clt->pcl = container_of(grp, struct z_erofs_pcluster, obj);
+ fe->pcl = container_of(grp, struct z_erofs_pcluster, obj);
} else {
tailpacking:
- ret = z_erofs_register_collection(clt, inode, map);
+ ret = z_erofs_register_collection(fe, inode, map);
if (!ret)
goto out;
if (ret != -EEXIST)
return ret;
}
- ret = z_erofs_lookup_collection(clt, inode, map);
+ ret = z_erofs_lookup_collection(fe, inode, map);
if (ret) {
- erofs_workgroup_put(&clt->pcl->obj);
+ erofs_workgroup_put(&fe->pcl->obj);
return ret;
}
out:
- z_erofs_pagevec_ctor_init(&clt->vector, Z_EROFS_NR_INLINE_PAGEVECS,
- clt->cl->pagevec, clt->cl->vcnt);
+ z_erofs_pagevec_ctor_init(&fe->vector, Z_EROFS_NR_INLINE_PAGEVECS,
+ fe->cl->pagevec, fe->cl->vcnt);
/* since file-backed online pages are traversed in reverse order */
- clt->icpage_ptr = clt->pcl->compressed_pages +
- z_erofs_pclusterpages(clt->pcl);
+ fe->icpage_ptr = fe->pcl->compressed_pages +
+ z_erofs_pclusterpages(fe->pcl);
return 0;
}
@@ -610,24 +606,24 @@ static void z_erofs_collection_put(struct z_erofs_collection *cl)
erofs_workgroup_put(&pcl->obj);
}
-static bool z_erofs_collector_end(struct z_erofs_collector *clt)
+static bool z_erofs_collector_end(struct z_erofs_decompress_frontend *fe)
{
- struct z_erofs_collection *cl = clt->cl;
+ struct z_erofs_collection *cl = fe->cl;
if (!cl)
return false;
- z_erofs_pagevec_ctor_exit(&clt->vector, false);
+ z_erofs_pagevec_ctor_exit(&fe->vector, false);
mutex_unlock(&cl->lock);
/*
* if all pending pages are added, don't hold its reference
* any longer if the pcluster isn't hosted by ourselves.
*/
- if (clt->mode < COLLECT_PRIMARY_FOLLOWED_NOINPLACE)
+ if (fe->mode < COLLECT_PRIMARY_FOLLOWED_NOINPLACE)
z_erofs_collection_put(cl);
- clt->cl = NULL;
+ fe->cl = NULL;
return true;
}
@@ -651,7 +647,6 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe,
struct inode *const inode = fe->inode;
struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
struct erofs_map_blocks *const map = &fe->map;
- struct z_erofs_collector *const clt = &fe->clt;
const loff_t offset = page_offset(page);
bool tight = true;
@@ -672,7 +667,7 @@ repeat:
if (offset + cur >= map->m_la &&
offset + cur < map->m_la + map->m_llen) {
/* didn't get a valid collection previously (very rare) */
- if (!clt->cl)
+ if (!fe->cl)
goto restart_now;
goto hitted;
}
@@ -680,7 +675,7 @@ repeat:
/* go ahead the next map_blocks */
erofs_dbg("%s: [out-of-range] pos %llu", __func__, offset + cur);
- if (z_erofs_collector_end(clt))
+ if (z_erofs_collector_end(fe))
fe->backmost = false;
map->m_la = offset + cur;
@@ -693,11 +688,11 @@ restart_now:
if (!(map->m_flags & EROFS_MAP_MAPPED))
goto hitted;
- err = z_erofs_collector_begin(clt, inode, map);
+ err = z_erofs_collector_begin(fe, inode, map);
if (err)
goto err_out;
- if (z_erofs_is_inline_pcluster(clt->pcl)) {
+ if (z_erofs_is_inline_pcluster(fe->pcl)) {
void *mp;
mp = erofs_read_metabuf(&fe->map.buf, inode->i_sb,
@@ -709,20 +704,18 @@ restart_now:
goto err_out;
}
get_page(fe->map.buf.page);
- WRITE_ONCE(clt->pcl->compressed_pages[0], fe->map.buf.page);
- clt->mode = COLLECT_PRIMARY_FOLLOWED_NOINPLACE;
+ WRITE_ONCE(fe->pcl->compressed_pages[0], fe->map.buf.page);
+ fe->mode = COLLECT_PRIMARY_FOLLOWED_NOINPLACE;
} else {
- /* preload all compressed pages (can change mode if needed) */
+ /* bind cache first when cached decompression is preferred */
if (should_alloc_managed_pages(fe, sbi->opt.cache_strategy,
map->m_la))
cache_strategy = TRYALLOC;
else
cache_strategy = DONTALLOC;
- preload_compressed_pages(clt, MNGD_MAPPING(sbi),
- cache_strategy, pagepool);
+ z_erofs_bind_cache(fe, cache_strategy, pagepool);
}
-
hitted:
/*
* Ensure the current partial page belongs to this submit chain rather
@@ -730,8 +723,8 @@ hitted:
* those chains are handled asynchronously thus the page cannot be used
* for inplace I/O or pagevec (should be processed in strict order.)
*/
- tight &= (clt->mode >= COLLECT_PRIMARY_HOOKED &&
- clt->mode != COLLECT_PRIMARY_FOLLOWED_NOINPLACE);
+ tight &= (fe->mode >= COLLECT_PRIMARY_HOOKED &&
+ fe->mode != COLLECT_PRIMARY_FOLLOWED_NOINPLACE);
cur = end - min_t(unsigned int, offset + end - map->m_la, end);
if (!(map->m_flags & EROFS_MAP_MAPPED)) {
@@ -746,18 +739,18 @@ hitted:
Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED));
if (cur)
- tight &= (clt->mode >= COLLECT_PRIMARY_FOLLOWED);
+ tight &= (fe->mode >= COLLECT_PRIMARY_FOLLOWED);
retry:
- err = z_erofs_attach_page(clt, page, page_type,
- clt->mode >= COLLECT_PRIMARY_FOLLOWED);
+ err = z_erofs_attach_page(fe, page, page_type,
+ fe->mode >= COLLECT_PRIMARY_FOLLOWED);
/* should allocate an additional short-lived page for pagevec */
if (err == -EAGAIN) {
struct page *const newpage =
alloc_page(GFP_NOFS | __GFP_NOFAIL);
set_page_private(newpage, Z_EROFS_SHORTLIVED_PAGE);
- err = z_erofs_attach_page(clt, newpage,
+ err = z_erofs_attach_page(fe, newpage,
Z_EROFS_PAGE_TYPE_EXCLUSIVE, true);
if (!err)
goto retry;
@@ -773,7 +766,7 @@ retry:
/* bump up the number of spiltted parts of a page */
++spiltted;
/* also update nr_pages */
- clt->cl->nr_pages = max_t(pgoff_t, clt->cl->nr_pages, index + 1);
+ fe->cl->nr_pages = max_t(pgoff_t, fe->cl->nr_pages, index + 1);
next_part:
/* can be used for verification */
map->m_llen = offset + cur - map->m_la;
@@ -810,68 +803,11 @@ static bool z_erofs_get_sync_decompress_policy(struct erofs_sb_info *sbi,
return false;
}
-static void z_erofs_decompressqueue_work(struct work_struct *work);
-static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io,
- bool sync, int bios)
-{
- struct erofs_sb_info *const sbi = EROFS_SB(io->sb);
-
- /* wake up the caller thread for sync decompression */
- if (sync) {
- unsigned long flags;
-
- spin_lock_irqsave(&io->u.wait.lock, flags);
- if (!atomic_add_return(bios, &io->pending_bios))
- wake_up_locked(&io->u.wait);
- spin_unlock_irqrestore(&io->u.wait.lock, flags);
- return;
- }
-
- if (atomic_add_return(bios, &io->pending_bios))
- return;
- /* Use workqueue and sync decompression for atomic contexts only */
- if (in_atomic() || irqs_disabled()) {
- queue_work(z_erofs_workqueue, &io->u.work);
- /* enable sync decompression for readahead */
- if (sbi->opt.sync_decompress == EROFS_SYNC_DECOMPRESS_AUTO)
- sbi->opt.sync_decompress = EROFS_SYNC_DECOMPRESS_FORCE_ON;
- return;
- }
- z_erofs_decompressqueue_work(&io->u.work);
-}
-
static bool z_erofs_page_is_invalidated(struct page *page)
{
return !page->mapping && !z_erofs_is_shortlived_page(page);
}
-static void z_erofs_decompressqueue_endio(struct bio *bio)
-{
- tagptr1_t t = tagptr_init(tagptr1_t, bio->bi_private);
- struct z_erofs_decompressqueue *q = tagptr_unfold_ptr(t);
- blk_status_t err = bio->bi_status;
- struct bio_vec *bvec;
- struct bvec_iter_all iter_all;
-
- bio_for_each_segment_all(bvec, bio, iter_all) {
- struct page *page = bvec->bv_page;
-
- DBG_BUGON(PageUptodate(page));
- DBG_BUGON(z_erofs_page_is_invalidated(page));
-
- if (err)
- SetPageError(page);
-
- if (erofs_page_is_managed(EROFS_SB(q->sb), page)) {
- if (!err)
- SetPageUptodate(page);
- unlock_page(page);
- }
- }
- z_erofs_decompress_kickoff(q, tagptr_unfold_tags(t), -1);
- bio_put(bio);
-}
-
static int z_erofs_decompress_pcluster(struct super_block *sb,
struct z_erofs_pcluster *pcl,
struct page **pagepool)
@@ -1123,13 +1059,42 @@ static void z_erofs_decompressqueue_work(struct work_struct *work)
kvfree(bgq);
}
+static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io,
+ bool sync, int bios)
+{
+ struct erofs_sb_info *const sbi = EROFS_SB(io->sb);
+
+ /* wake up the caller thread for sync decompression */
+ if (sync) {
+ unsigned long flags;
+
+ spin_lock_irqsave(&io->u.wait.lock, flags);
+ if (!atomic_add_return(bios, &io->pending_bios))
+ wake_up_locked(&io->u.wait);
+ spin_unlock_irqrestore(&io->u.wait.lock, flags);
+ return;
+ }
+
+ if (atomic_add_return(bios, &io->pending_bios))
+ return;
+ /* Use workqueue and sync decompression for atomic contexts only */
+ if (in_atomic() || irqs_disabled()) {
+ queue_work(z_erofs_workqueue, &io->u.work);
+ /* enable sync decompression for readahead */
+ if (sbi->opt.sync_decompress == EROFS_SYNC_DECOMPRESS_AUTO)
+ sbi->opt.sync_decompress = EROFS_SYNC_DECOMPRESS_FORCE_ON;
+ return;
+ }
+ z_erofs_decompressqueue_work(&io->u.work);
+}
+
static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl,
unsigned int nr,
struct page **pagepool,
- struct address_space *mc,
- gfp_t gfp)
+ struct address_space *mc)
{
const pgoff_t index = pcl->obj.index;
+ gfp_t gfp = mapping_gfp_mask(mc);
bool tocache = false;
struct address_space *mapping;
@@ -1300,6 +1265,33 @@ static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl,
qtail[JQ_BYPASS] = &pcl->next;
}
+static void z_erofs_decompressqueue_endio(struct bio *bio)
+{
+ tagptr1_t t = tagptr_init(tagptr1_t, bio->bi_private);
+ struct z_erofs_decompressqueue *q = tagptr_unfold_ptr(t);
+ blk_status_t err = bio->bi_status;
+ struct bio_vec *bvec;
+ struct bvec_iter_all iter_all;
+
+ bio_for_each_segment_all(bvec, bio, iter_all) {
+ struct page *page = bvec->bv_page;
+
+ DBG_BUGON(PageUptodate(page));
+ DBG_BUGON(z_erofs_page_is_invalidated(page));
+
+ if (err)
+ SetPageError(page);
+
+ if (erofs_page_is_managed(EROFS_SB(q->sb), page)) {
+ if (!err)
+ SetPageUptodate(page);
+ unlock_page(page);
+ }
+ }
+ z_erofs_decompress_kickoff(q, tagptr_unfold_tags(t), -1);
+ bio_put(bio);
+}
+
static void z_erofs_submit_queue(struct super_block *sb,
struct z_erofs_decompress_frontend *f,
struct page **pagepool,
@@ -1310,7 +1302,7 @@ static void z_erofs_submit_queue(struct super_block *sb,
z_erofs_next_pcluster_t qtail[NR_JOBQUEUES];
struct z_erofs_decompressqueue *q[NR_JOBQUEUES];
void *bi_private;
- z_erofs_next_pcluster_t owned_head = f->clt.owned_head;
+ z_erofs_next_pcluster_t owned_head = f->owned_head;
/* bio is NULL initially, so no need to initialize last_{index,bdev} */
pgoff_t last_index;
struct block_device *last_bdev;
@@ -1358,8 +1350,7 @@ static void z_erofs_submit_queue(struct super_block *sb,
struct page *page;
page = pickup_page_for_submission(pcl, i++, pagepool,
- MNGD_MAPPING(sbi),
- GFP_NOFS);
+ MNGD_MAPPING(sbi));
if (!page)
continue;
@@ -1417,7 +1408,7 @@ static void z_erofs_runqueue(struct super_block *sb,
{
struct z_erofs_decompressqueue io[NR_JOBQUEUES];
- if (f->clt.owned_head == Z_EROFS_PCLUSTER_TAIL)
+ if (f->owned_head == Z_EROFS_PCLUSTER_TAIL)
return;
z_erofs_submit_queue(sb, f, pagepool, io, &force_fg);
@@ -1517,7 +1508,7 @@ static int z_erofs_readpage(struct file *file, struct page *page)
err = z_erofs_do_read_page(&f, page, &pagepool);
z_erofs_pcluster_readmore(&f, NULL, 0, &pagepool, false);
- (void)z_erofs_collector_end(&f.clt);
+ (void)z_erofs_collector_end(&f);
/* if some compressed cluster ready, need submit them anyway */
z_erofs_runqueue(inode->i_sb, &f, &pagepool,
@@ -1567,7 +1558,7 @@ static void z_erofs_readahead(struct readahead_control *rac)
put_page(page);
}
z_erofs_pcluster_readmore(&f, rac, 0, &pagepool, false);
- (void)z_erofs_collector_end(&f.clt);
+ (void)z_erofs_collector_end(&f);
z_erofs_runqueue(inode->i_sb, &f, &pagepool,
z_erofs_get_sync_decompress_policy(sbi, nr_pages));
diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c
index 18d7fd1a5064..572f0b8151ba 100644
--- a/fs/erofs/zmap.c
+++ b/fs/erofs/zmap.c
@@ -431,48 +431,47 @@ static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m,
unsigned int lookback_distance)
{
struct erofs_inode *const vi = EROFS_I(m->inode);
- struct erofs_map_blocks *const map = m->map;
const unsigned int lclusterbits = vi->z_logical_clusterbits;
- unsigned long lcn = m->lcn;
- int err;
- if (lcn < lookback_distance) {
- erofs_err(m->inode->i_sb,
- "bogus lookback distance @ nid %llu", vi->nid);
- DBG_BUGON(1);
- return -EFSCORRUPTED;
- }
+ while (m->lcn >= lookback_distance) {
+ unsigned long lcn = m->lcn - lookback_distance;
+ int err;
- /* load extent head logical cluster if needed */
- lcn -= lookback_distance;
- err = z_erofs_load_cluster_from_disk(m, lcn, false);
- if (err)
- return err;
+ /* load extent head logical cluster if needed */
+ err = z_erofs_load_cluster_from_disk(m, lcn, false);
+ if (err)
+ return err;
- switch (m->type) {
- case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD:
- if (!m->delta[0]) {
+ switch (m->type) {
+ case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD:
+ if (!m->delta[0]) {
+ erofs_err(m->inode->i_sb,
+ "invalid lookback distance 0 @ nid %llu",
+ vi->nid);
+ DBG_BUGON(1);
+ return -EFSCORRUPTED;
+ }
+ lookback_distance = m->delta[0];
+ continue;
+ case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN:
+ case Z_EROFS_VLE_CLUSTER_TYPE_HEAD1:
+ case Z_EROFS_VLE_CLUSTER_TYPE_HEAD2:
+ m->headtype = m->type;
+ m->map->m_la = (lcn << lclusterbits) | m->clusterofs;
+ return 0;
+ default:
erofs_err(m->inode->i_sb,
- "invalid lookback distance 0 @ nid %llu",
- vi->nid);
+ "unknown type %u @ lcn %lu of nid %llu",
+ m->type, lcn, vi->nid);
DBG_BUGON(1);
- return -EFSCORRUPTED;
+ return -EOPNOTSUPP;
}
- return z_erofs_extent_lookback(m, m->delta[0]);
- case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN:
- case Z_EROFS_VLE_CLUSTER_TYPE_HEAD1:
- case Z_EROFS_VLE_CLUSTER_TYPE_HEAD2:
- m->headtype = m->type;
- map->m_la = (lcn << lclusterbits) | m->clusterofs;
- break;
- default:
- erofs_err(m->inode->i_sb,
- "unknown type %u @ lcn %lu of nid %llu",
- m->type, lcn, vi->nid);
- DBG_BUGON(1);
- return -EOPNOTSUPP;
}
- return 0;
+
+ erofs_err(m->inode->i_sb, "bogus lookback distance @ nid %llu",
+ vi->nid);
+ DBG_BUGON(1);
+ return -EFSCORRUPTED;
}
static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m,
@@ -494,7 +493,7 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m,
!(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1)) ||
((m->headtype == Z_EROFS_VLE_CLUSTER_TYPE_HEAD2) &&
!(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_2))) {
- map->m_plen = 1 << lclusterbits;
+ map->m_plen = 1ULL << lclusterbits;
return 0;
}
lcn = m->lcn + 1;
@@ -540,7 +539,7 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m,
return -EFSCORRUPTED;
}
out:
- map->m_plen = m->compressedlcs << lclusterbits;
+ map->m_plen = (u64)m->compressedlcs << lclusterbits;
return 0;
err_bonus_cblkcnt:
erofs_err(m->inode->i_sb,
@@ -630,6 +629,13 @@ static int z_erofs_do_map_blocks(struct inode *inode,
if (endoff >= m.clusterofs) {
m.headtype = m.type;
map->m_la = (m.lcn << lclusterbits) | m.clusterofs;
+ /*
+ * For ztailpacking files, in order to inline data more
+ * effectively, special EOF lclusters are now supported
+ * which can have three parts at most.
+ */
+ if (ztailpacking && end > inode->i_size)
+ end = inode->i_size;
break;
}
/* m.lcn should be >= 1 if endoff < m.clusterofs */
diff --git a/fs/exec.c b/fs/exec.c
index 79f2c9483302..a39108c1190a 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -118,7 +118,7 @@ bool path_noexec(const struct path *path)
* Note that a shared library must be both readable and executable due to
* security reasons.
*
- * Also note that we take the address to load from from the file itself.
+ * Also note that we take the address to load from the file itself.
*/
SYSCALL_DEFINE1(uselib, const char __user *, library)
{
@@ -495,8 +495,14 @@ static int bprm_stack_limits(struct linux_binprm *bprm)
* the stack. They aren't stored until much later when we can't
* signal to the parent that the child has run out of stack space.
* Instead, calculate it here so it's possible to fail gracefully.
+ *
+ * In the case of argc = 0, make sure there is space for adding a
+ * empty string (which will bump argc to 1), to ensure confused
+ * userspace programs don't start processing from argv[1], thinking
+ * argc can never be 0, to keep them from walking envp by accident.
+ * See do_execveat_common().
*/
- ptr_size = (bprm->argc + bprm->envc) * sizeof(void *);
+ ptr_size = (max(bprm->argc, 1) + bprm->envc) * sizeof(void *);
if (limit <= ptr_size)
return -E2BIG;
limit -= ptr_size;
@@ -536,7 +542,7 @@ static int copy_strings(int argc, struct user_arg_ptr argv,
if (!valid_arg_len(bprm, len))
goto out;
- /* We're going to work our way backwords. */
+ /* We're going to work our way backwards. */
pos = bprm->p;
str += len;
bprm->p -= len;
@@ -1269,7 +1275,7 @@ int begin_new_exec(struct linux_binprm * bprm)
/*
* Must be called _before_ exec_mmap() as bprm->mm is
- * not visibile until then. This also enables the update
+ * not visible until then. This also enables the update
* to be lockless.
*/
retval = set_mm_exe_file(bprm->mm, bprm->file);
@@ -1303,12 +1309,6 @@ int begin_new_exec(struct linux_binprm * bprm)
if (retval)
goto out_unlock;
- /*
- * Ensure that the uaccess routines can actually operate on userspace
- * pointers:
- */
- force_uaccess_begin();
-
if (me->flags & PF_KTHREAD)
free_kthread_struct(me);
me->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD |
@@ -1897,6 +1897,9 @@ static int do_execveat_common(int fd, struct filename *filename,
}
retval = count(argv, MAX_ARG_STRINGS);
+ if (retval == 0)
+ pr_warn_once("process '%s' launched '%s' with NULL argv: empty string added\n",
+ current->comm, bprm->filename);
if (retval < 0)
goto out_free;
bprm->argc = retval;
@@ -1923,6 +1926,19 @@ static int do_execveat_common(int fd, struct filename *filename,
if (retval < 0)
goto out_free;
+ /*
+ * When argv is empty, add an empty string ("") as argv[0] to
+ * ensure confused userspace programs that start processing
+ * from argv[1] won't end up walking envp. See also
+ * bprm_stack_limits().
+ */
+ if (bprm->argc == 0) {
+ retval = copy_string_kernel("", bprm);
+ if (retval < 0)
+ goto out_free;
+ bprm->argc = 1;
+ }
+
retval = bprm_execve(bprm, fd, filename, flags);
out_free:
free_bprm(bprm);
@@ -1951,6 +1967,8 @@ int kernel_execve(const char *kernel_filename,
}
retval = count_strings_kernel(argv);
+ if (WARN_ON_ONCE(retval == 0))
+ retval = -EINVAL;
if (retval < 0)
goto out_free;
bprm->argc = retval;
diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c
index df805bd05508..fc0ea1684880 100644
--- a/fs/exfat/inode.c
+++ b/fs/exfat/inode.c
@@ -490,7 +490,8 @@ int exfat_block_truncate_page(struct inode *inode, loff_t from)
}
static const struct address_space_operations exfat_aops = {
- .set_page_dirty = __set_page_dirty_buffers,
+ .dirty_folio = block_dirty_folio,
+ .invalidate_folio = block_invalidate_folio,
.readpage = exfat_readpage,
.readahead = exfat_readahead,
.writepage = exfat_writepage,
diff --git a/fs/exfat/super.c b/fs/exfat/super.c
index 8c9fb7dcec16..9f892903419a 100644
--- a/fs/exfat/super.c
+++ b/fs/exfat/super.c
@@ -183,7 +183,7 @@ static struct inode *exfat_alloc_inode(struct super_block *sb)
{
struct exfat_inode_info *ei;
- ei = kmem_cache_alloc(exfat_inode_cachep, GFP_NOFS);
+ ei = alloc_inode_sb(sb, exfat_inode_cachep, GFP_NOFS);
if (!ei)
return NULL;
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index df14e750e9fe..998dd2ac8008 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -170,11 +170,6 @@ static void ext2_preread_inode(struct inode *inode)
unsigned long offset;
unsigned long block;
struct ext2_group_desc * gdp;
- struct backing_dev_info *bdi;
-
- bdi = inode_to_bdi(inode);
- if (bdi_rw_congested(bdi))
- return;
block_group = (inode->i_ino - 1) / EXT2_INODES_PER_GROUP(inode->i_sb);
gdp = ext2_get_group_desc(inode->i_sb, block_group, NULL);
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 602578b72d8c..52377a0ee735 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -967,7 +967,8 @@ ext2_dax_writepages(struct address_space *mapping, struct writeback_control *wbc
}
const struct address_space_operations ext2_aops = {
- .set_page_dirty = __set_page_dirty_buffers,
+ .dirty_folio = block_dirty_folio,
+ .invalidate_folio = block_invalidate_folio,
.readpage = ext2_readpage,
.readahead = ext2_readahead,
.writepage = ext2_writepage,
@@ -982,7 +983,8 @@ const struct address_space_operations ext2_aops = {
};
const struct address_space_operations ext2_nobh_aops = {
- .set_page_dirty = __set_page_dirty_buffers,
+ .dirty_folio = block_dirty_folio,
+ .invalidate_folio = block_invalidate_folio,
.readpage = ext2_readpage,
.readahead = ext2_readahead,
.writepage = ext2_nobh_writepage,
@@ -998,8 +1000,7 @@ const struct address_space_operations ext2_nobh_aops = {
static const struct address_space_operations ext2_dax_aops = {
.writepages = ext2_dax_writepages,
.direct_IO = noop_direct_IO,
- .set_page_dirty = __set_page_dirty_no_writeback,
- .invalidatepage = noop_invalidatepage,
+ .dirty_folio = noop_dirty_folio,
};
/*
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 94f1fbd7d3ac..f6a19f6d9f6d 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -180,7 +180,7 @@ static struct kmem_cache * ext2_inode_cachep;
static struct inode *ext2_alloc_inode(struct super_block *sb)
{
struct ext2_inode_info *ei;
- ei = kmem_cache_alloc(ext2_inode_cachep, GFP_KERNEL);
+ ei = alloc_inode_sb(sb, ext2_inode_cachep, GFP_KERNEL);
if (!ei)
return NULL;
ei->i_block_alloc_info = NULL;
@@ -753,8 +753,12 @@ static loff_t ext2_max_size(int bits)
res += 1LL << (bits-2);
res += 1LL << (2*(bits-2));
res += 1LL << (3*(bits-2));
+ /* Compute how many metadata blocks are needed */
+ meta_blocks = 1;
+ meta_blocks += 1 + ppb;
+ meta_blocks += 1 + ppb + ppb * ppb;
/* Does block tree limit file size? */
- if (res < upper_limit)
+ if (res + meta_blocks <= upper_limit)
goto check_lfs;
res = upper_limit;
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index 5a35768d6149..57e82e25f8e2 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -139,7 +139,7 @@ fail:
/*
* Inode operation get_posix_acl().
*
- * inode->i_mutex: don't care
+ * inode->i_rwsem: don't care
*/
struct posix_acl *
ext4_get_acl(struct inode *inode, int type, bool rcu)
@@ -183,7 +183,7 @@ ext4_get_acl(struct inode *inode, int type, bool rcu)
/*
* Set the access or default ACL of an inode.
*
- * inode->i_mutex: down unless called from ext4_new_inode
+ * inode->i_rwsem: down unless called from ext4_new_inode
*/
static int
__ext4_set_acl(handle_t *handle, struct inode *inode, int type,
@@ -271,8 +271,8 @@ out_stop:
/*
* Initialize the ACLs of a new inode. Called from ext4_new_inode.
*
- * dir->i_mutex: down
- * inode->i_mutex: up (access to inode is still exclusive)
+ * dir->i_rwsem: down
+ * inode->i_rwsem: up (access to inode is still exclusive)
*/
int
ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index a0fb0c4bdc7c..78ee3ef795ae 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -411,6 +411,7 @@ verified:
* ext4_read_block_bitmap_nowait()
* @sb: super block
* @block_group: given block group
+ * @ignore_locked: ignore locked buffers
*
* Read the bitmap for a given block_group,and validate the
* bits for block/inode/inode tables are set in the bitmaps
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 4666b55b736e..5504f72bbbbe 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -292,15 +292,10 @@ void ext4_release_system_zone(struct super_block *sb)
call_rcu(&system_blks->rcu, ext4_destroy_system_zone);
}
-/*
- * Returns 1 if the passed-in block region (start_blk,
- * start_blk+count) is valid; 0 if some part of the block region
- * overlaps with some other filesystem metadata blocks.
- */
-int ext4_inode_block_valid(struct inode *inode, ext4_fsblk_t start_blk,
- unsigned int count)
+int ext4_sb_block_valid(struct super_block *sb, struct inode *inode,
+ ext4_fsblk_t start_blk, unsigned int count)
{
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_system_blocks *system_blks;
struct ext4_system_zone *entry;
struct rb_node *n;
@@ -329,7 +324,9 @@ int ext4_inode_block_valid(struct inode *inode, ext4_fsblk_t start_blk,
else if (start_blk >= (entry->start_blk + entry->count))
n = n->rb_right;
else {
- ret = (entry->ino == inode->i_ino);
+ ret = 0;
+ if (inode)
+ ret = (entry->ino == inode->i_ino);
break;
}
}
@@ -338,6 +335,17 @@ out_rcu:
return ret;
}
+/*
+ * Returns 1 if the passed-in block region (start_blk,
+ * start_blk+count) is valid; 0 if some part of the block region
+ * overlaps with some other filesystem metadata blocks.
+ */
+int ext4_inode_block_valid(struct inode *inode, ext4_fsblk_t start_blk,
+ unsigned int count)
+{
+ return ext4_sb_block_valid(inode->i_sb, inode, start_blk, count);
+}
+
int ext4_check_blockref(const char *function, unsigned int line,
struct inode *inode, __le32 *p, unsigned int max)
{
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 71a3cdceaa03..3f87cca49f0c 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1028,7 +1028,7 @@ struct ext4_inode_info {
/*
* Extended attributes can be read independently of the main file
- * data. Taking i_mutex even when reading would cause contention
+ * data. Taking i_rwsem even when reading would cause contention
* between readers of EAs and writers of regular file data, so
* instead we synchronize on xattr_sem when reading or changing
* EAs.
@@ -1046,6 +1046,8 @@ struct ext4_inode_info {
/* Fast commit related info */
+ /* For tracking dentry create updates */
+ struct list_head i_fc_dilist;
struct list_head i_fc_list; /*
* inodes that need fast commit
* protected by sbi->s_fc_lock.
@@ -1279,7 +1281,7 @@ struct ext4_inode_info {
#define ext4_find_next_zero_bit find_next_zero_bit_le
#define ext4_find_next_bit find_next_bit_le
-extern void ext4_set_bits(void *bm, int cur, int len);
+extern void mb_set_bits(void *bm, int cur, int len);
/*
* Maximal mount counts between two filesystem checks
@@ -1750,6 +1752,7 @@ struct ext4_sb_info {
spinlock_t s_fc_lock;
struct buffer_head *s_fc_bh;
struct ext4_fc_stats s_fc_stats;
+ tid_t s_fc_ineligible_tid;
#ifdef CONFIG_EXT4_DEBUG
int s_fc_debug_max_replay;
#endif
@@ -1795,10 +1798,7 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
enum {
EXT4_MF_MNTDIR_SAMPLED,
EXT4_MF_FS_ABORTED, /* Fatal error detected */
- EXT4_MF_FC_INELIGIBLE, /* Fast commit ineligible */
- EXT4_MF_FC_COMMITTING /* File system underoing a fast
- * commit.
- */
+ EXT4_MF_FC_INELIGIBLE /* Fast commit ineligible */
};
static inline void ext4_set_mount_flag(struct super_block *sb, int bit)
@@ -2485,7 +2485,7 @@ struct ext4_filename {
#ifdef CONFIG_FS_ENCRYPTION
struct fscrypt_str crypto_buf;
#endif
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
struct fscrypt_str cf_name;
#endif
};
@@ -2721,7 +2721,7 @@ extern unsigned ext4_free_clusters_after_init(struct super_block *sb,
struct ext4_group_desc *gdp);
ext4_fsblk_t ext4_inode_to_goal_block(struct inode *);
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
extern int ext4_fname_setup_ci_filename(struct inode *dir,
const struct qstr *iname,
struct ext4_filename *fname);
@@ -2754,7 +2754,7 @@ static inline int ext4_fname_setup_filename(struct inode *dir,
ext4_fname_from_fscrypt_name(fname, &name);
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
err = ext4_fname_setup_ci_filename(dir, iname, fname);
#endif
return err;
@@ -2773,7 +2773,7 @@ static inline int ext4_fname_prepare_lookup(struct inode *dir,
ext4_fname_from_fscrypt_name(fname, &name);
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
err = ext4_fname_setup_ci_filename(dir, &dentry->d_name, fname);
#endif
return err;
@@ -2790,7 +2790,7 @@ static inline void ext4_fname_free_filename(struct ext4_filename *fname)
fname->usr_fname = NULL;
fname->disk_name.name = NULL;
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
kfree(fname->cf_name.name);
fname->cf_name.name = NULL;
#endif
@@ -2806,7 +2806,7 @@ static inline int ext4_fname_setup_filename(struct inode *dir,
fname->disk_name.name = (unsigned char *) iname->name;
fname->disk_name.len = iname->len;
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
err = ext4_fname_setup_ci_filename(dir, iname, fname);
#endif
@@ -2822,7 +2822,7 @@ static inline int ext4_fname_prepare_lookup(struct inode *dir,
static inline void ext4_fname_free_filename(struct ext4_filename *fname)
{
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
kfree(fname->cf_name.name);
fname->cf_name.name = NULL;
#endif
@@ -2926,7 +2926,7 @@ void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
struct dentry *dentry);
void ext4_fc_track_create(handle_t *handle, struct dentry *dentry);
void ext4_fc_track_inode(handle_t *handle, struct inode *inode);
-void ext4_fc_mark_ineligible(struct super_block *sb, int reason);
+void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle);
void ext4_fc_start_update(struct inode *inode);
void ext4_fc_stop_update(struct inode *inode);
void ext4_fc_del(struct inode *inode);
@@ -2935,6 +2935,9 @@ void ext4_fc_replay_cleanup(struct super_block *sb);
int ext4_fc_commit(journal_t *journal, tid_t commit_tid);
int __init ext4_fc_init_dentry_cache(void);
void ext4_fc_destroy_dentry_cache(void);
+int ext4_fc_record_regions(struct super_block *sb, int ino,
+ ext4_lblk_t lblk, ext4_fsblk_t pblk,
+ int len, int replay);
/* mballoc.c */
extern const struct seq_operations ext4_mb_seq_groups_ops;
@@ -3407,7 +3410,7 @@ do { \
#define EXT4_FREECLUSTERS_WATERMARK 0
#endif
-/* Update i_disksize. Requires i_mutex to avoid races with truncate */
+/* Update i_disksize. Requires i_rwsem to avoid races with truncate */
static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize)
{
WARN_ON_ONCE(S_ISREG(inode->i_mode) &&
@@ -3418,7 +3421,7 @@ static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize)
up_write(&EXT4_I(inode)->i_data_sem);
}
-/* Update i_size, i_disksize. Requires i_mutex to avoid races with truncate */
+/* Update i_size, i_disksize. Requires i_rwsem to avoid races with truncate */
static inline int ext4_update_inode_size(struct inode *inode, loff_t newsize)
{
int changed = 0;
@@ -3706,6 +3709,9 @@ extern int ext4_inode_block_valid(struct inode *inode,
unsigned int count);
extern int ext4_check_blockref(const char *, unsigned int,
struct inode *, __le32 *, unsigned int);
+extern int ext4_sb_block_valid(struct super_block *sb, struct inode *inode,
+ ext4_fsblk_t start_blk, unsigned int count);
+
/* extents.c */
struct ext4_ext_path;
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 0e4fa644df01..db2ae4a2b38d 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -491,7 +491,7 @@ static inline int ext4_free_data_revoke_credits(struct inode *inode, int blocks)
/*
* This function controls whether or not we should try to go down the
* dioread_nolock code paths, which makes it safe to avoid taking
- * i_mutex for direct I/O reads. This only works for extent-based
+ * i_rwsem for direct I/O reads. This only works for extent-based
* files, and it doesn't work if data journaling is enabled, since the
* dioread_nolock code uses b_private to pass information back to the
* I/O completion handler, and this conflicts with the jbd's use of
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 74c91da585d7..0d98cf402282 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -97,7 +97,7 @@ static int ext4_ext_trunc_restart_fn(struct inode *inode, int *dropped)
* Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this
* moment, get_block can be called only for blocks inside i_size since
* page cache has been already dropped and writes are blocked by
- * i_mutex. So we can safely drop the i_data_sem here.
+ * i_rwsem. So we can safely drop the i_data_sem here.
*/
BUG_ON(EXT4_JOURNAL(inode) == NULL);
ext4_discard_preallocations(inode, 0);
@@ -3368,7 +3368,6 @@ static int ext4_split_extent(handle_t *handle,
return -EFSCORRUPTED;
}
unwritten = ext4_ext_is_unwritten(ex);
- split_flag1 = 0;
if (map->m_lblk >= ee_block) {
split_flag1 = split_flag & EXT4_EXT_DATA_VALID2;
@@ -4572,7 +4571,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,
flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT;
- /* Wait all existing dio workers, newcomers will block on i_mutex */
+ /* Wait all existing dio workers, newcomers will block on i_rwsem */
inode_dio_wait(inode);
/* Preallocate the range including the unaligned edges */
@@ -4738,7 +4737,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
goto out;
}
- /* Wait all existing dio workers, newcomers will block on i_mutex */
+ /* Wait all existing dio workers, newcomers will block on i_rwsem */
inode_dio_wait(inode);
ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags);
@@ -5334,7 +5333,7 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
ret = PTR_ERR(handle);
goto out_mmap;
}
- ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE);
+ ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE, handle);
down_write(&EXT4_I(inode)->i_data_sem);
ext4_discard_preallocations(inode, 0);
@@ -5474,7 +5473,7 @@ static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
ret = PTR_ERR(handle);
goto out_mmap;
}
- ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE);
+ ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE, handle);
/* Expand file to avoid data loss if there is error while shifting */
inode->i_size += len;
@@ -5571,7 +5570,7 @@ out_mutex:
* stuff such as page-cache locking consistency, bh mapping consistency or
* extent's data copying must be performed by caller.
* Locking:
- * i_mutex is held for both inodes
+ * i_rwsem is held for both inodes
* i_data_sem is locked for write for both inodes
* Assumptions:
* All pages from requested range are locked for both inodes
@@ -6091,11 +6090,15 @@ int ext4_ext_clear_bb(struct inode *inode)
ext4_mb_mark_bb(inode->i_sb,
path[j].p_block, 1, 0);
+ ext4_fc_record_regions(inode->i_sb, inode->i_ino,
+ 0, path[j].p_block, 1, 1);
}
ext4_ext_drop_refs(path);
kfree(path);
}
ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
+ ext4_fc_record_regions(inode->i_sb, inode->i_ino,
+ map.m_lblk, map.m_pblk, map.m_len, 1);
}
cur = cur + map.m_len;
}
diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index 5ae8026a0c56..3d72565ec6e8 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -199,6 +199,7 @@ void ext4_fc_init_inode(struct inode *inode)
ext4_fc_reset_inode(inode);
ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
INIT_LIST_HEAD(&ei->i_fc_list);
+ INIT_LIST_HEAD(&ei->i_fc_dilist);
init_waitqueue_head(&ei->i_fc_wait);
atomic_set(&ei->i_fc_updates, 0);
}
@@ -279,6 +280,8 @@ void ext4_fc_stop_update(struct inode *inode)
void ext4_fc_del(struct inode *inode)
{
struct ext4_inode_info *ei = EXT4_I(inode);
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ struct ext4_fc_dentry_update *fc_dentry;
if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
@@ -286,7 +289,7 @@ void ext4_fc_del(struct inode *inode)
restart:
spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
- if (list_empty(&ei->i_fc_list)) {
+ if (list_empty(&ei->i_fc_list) && list_empty(&ei->i_fc_dilist)) {
spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
return;
}
@@ -295,23 +298,62 @@ restart:
ext4_fc_wait_committing_inode(inode);
goto restart;
}
- list_del_init(&ei->i_fc_list);
- spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
+
+ if (!list_empty(&ei->i_fc_list))
+ list_del_init(&ei->i_fc_list);
+
+ /*
+ * Since this inode is getting removed, let's also remove all FC
+ * dentry create references, since it is not needed to log it anyways.
+ */
+ if (list_empty(&ei->i_fc_dilist)) {
+ spin_unlock(&sbi->s_fc_lock);
+ return;
+ }
+
+ fc_dentry = list_first_entry(&ei->i_fc_dilist, struct ext4_fc_dentry_update, fcd_dilist);
+ WARN_ON(fc_dentry->fcd_op != EXT4_FC_TAG_CREAT);
+ list_del_init(&fc_dentry->fcd_list);
+ list_del_init(&fc_dentry->fcd_dilist);
+
+ WARN_ON(!list_empty(&ei->i_fc_dilist));
+ spin_unlock(&sbi->s_fc_lock);
+
+ if (fc_dentry->fcd_name.name &&
+ fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
+ kfree(fc_dentry->fcd_name.name);
+ kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
+
+ return;
}
/*
- * Mark file system as fast commit ineligible. This means that next commit
- * operation would result in a full jbd2 commit.
+ * Mark file system as fast commit ineligible, and record latest
+ * ineligible transaction tid. This means until the recorded
+ * transaction, commit operation would result in a full jbd2 commit.
*/
-void ext4_fc_mark_ineligible(struct super_block *sb, int reason)
+void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
+ tid_t tid;
if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
return;
ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
+ if (handle && !IS_ERR(handle))
+ tid = handle->h_transaction->t_tid;
+ else {
+ read_lock(&sbi->s_journal->j_state_lock);
+ tid = sbi->s_journal->j_running_transaction ?
+ sbi->s_journal->j_running_transaction->t_tid : 0;
+ read_unlock(&sbi->s_journal->j_state_lock);
+ }
+ spin_lock(&sbi->s_fc_lock);
+ if (sbi->s_fc_ineligible_tid < tid)
+ sbi->s_fc_ineligible_tid = tid;
+ spin_unlock(&sbi->s_fc_lock);
WARN_ON(reason >= EXT4_FC_REASON_MAX);
sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
}
@@ -337,13 +379,6 @@ static int ext4_fc_track_template(
tid_t tid = 0;
int ret;
- if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
- (sbi->s_mount_state & EXT4_FC_REPLAY))
- return -EOPNOTSUPP;
-
- if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
- return -EINVAL;
-
tid = handle->h_transaction->t_tid;
mutex_lock(&ei->i_fc_lock);
if (tid == ei->i_sync_tid) {
@@ -361,7 +396,8 @@ static int ext4_fc_track_template(
spin_lock(&sbi->s_fc_lock);
if (list_empty(&EXT4_I(inode)->i_fc_list))
list_add_tail(&EXT4_I(inode)->i_fc_list,
- (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING)) ?
+ (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
+ sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) ?
&sbi->s_fc_q[FC_Q_STAGING] :
&sbi->s_fc_q[FC_Q_MAIN]);
spin_unlock(&sbi->s_fc_lock);
@@ -387,7 +423,7 @@ static int __track_dentry_update(struct inode *inode, void *arg, bool update)
mutex_unlock(&ei->i_fc_lock);
node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
if (!node) {
- ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM);
+ ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM, NULL);
mutex_lock(&ei->i_fc_lock);
return -ENOMEM;
}
@@ -400,7 +436,7 @@ static int __track_dentry_update(struct inode *inode, void *arg, bool update)
if (!node->fcd_name.name) {
kmem_cache_free(ext4_fc_dentry_cachep, node);
ext4_fc_mark_ineligible(inode->i_sb,
- EXT4_FC_REASON_NOMEM);
+ EXT4_FC_REASON_NOMEM, NULL);
mutex_lock(&ei->i_fc_lock);
return -ENOMEM;
}
@@ -412,13 +448,28 @@ static int __track_dentry_update(struct inode *inode, void *arg, bool update)
node->fcd_name.name = node->fcd_iname;
}
node->fcd_name.len = dentry->d_name.len;
-
+ INIT_LIST_HEAD(&node->fcd_dilist);
spin_lock(&sbi->s_fc_lock);
- if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING))
+ if (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
+ sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING)
list_add_tail(&node->fcd_list,
&sbi->s_fc_dentry_q[FC_Q_STAGING]);
else
list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]);
+
+ /*
+ * This helps us keep a track of all fc_dentry updates which is part of
+ * this ext4 inode. So in case the inode is getting unlinked, before
+ * even we get a chance to fsync, we could remove all fc_dentry
+ * references while evicting the inode in ext4_fc_del().
+ * Also with this, we don't need to loop over all the inodes in
+ * sbi->s_fc_q to get the corresponding inode in
+ * ext4_fc_commit_dentry_updates().
+ */
+ if (dentry_update->op == EXT4_FC_TAG_CREAT) {
+ WARN_ON(!list_empty(&ei->i_fc_dilist));
+ list_add_tail(&node->fcd_dilist, &ei->i_fc_dilist);
+ }
spin_unlock(&sbi->s_fc_lock);
mutex_lock(&ei->i_fc_lock);
@@ -436,12 +487,22 @@ void __ext4_fc_track_unlink(handle_t *handle,
ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
(void *)&args, 0);
- trace_ext4_fc_track_unlink(inode, dentry, ret);
+ trace_ext4_fc_track_unlink(handle, inode, dentry, ret);
}
void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
{
- __ext4_fc_track_unlink(handle, d_inode(dentry), dentry);
+ struct inode *inode = d_inode(dentry);
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+
+ if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
+ (sbi->s_mount_state & EXT4_FC_REPLAY))
+ return;
+
+ if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
+ return;
+
+ __ext4_fc_track_unlink(handle, inode, dentry);
}
void __ext4_fc_track_link(handle_t *handle,
@@ -455,12 +516,22 @@ void __ext4_fc_track_link(handle_t *handle,
ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
(void *)&args, 0);
- trace_ext4_fc_track_link(inode, dentry, ret);
+ trace_ext4_fc_track_link(handle, inode, dentry, ret);
}
void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
{
- __ext4_fc_track_link(handle, d_inode(dentry), dentry);
+ struct inode *inode = d_inode(dentry);
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+
+ if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
+ (sbi->s_mount_state & EXT4_FC_REPLAY))
+ return;
+
+ if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
+ return;
+
+ __ext4_fc_track_link(handle, inode, dentry);
}
void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
@@ -474,12 +545,22 @@ void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
(void *)&args, 0);
- trace_ext4_fc_track_create(inode, dentry, ret);
+ trace_ext4_fc_track_create(handle, inode, dentry, ret);
}
void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
{
- __ext4_fc_track_create(handle, d_inode(dentry), dentry);
+ struct inode *inode = d_inode(dentry);
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+
+ if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
+ (sbi->s_mount_state & EXT4_FC_REPLAY))
+ return;
+
+ if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
+ return;
+
+ __ext4_fc_track_create(handle, inode, dentry);
}
/* __track_fn for inode tracking */
@@ -495,6 +576,7 @@ static int __track_inode(struct inode *inode, void *arg, bool update)
void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
int ret;
if (S_ISDIR(inode->i_mode))
@@ -502,12 +584,19 @@ void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
if (ext4_should_journal_data(inode)) {
ext4_fc_mark_ineligible(inode->i_sb,
- EXT4_FC_REASON_INODE_JOURNAL_DATA);
+ EXT4_FC_REASON_INODE_JOURNAL_DATA, handle);
return;
}
+ if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
+ (sbi->s_mount_state & EXT4_FC_REPLAY))
+ return;
+
+ if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
+ return;
+
ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
- trace_ext4_fc_track_inode(inode, ret);
+ trace_ext4_fc_track_inode(handle, inode, ret);
}
struct __track_range_args {
@@ -545,18 +634,26 @@ static int __track_range(struct inode *inode, void *arg, bool update)
void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
ext4_lblk_t end)
{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct __track_range_args args;
int ret;
if (S_ISDIR(inode->i_mode))
return;
+ if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
+ (sbi->s_mount_state & EXT4_FC_REPLAY))
+ return;
+
+ if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
+ return;
+
args.start = start;
args.end = end;
ret = ext4_fc_track_template(handle, inode, __track_range, &args, 1);
- trace_ext4_fc_track_range(inode, start, end, ret);
+ trace_ext4_fc_track_range(handle, inode, start, end, ret);
}
static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail)
@@ -879,7 +976,6 @@ static int ext4_fc_submit_inode_data_all(journal_t *journal)
int ret = 0;
spin_lock(&sbi->s_fc_lock);
- ext4_set_mount_flag(sb, EXT4_MF_FC_COMMITTING);
list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
while (atomic_read(&ei->i_fc_updates)) {
@@ -939,7 +1035,7 @@ __releases(&sbi->s_fc_lock)
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_fc_dentry_update *fc_dentry, *fc_dentry_n;
struct inode *inode;
- struct ext4_inode_info *ei, *ei_n;
+ struct ext4_inode_info *ei;
int ret;
if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
@@ -955,21 +1051,16 @@ __releases(&sbi->s_fc_lock)
spin_lock(&sbi->s_fc_lock);
continue;
}
-
- inode = NULL;
- list_for_each_entry_safe(ei, ei_n, &sbi->s_fc_q[FC_Q_MAIN],
- i_fc_list) {
- if (ei->vfs_inode.i_ino == fc_dentry->fcd_ino) {
- inode = &ei->vfs_inode;
- break;
- }
- }
/*
- * If we don't find inode in our list, then it was deleted,
- * in which case, we don't need to record it's create tag.
+ * With fcd_dilist we need not loop in sbi->s_fc_q to get the
+ * corresponding inode pointer
*/
- if (!inode)
- continue;
+ WARN_ON(list_empty(&fc_dentry->fcd_dilist));
+ ei = list_first_entry(&fc_dentry->fcd_dilist,
+ struct ext4_inode_info, i_fc_dilist);
+ inode = &ei->vfs_inode;
+ WARN_ON(inode->i_ino != fc_dentry->fcd_ino);
+
spin_unlock(&sbi->s_fc_lock);
/*
@@ -1073,11 +1164,12 @@ out:
}
static void ext4_fc_update_stats(struct super_block *sb, int status,
- u64 commit_time, int nblks)
+ u64 commit_time, int nblks, tid_t commit_tid)
{
struct ext4_fc_stats *stats = &EXT4_SB(sb)->s_fc_stats;
- jbd_debug(1, "Fast commit ended with status = %d", status);
+ jbd_debug(1, "Fast commit ended with status = %d for tid %u",
+ status, commit_tid);
if (status == EXT4_FC_STATUS_OK) {
stats->fc_num_commits++;
stats->fc_numblks += nblks;
@@ -1095,7 +1187,7 @@ static void ext4_fc_update_stats(struct super_block *sb, int status,
} else {
stats->fc_skipped_commits++;
}
- trace_ext4_fc_commit_stop(sb, nblks, status);
+ trace_ext4_fc_commit_stop(sb, nblks, status, commit_tid);
}
/*
@@ -1113,13 +1205,13 @@ int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
int status = EXT4_FC_STATUS_OK, fc_bufs_before = 0;
ktime_t start_time, commit_time;
- trace_ext4_fc_commit_start(sb);
-
- start_time = ktime_get();
-
if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
return jbd2_complete_transaction(journal, commit_tid);
+ trace_ext4_fc_commit_start(sb, commit_tid);
+
+ start_time = ktime_get();
+
restart_fc:
ret = jbd2_fc_begin_commit(journal, commit_tid);
if (ret == -EALREADY) {
@@ -1127,14 +1219,16 @@ restart_fc:
if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
commit_tid > journal->j_commit_sequence)
goto restart_fc;
- ext4_fc_update_stats(sb, EXT4_FC_STATUS_SKIPPED, 0, 0);
+ ext4_fc_update_stats(sb, EXT4_FC_STATUS_SKIPPED, 0, 0,
+ commit_tid);
return 0;
} else if (ret) {
/*
* Commit couldn't start. Just update stats and perform a
* full commit.
*/
- ext4_fc_update_stats(sb, EXT4_FC_STATUS_FAILED, 0, 0);
+ ext4_fc_update_stats(sb, EXT4_FC_STATUS_FAILED, 0, 0,
+ commit_tid);
return jbd2_complete_transaction(journal, commit_tid);
}
@@ -1166,12 +1260,12 @@ restart_fc:
* don't react too strongly to vast changes in the commit time
*/
commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
- ext4_fc_update_stats(sb, status, commit_time, nblks);
+ ext4_fc_update_stats(sb, status, commit_time, nblks, commit_tid);
return ret;
fallback:
ret = jbd2_fc_end_commit_fallback(journal);
- ext4_fc_update_stats(sb, status, 0, 0);
+ ext4_fc_update_stats(sb, status, 0, 0, commit_tid);
return ret;
}
@@ -1179,7 +1273,7 @@ fallback:
* Fast commit cleanup routine. This is called after every fast commit and
* full commit. full is true if we are called after a full commit.
*/
-static void ext4_fc_cleanup(journal_t *journal, int full)
+static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
{
struct super_block *sb = journal->j_private;
struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -1189,6 +1283,7 @@ static void ext4_fc_cleanup(journal_t *journal, int full)
if (full && sbi->s_fc_bh)
sbi->s_fc_bh = NULL;
+ trace_ext4_fc_cleanup(journal, full, tid);
jbd2_fc_release_bufs(journal);
spin_lock(&sbi->s_fc_lock);
@@ -1197,7 +1292,8 @@ static void ext4_fc_cleanup(journal_t *journal, int full)
list_del_init(&iter->i_fc_list);
ext4_clear_inode_state(&iter->vfs_inode,
EXT4_STATE_FC_COMMITTING);
- ext4_fc_reset_inode(&iter->vfs_inode);
+ if (iter->i_sync_tid <= tid)
+ ext4_fc_reset_inode(&iter->vfs_inode);
/* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
smp_mb();
#if (BITS_PER_LONG < 64)
@@ -1212,6 +1308,7 @@ static void ext4_fc_cleanup(journal_t *journal, int full)
struct ext4_fc_dentry_update,
fcd_list);
list_del_init(&fc_dentry->fcd_list);
+ list_del_init(&fc_dentry->fcd_dilist);
spin_unlock(&sbi->s_fc_lock);
if (fc_dentry->fcd_name.name &&
@@ -1226,8 +1323,10 @@ static void ext4_fc_cleanup(journal_t *journal, int full)
list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
&sbi->s_fc_q[FC_Q_MAIN]);
- ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING);
- ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
+ if (tid >= sbi->s_fc_ineligible_tid) {
+ sbi->s_fc_ineligible_tid = 0;
+ ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
+ }
if (full)
sbi->s_fc_bytes = 0;
@@ -1392,14 +1491,15 @@ static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
if (state->fc_modified_inodes[i] == ino)
return 0;
if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
- state->fc_modified_inodes_size +=
- EXT4_FC_REPLAY_REALLOC_INCREMENT;
state->fc_modified_inodes = krealloc(
- state->fc_modified_inodes, sizeof(int) *
- state->fc_modified_inodes_size,
- GFP_KERNEL);
+ state->fc_modified_inodes,
+ sizeof(int) * (state->fc_modified_inodes_size +
+ EXT4_FC_REPLAY_REALLOC_INCREMENT),
+ GFP_KERNEL);
if (!state->fc_modified_inodes)
return -ENOMEM;
+ state->fc_modified_inodes_size +=
+ EXT4_FC_REPLAY_REALLOC_INCREMENT;
}
state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
return 0;
@@ -1431,7 +1531,9 @@ static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl,
}
inode = NULL;
- ext4_fc_record_modified_inode(sb, ino);
+ ret = ext4_fc_record_modified_inode(sb, ino);
+ if (ret)
+ goto out;
raw_fc_inode = (struct ext4_inode *)
(val + offsetof(struct ext4_fc_inode, fc_raw_inode));
@@ -1563,16 +1665,23 @@ out:
}
/*
- * Record physical disk regions which are in use as per fast commit area. Our
- * simple replay phase allocator excludes these regions from allocation.
+ * Record physical disk regions which are in use as per fast commit area,
+ * and used by inodes during replay phase. Our simple replay phase
+ * allocator excludes these regions from allocation.
*/
-static int ext4_fc_record_regions(struct super_block *sb, int ino,
- ext4_lblk_t lblk, ext4_fsblk_t pblk, int len)
+int ext4_fc_record_regions(struct super_block *sb, int ino,
+ ext4_lblk_t lblk, ext4_fsblk_t pblk, int len, int replay)
{
struct ext4_fc_replay_state *state;
struct ext4_fc_alloc_region *region;
state = &EXT4_SB(sb)->s_fc_replay_state;
+ /*
+ * during replay phase, the fc_regions_valid may not same as
+ * fc_regions_used, update it when do new additions.
+ */
+ if (replay && state->fc_regions_used != state->fc_regions_valid)
+ state->fc_regions_used = state->fc_regions_valid;
if (state->fc_regions_used == state->fc_regions_size) {
state->fc_regions_size +=
EXT4_FC_REPLAY_REALLOC_INCREMENT;
@@ -1590,6 +1699,9 @@ static int ext4_fc_record_regions(struct super_block *sb, int ino,
region->pblk = pblk;
region->len = len;
+ if (replay)
+ state->fc_regions_valid++;
+
return 0;
}
@@ -1621,6 +1733,8 @@ static int ext4_fc_replay_add_range(struct super_block *sb,
}
ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
+ if (ret)
+ goto out;
start = le32_to_cpu(ex->ee_block);
start_pblk = ext4_ext_pblock(ex);
@@ -1638,18 +1752,14 @@ static int ext4_fc_replay_add_range(struct super_block *sb,
map.m_pblk = 0;
ret = ext4_map_blocks(NULL, inode, &map, 0);
- if (ret < 0) {
- iput(inode);
- return 0;
- }
+ if (ret < 0)
+ goto out;
if (ret == 0) {
/* Range is not mapped */
path = ext4_find_extent(inode, cur, NULL, 0);
- if (IS_ERR(path)) {
- iput(inode);
- return 0;
- }
+ if (IS_ERR(path))
+ goto out;
memset(&newex, 0, sizeof(newex));
newex.ee_block = cpu_to_le32(cur);
ext4_ext_store_pblock(
@@ -1663,10 +1773,8 @@ static int ext4_fc_replay_add_range(struct super_block *sb,
up_write((&EXT4_I(inode)->i_data_sem));
ext4_ext_drop_refs(path);
kfree(path);
- if (ret) {
- iput(inode);
- return 0;
- }
+ if (ret)
+ goto out;
goto next;
}
@@ -1679,10 +1787,8 @@ static int ext4_fc_replay_add_range(struct super_block *sb,
ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
ext4_ext_is_unwritten(ex),
start_pblk + cur - start);
- if (ret) {
- iput(inode);
- return 0;
- }
+ if (ret)
+ goto out;
/*
* Mark the old blocks as free since they aren't used
* anymore. We maintain an array of all the modified
@@ -1702,10 +1808,8 @@ static int ext4_fc_replay_add_range(struct super_block *sb,
ext4_ext_is_unwritten(ex), map.m_pblk);
ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
ext4_ext_is_unwritten(ex), map.m_pblk);
- if (ret) {
- iput(inode);
- return 0;
- }
+ if (ret)
+ goto out;
/*
* We may have split the extent tree while toggling the state.
* Try to shrink the extent tree now.
@@ -1717,6 +1821,7 @@ next:
}
ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
sb->s_blocksize_bits);
+out:
iput(inode);
return 0;
}
@@ -1746,6 +1851,8 @@ ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl,
}
ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
+ if (ret)
+ goto out;
jbd_debug(1, "DEL_RANGE, inode %ld, lblk %d, len %d\n",
inode->i_ino, le32_to_cpu(lrange.fc_lblk),
@@ -1755,10 +1862,8 @@ ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl,
map.m_len = remaining;
ret = ext4_map_blocks(NULL, inode, &map, 0);
- if (ret < 0) {
- iput(inode);
- return 0;
- }
+ if (ret < 0)
+ goto out;
if (ret > 0) {
remaining -= ret;
cur += ret;
@@ -1770,18 +1875,17 @@ ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl,
}
down_write(&EXT4_I(inode)->i_data_sem);
- ret = ext4_ext_remove_space(inode, lrange.fc_lblk,
- lrange.fc_lblk + lrange.fc_len - 1);
+ ret = ext4_ext_remove_space(inode, le32_to_cpu(lrange.fc_lblk),
+ le32_to_cpu(lrange.fc_lblk) +
+ le32_to_cpu(lrange.fc_len) - 1);
up_write(&EXT4_I(inode)->i_data_sem);
- if (ret) {
- iput(inode);
- return 0;
- }
+ if (ret)
+ goto out;
ext4_ext_replay_shrink_inode(inode,
i_size_read(inode) >> sb->s_blocksize_bits);
ext4_mark_inode_dirty(NULL, inode);
+out:
iput(inode);
-
return 0;
}
@@ -1852,8 +1956,8 @@ bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
if (state->fc_regions[i].ino == 0 ||
state->fc_regions[i].len == 0)
continue;
- if (blk >= state->fc_regions[i].pblk &&
- blk < state->fc_regions[i].pblk + state->fc_regions[i].len)
+ if (in_range(blk, state->fc_regions[i].pblk,
+ state->fc_regions[i].len))
return true;
}
return false;
@@ -1937,7 +2041,7 @@ static int ext4_fc_replay_scan(journal_t *journal,
ret = ext4_fc_record_regions(sb,
le32_to_cpu(ext.fc_ino),
le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
- ext4_ext_get_actual_len(ex));
+ ext4_ext_get_actual_len(ex), 0);
if (ret < 0)
break;
ret = JBD2_FC_REPLAY_CONTINUE;
diff --git a/fs/ext4/fast_commit.h b/fs/ext4/fast_commit.h
index 083ad1cb705a..1db12847a83b 100644
--- a/fs/ext4/fast_commit.h
+++ b/fs/ext4/fast_commit.h
@@ -55,13 +55,13 @@ struct ext4_fc_del_range {
struct ext4_fc_dentry_info {
__le32 fc_parent_ino;
__le32 fc_ino;
- __u8 fc_dname[0];
+ __u8 fc_dname[];
};
/* Value structure for EXT4_FC_TAG_INODE and EXT4_FC_TAG_INODE_PARTIAL. */
struct ext4_fc_inode {
__le32 fc_ino;
- __u8 fc_raw_inode[0];
+ __u8 fc_raw_inode[];
};
/* Value structure for tag EXT4_FC_TAG_TAIL. */
@@ -93,7 +93,6 @@ enum {
EXT4_FC_REASON_RENAME_DIR,
EXT4_FC_REASON_FALLOC_RANGE,
EXT4_FC_REASON_INODE_JOURNAL_DATA,
- EXT4_FC_COMMIT_FAILED,
EXT4_FC_REASON_MAX
};
@@ -109,6 +108,7 @@ struct ext4_fc_dentry_update {
struct qstr fcd_name; /* Dirent name */
unsigned char fcd_iname[DNAME_INLINE_LEN]; /* Dirent name string */
struct list_head fcd_list;
+ struct list_head fcd_dilist;
};
struct ext4_fc_stats {
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 8cc11715518a..8bd66cdc41be 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -36,9 +36,11 @@
#include "acl.h"
#include "truncate.h"
-static bool ext4_dio_supported(struct inode *inode)
+static bool ext4_dio_supported(struct kiocb *iocb, struct iov_iter *iter)
{
- if (IS_ENABLED(CONFIG_FS_ENCRYPTION) && IS_ENCRYPTED(inode))
+ struct inode *inode = file_inode(iocb->ki_filp);
+
+ if (!fscrypt_dio_supported(iocb, iter))
return false;
if (fsverity_active(inode))
return false;
@@ -61,7 +63,7 @@ static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
inode_lock_shared(inode);
}
- if (!ext4_dio_supported(inode)) {
+ if (!ext4_dio_supported(iocb, to)) {
inode_unlock_shared(inode);
/*
* Fallback to buffered I/O if the operation being performed on
@@ -509,7 +511,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
}
/* Fallback to buffered I/O if the inode does not support direct I/O. */
- if (!ext4_dio_supported(inode)) {
+ if (!ext4_dio_supported(iocb, from)) {
if (ilock_shared)
inode_unlock_shared(inode);
else
diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c
index f34f4176c1e7..147b5241dd94 100644
--- a/fs/ext4/hash.c
+++ b/fs/ext4/hash.c
@@ -290,7 +290,7 @@ static int __ext4fs_dirhash(const struct inode *dir, const char *name, int len,
int ext4fs_dirhash(const struct inode *dir, const char *name, int len,
struct dx_hash_info *hinfo)
{
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
const struct unicode_map *um = dir->i_sb->s_encoding;
int r, dlen;
unsigned char *buff;
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 89efa78ed4b2..07a8c75b65ed 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -696,7 +696,7 @@ static int ext4_ind_trunc_restart_fn(handle_t *handle, struct inode *inode,
* Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this
* moment, get_block can be called only for blocks inside i_size since
* page cache has been already dropped and writes are blocked by
- * i_mutex. So we can safely drop the i_data_sem here.
+ * i_rwsem. So we can safely drop the i_data_sem here.
*/
BUG_ON(EXT4_JOURNAL(inode) == NULL);
ext4_discard_preallocations(inode, 0);
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 635bcf68a67e..9c076262770d 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -911,7 +911,7 @@ int ext4_da_write_inline_data_begin(struct address_space *mapping,
struct page **pagep,
void **fsdata)
{
- int ret, inline_size;
+ int ret;
handle_t *handle;
struct page *page;
struct ext4_iloc iloc;
@@ -928,14 +928,9 @@ retry_journal:
goto out;
}
- inline_size = ext4_get_max_inline_size(inode);
-
- ret = -ENOSPC;
- if (inline_size >= pos + len) {
- ret = ext4_prepare_inline_data(handle, inode, pos + len);
- if (ret && ret != -ENOSPC)
- goto out_journal;
- }
+ ret = ext4_prepare_inline_data(handle, inode, pos + len);
+ if (ret && ret != -ENOSPC)
+ goto out_journal;
/*
* We cannot recurse into the filesystem as the transaction
@@ -1133,7 +1128,15 @@ static void ext4_restore_inline_data(handle_t *handle, struct inode *inode,
struct ext4_iloc *iloc,
void *buf, int inline_size)
{
- ext4_create_inline_data(handle, inode, inline_size);
+ int ret;
+
+ ret = ext4_create_inline_data(handle, inode, inline_size);
+ if (ret) {
+ ext4_msg(inode->i_sb, KERN_EMERG,
+ "error restoring inline_data for inode -- potential data loss! (inode %lu, error %d)",
+ inode->i_ino, ret);
+ return;
+ }
ext4_write_inline_data(inode, iloc, buf, 0, inline_size);
ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
}
@@ -1780,19 +1783,20 @@ bool empty_inline_dir(struct inode *dir, int *has_inline_data)
void *inline_pos;
unsigned int offset;
struct ext4_dir_entry_2 *de;
- bool ret = true;
+ bool ret = false;
err = ext4_get_inode_loc(dir, &iloc);
if (err) {
EXT4_ERROR_INODE_ERR(dir, -err,
"error %d getting inode %lu block",
err, dir->i_ino);
- return true;
+ return false;
}
down_read(&EXT4_I(dir)->xattr_sem);
if (!ext4_has_inline_data(dir)) {
*has_inline_data = 0;
+ ret = true;
goto out;
}
@@ -1801,7 +1805,6 @@ bool empty_inline_dir(struct inode *dir, int *has_inline_data)
ext4_warning(dir->i_sb,
"bad inline directory (dir #%lu) - no `..'",
dir->i_ino);
- ret = true;
goto out;
}
@@ -1820,16 +1823,15 @@ bool empty_inline_dir(struct inode *dir, int *has_inline_data)
dir->i_ino, le32_to_cpu(de->inode),
le16_to_cpu(de->rec_len), de->name_len,
inline_size);
- ret = true;
goto out;
}
if (le32_to_cpu(de->inode)) {
- ret = false;
goto out;
}
offset += ext4_rec_len_from_disk(de->rec_len, inline_size);
}
+ ret = true;
out:
up_read(&EXT4_I(dir)->xattr_sem);
brelse(iloc.bh);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 5f79d265d06a..1ce13f69fbec 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -137,8 +137,6 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode,
new_size);
}
-static void ext4_invalidatepage(struct page *page, unsigned int offset,
- unsigned int length);
static int __ext4_journalled_writepage(struct page *page, unsigned int len);
static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
int pextents);
@@ -186,7 +184,7 @@ void ext4_evict_inode(struct inode *inode)
* journal. So although mm thinks everything is clean and
* ready for reaping the inode might still have some pages to
* write in the running transaction or waiting to be
- * checkpointed. Thus calling jbd2_journal_invalidatepage()
+ * checkpointed. Thus calling jbd2_journal_invalidate_folio()
* (via truncate_inode_pages()) to discard these buffers can
* cause data loss. Also even if we did not discard these
* buffers, we would have no way to find them after the inode
@@ -338,7 +336,7 @@ stop_handle:
return;
no_delete:
if (!list_empty(&EXT4_I(inode)->i_fc_list))
- ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM);
+ ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM, NULL);
ext4_clear_inode(inode); /* We must guarantee clearing of inode... */
}
@@ -1224,7 +1222,7 @@ retry_journal:
/*
* __block_write_begin may have instantiated a few blocks
* outside i_size. Trim these off again. Don't need
- * i_size_read because we hold i_mutex.
+ * i_size_read because we hold i_rwsem.
*
* Add inode to orphan list in case we crash before
* truncate finishes
@@ -1571,16 +1569,18 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd,
break;
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
+ struct folio *folio = page_folio(page);
- BUG_ON(!PageLocked(page));
- BUG_ON(PageWriteback(page));
+ BUG_ON(!folio_test_locked(folio));
+ BUG_ON(folio_test_writeback(folio));
if (invalidate) {
- if (page_mapped(page))
- clear_page_dirty_for_io(page);
- block_invalidatepage(page, 0, PAGE_SIZE);
- ClearPageUptodate(page);
+ if (folio_mapped(folio))
+ folio_clear_dirty_for_io(folio);
+ block_invalidate_folio(folio, 0,
+ folio_size(folio));
+ folio_clear_uptodate(folio);
}
- unlock_page(page);
+ folio_unlock(folio);
}
pagevec_release(&pvec);
}
@@ -1971,6 +1971,7 @@ out_no_pagelock:
static int ext4_writepage(struct page *page,
struct writeback_control *wbc)
{
+ struct folio *folio = page_folio(page);
int ret = 0;
loff_t size;
unsigned int len;
@@ -1980,8 +1981,8 @@ static int ext4_writepage(struct page *page,
bool keep_towrite = false;
if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) {
- inode->i_mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE);
- unlock_page(page);
+ folio_invalidate(folio, 0, folio_size(folio));
+ folio_unlock(folio);
return -EIO;
}
@@ -1993,6 +1994,15 @@ static int ext4_writepage(struct page *page,
else
len = PAGE_SIZE;
+ /* Should never happen but for bugs in other kernel subsystems */
+ if (!page_has_buffers(page)) {
+ ext4_warning_inode(inode,
+ "page %lu does not have buffers attached", page->index);
+ ClearPageDirty(page);
+ unlock_page(page);
+ return 0;
+ }
+
page_bufs = page_buffers(page);
/*
* We cannot do block allocation or other extent handling in this
@@ -2594,6 +2604,22 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
wait_on_page_writeback(page);
BUG_ON(PageWriteback(page));
+ /*
+ * Should never happen but for buggy code in
+ * other subsystems that call
+ * set_page_dirty() without properly warning
+ * the file system first. See [1] for more
+ * information.
+ *
+ * [1] https://lore.kernel.org/linux-mm/20180103100430.GE4911@quack2.suse.cz
+ */
+ if (!page_has_buffers(page)) {
+ ext4_warning_inode(mpd->inode, "page %lu does not have buffers attached", page->index);
+ ClearPageDirty(page);
+ unlock_page(page);
+ continue;
+ }
+
if (mpd->map.m_len == 0)
mpd->first_page = page->index;
mpd->next_page = page->index + 1;
@@ -3182,40 +3208,39 @@ static void ext4_readahead(struct readahead_control *rac)
ext4_mpage_readpages(inode, rac, NULL);
}
-static void ext4_invalidatepage(struct page *page, unsigned int offset,
- unsigned int length)
+static void ext4_invalidate_folio(struct folio *folio, size_t offset,
+ size_t length)
{
- trace_ext4_invalidatepage(page, offset, length);
+ trace_ext4_invalidate_folio(folio, offset, length);
/* No journalling happens on data buffers when this function is used */
- WARN_ON(page_has_buffers(page) && buffer_jbd(page_buffers(page)));
+ WARN_ON(folio_buffers(folio) && buffer_jbd(folio_buffers(folio)));
- block_invalidatepage(page, offset, length);
+ block_invalidate_folio(folio, offset, length);
}
-static int __ext4_journalled_invalidatepage(struct page *page,
- unsigned int offset,
- unsigned int length)
+static int __ext4_journalled_invalidate_folio(struct folio *folio,
+ size_t offset, size_t length)
{
- journal_t *journal = EXT4_JOURNAL(page->mapping->host);
+ journal_t *journal = EXT4_JOURNAL(folio->mapping->host);
- trace_ext4_journalled_invalidatepage(page, offset, length);
+ trace_ext4_journalled_invalidate_folio(folio, offset, length);
/*
* If it's a full truncate we just forget about the pending dirtying
*/
- if (offset == 0 && length == PAGE_SIZE)
- ClearPageChecked(page);
+ if (offset == 0 && length == folio_size(folio))
+ folio_clear_checked(folio);
- return jbd2_journal_invalidatepage(journal, page, offset, length);
+ return jbd2_journal_invalidate_folio(journal, folio, offset, length);
}
/* Wrapper for aops... */
-static void ext4_journalled_invalidatepage(struct page *page,
- unsigned int offset,
- unsigned int length)
+static void ext4_journalled_invalidate_folio(struct folio *folio,
+ size_t offset,
+ size_t length)
{
- WARN_ON(__ext4_journalled_invalidatepage(page, offset, length) < 0);
+ WARN_ON(__ext4_journalled_invalidate_folio(folio, offset, length) < 0);
}
static int ext4_releasepage(struct page *page, gfp_t wait)
@@ -3409,6 +3434,13 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
if (ret < 0)
return ret;
out:
+ /*
+ * When inline encryption is enabled, sometimes I/O to an encrypted file
+ * has to be broken up to guarantee DUN contiguity. Handle this by
+ * limiting the length of the mapping returned.
+ */
+ map.m_len = fscrypt_limit_io_blocks(inode, map.m_lblk, map.m_len);
+
ext4_set_iomap(inode, iomap, &map, offset, length, flags);
return 0;
@@ -3541,29 +3573,32 @@ const struct iomap_ops ext4_iomap_report_ops = {
};
/*
- * Pages can be marked dirty completely asynchronously from ext4's journalling
- * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do
- * much here because ->set_page_dirty is called under VFS locks. The page is
- * not necessarily locked.
+ * Whenever the folio is being dirtied, corresponding buffers should already
+ * be attached to the transaction (we take care of this in ext4_page_mkwrite()
+ * and ext4_write_begin()). However we cannot move buffers to dirty transaction
+ * lists here because ->dirty_folio is called under VFS locks and the folio
+ * is not necessarily locked.
*
- * We cannot just dirty the page and leave attached buffers clean, because the
+ * We cannot just dirty the folio and leave attached buffers clean, because the
* buffers' dirty state is "definitive". We cannot just set the buffers dirty
* or jbddirty because all the journalling code will explode.
*
- * So what we do is to mark the page "pending dirty" and next time writepage
+ * So what we do is to mark the folio "pending dirty" and next time writepage
* is called, propagate that into the buffers appropriately.
*/
-static int ext4_journalled_set_page_dirty(struct page *page)
+static bool ext4_journalled_dirty_folio(struct address_space *mapping,
+ struct folio *folio)
{
- SetPageChecked(page);
- return __set_page_dirty_nobuffers(page);
+ WARN_ON_ONCE(!page_has_buffers(&folio->page));
+ folio_set_checked(folio);
+ return filemap_dirty_folio(mapping, folio);
}
-static int ext4_set_page_dirty(struct page *page)
+static bool ext4_dirty_folio(struct address_space *mapping, struct folio *folio)
{
- WARN_ON_ONCE(!PageLocked(page) && !PageDirty(page));
- WARN_ON_ONCE(!page_has_buffers(page));
- return __set_page_dirty_buffers(page);
+ WARN_ON_ONCE(!folio_test_locked(folio) && !folio_test_dirty(folio));
+ WARN_ON_ONCE(!folio_buffers(folio));
+ return block_dirty_folio(mapping, folio);
}
static int ext4_iomap_swap_activate(struct swap_info_struct *sis,
@@ -3580,9 +3615,9 @@ static const struct address_space_operations ext4_aops = {
.writepages = ext4_writepages,
.write_begin = ext4_write_begin,
.write_end = ext4_write_end,
- .set_page_dirty = ext4_set_page_dirty,
+ .dirty_folio = ext4_dirty_folio,
.bmap = ext4_bmap,
- .invalidatepage = ext4_invalidatepage,
+ .invalidate_folio = ext4_invalidate_folio,
.releasepage = ext4_releasepage,
.direct_IO = noop_direct_IO,
.migratepage = buffer_migrate_page,
@@ -3598,9 +3633,9 @@ static const struct address_space_operations ext4_journalled_aops = {
.writepages = ext4_writepages,
.write_begin = ext4_write_begin,
.write_end = ext4_journalled_write_end,
- .set_page_dirty = ext4_journalled_set_page_dirty,
+ .dirty_folio = ext4_journalled_dirty_folio,
.bmap = ext4_bmap,
- .invalidatepage = ext4_journalled_invalidatepage,
+ .invalidate_folio = ext4_journalled_invalidate_folio,
.releasepage = ext4_releasepage,
.direct_IO = noop_direct_IO,
.is_partially_uptodate = block_is_partially_uptodate,
@@ -3615,9 +3650,9 @@ static const struct address_space_operations ext4_da_aops = {
.writepages = ext4_writepages,
.write_begin = ext4_da_write_begin,
.write_end = ext4_da_write_end,
- .set_page_dirty = ext4_set_page_dirty,
+ .dirty_folio = ext4_dirty_folio,
.bmap = ext4_bmap,
- .invalidatepage = ext4_invalidatepage,
+ .invalidate_folio = ext4_invalidate_folio,
.releasepage = ext4_releasepage,
.direct_IO = noop_direct_IO,
.migratepage = buffer_migrate_page,
@@ -3629,9 +3664,8 @@ static const struct address_space_operations ext4_da_aops = {
static const struct address_space_operations ext4_dax_aops = {
.writepages = ext4_dax_writepages,
.direct_IO = noop_direct_IO,
- .set_page_dirty = __set_page_dirty_no_writeback,
+ .dirty_folio = noop_dirty_folio,
.bmap = ext4_bmap,
- .invalidatepage = noop_invalidatepage,
.swap_activate = ext4_iomap_swap_activate,
};
@@ -3979,7 +4013,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
}
- /* Wait all existing dio workers, newcomers will block on i_mutex */
+ /* Wait all existing dio workers, newcomers will block on i_rwsem */
inode_dio_wait(inode);
/*
@@ -4129,7 +4163,7 @@ int ext4_truncate(struct inode *inode)
/*
* There is a possibility that we're either freeing the inode
* or it's a completely new inode. In those cases we might not
- * have i_mutex locked because it's not necessary.
+ * have i_rwsem locked because it's not necessary.
*/
if (!(inode->i_state & (I_NEW|I_FREEING)))
WARN_ON(!inode_is_locked(inode));
@@ -5204,13 +5238,12 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
}
/*
- * In data=journal mode ext4_journalled_invalidatepage() may fail to invalidate
- * buffers that are attached to a page stradding i_size and are undergoing
+ * In data=journal mode ext4_journalled_invalidate_folio() may fail to invalidate
+ * buffers that are attached to a folio straddling i_size and are undergoing
* commit. In that case we have to wait for commit to finish and try again.
*/
static void ext4_wait_for_tail_page_commit(struct inode *inode)
{
- struct page *page;
unsigned offset;
journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
tid_t commit_tid = 0;
@@ -5218,25 +5251,25 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode)
offset = inode->i_size & (PAGE_SIZE - 1);
/*
- * If the page is fully truncated, we don't need to wait for any commit
- * (and we even should not as __ext4_journalled_invalidatepage() may
- * strip all buffers from the page but keep the page dirty which can then
- * confuse e.g. concurrent ext4_writepage() seeing dirty page without
+ * If the folio is fully truncated, we don't need to wait for any commit
+ * (and we even should not as __ext4_journalled_invalidate_folio() may
+ * strip all buffers from the folio but keep the folio dirty which can then
+ * confuse e.g. concurrent ext4_writepage() seeing dirty folio without
* buffers). Also we don't need to wait for any commit if all buffers in
- * the page remain valid. This is most beneficial for the common case of
+ * the folio remain valid. This is most beneficial for the common case of
* blocksize == PAGESIZE.
*/
if (!offset || offset > (PAGE_SIZE - i_blocksize(inode)))
return;
while (1) {
- page = find_lock_page(inode->i_mapping,
+ struct folio *folio = filemap_lock_folio(inode->i_mapping,
inode->i_size >> PAGE_SHIFT);
- if (!page)
+ if (!folio)
return;
- ret = __ext4_journalled_invalidatepage(page, offset,
- PAGE_SIZE - offset);
- unlock_page(page);
- put_page(page);
+ ret = __ext4_journalled_invalidate_folio(folio, offset,
+ folio_size(folio) - offset);
+ folio_unlock(folio);
+ folio_put(folio);
if (ret != -EBUSY)
return;
commit_tid = 0;
@@ -5271,7 +5304,7 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode)
* transaction are already on disk (truncate waits for pages under
* writeback).
*
- * Called with inode->i_mutex down.
+ * Called with inode->i_rwsem down.
*/
int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
struct iattr *attr)
@@ -5983,7 +6016,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
return PTR_ERR(handle);
ext4_fc_mark_ineligible(inode->i_sb,
- EXT4_FC_REASON_JOURNAL_FLAG_CHANGE);
+ EXT4_FC_REASON_JOURNAL_FLAG_CHANGE, handle);
err = ext4_mark_inode_dirty(handle, inode);
ext4_handle_sync(handle);
ext4_journal_stop(handle);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index bbbedf27b71c..992229ca2d83 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -269,7 +269,7 @@ out:
return err ? err : 0;
}
-/**
+/*
* Swap memory between @a and @b for @len bytes.
*
* @a: pointer to first memory area
@@ -290,7 +290,7 @@ static void memswap(void *a, void *b, size_t len)
}
}
-/**
+/*
* Swap i_data and associated attributes between @inode1 and @inode2.
* This function is used for the primary swap between inode1 and inode2
* and also to revert this primary swap in case of errors.
@@ -344,7 +344,7 @@ void ext4_reset_inode_seed(struct inode *inode)
ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen, sizeof(gen));
}
-/**
+/*
* Swap the information from the given @inode and the inode
* EXT4_BOOT_LOADER_INO. It will basically swap i_data and all other
* important fields of the inodes.
@@ -411,7 +411,7 @@ static long swap_inode_boot_loader(struct super_block *sb,
err = -EINVAL;
goto err_out;
}
- ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_SWAP_BOOT);
+ ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_SWAP_BOOT, handle);
/* Protect extent tree against block allocations via delalloc */
ext4_double_down_write_data_sem(inode, inode_bl);
@@ -1373,7 +1373,7 @@ mext_out:
err = ext4_resize_fs(sb, n_blocks_count);
if (EXT4_SB(sb)->s_journal) {
- ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_RESIZE);
+ ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_RESIZE, NULL);
jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal, 0);
jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 9f86dd947032..252c168454c7 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1000,7 +1000,7 @@ static inline int should_optimize_scan(struct ext4_allocation_context *ac)
return 0;
if (ac->ac_criteria >= 2)
return 0;
- if (ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS))
+ if (!ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS))
return 0;
return 1;
}
@@ -1689,7 +1689,7 @@ static int mb_test_and_clear_bits(void *bm, int cur, int len)
return zero_bit;
}
-void ext4_set_bits(void *bm, int cur, int len)
+void mb_set_bits(void *bm, int cur, int len)
{
__u32 *addr;
@@ -1996,7 +1996,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info);
mb_update_avg_fragment_size(e4b->bd_sb, e4b->bd_info);
- ext4_set_bits(e4b->bd_bitmap, ex->fe_start, len0);
+ mb_set_bits(e4b->bd_bitmap, ex->fe_start, len0);
mb_check_buddy(e4b);
return ret;
@@ -3825,7 +3825,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
* We leak some of the blocks here.
*/
ext4_lock_group(sb, ac->ac_b_ex.fe_group);
- ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
+ mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
ac->ac_b_ex.fe_len);
ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
@@ -3844,7 +3844,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
}
}
#endif
- ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
+ mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
ac->ac_b_ex.fe_len);
if (ext4_has_group_desc_csum(sb) &&
(gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
@@ -3899,69 +3899,103 @@ void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block,
struct ext4_sb_info *sbi = EXT4_SB(sb);
ext4_group_t group;
ext4_grpblk_t blkoff;
- int i, clen, err;
+ int i, err;
int already;
+ unsigned int clen, clen_changed, thisgrp_len;
- clen = EXT4_B2C(sbi, len);
+ while (len > 0) {
+ ext4_get_group_no_and_offset(sb, block, &group, &blkoff);
- ext4_get_group_no_and_offset(sb, block, &group, &blkoff);
- bitmap_bh = ext4_read_block_bitmap(sb, group);
- if (IS_ERR(bitmap_bh)) {
- err = PTR_ERR(bitmap_bh);
- bitmap_bh = NULL;
- goto out_err;
- }
+ /*
+ * Check to see if we are freeing blocks across a group
+ * boundary.
+ * In case of flex_bg, this can happen that (block, len) may
+ * span across more than one group. In that case we need to
+ * get the corresponding group metadata to work with.
+ * For this we have goto again loop.
+ */
+ thisgrp_len = min_t(unsigned int, (unsigned int)len,
+ EXT4_BLOCKS_PER_GROUP(sb) - EXT4_C2B(sbi, blkoff));
+ clen = EXT4_NUM_B2C(sbi, thisgrp_len);
+
+ if (!ext4_sb_block_valid(sb, NULL, block, thisgrp_len)) {
+ ext4_error(sb, "Marking blocks in system zone - "
+ "Block = %llu, len = %u",
+ block, thisgrp_len);
+ bitmap_bh = NULL;
+ break;
+ }
- err = -EIO;
- gdp = ext4_get_group_desc(sb, group, &gdp_bh);
- if (!gdp)
- goto out_err;
+ bitmap_bh = ext4_read_block_bitmap(sb, group);
+ if (IS_ERR(bitmap_bh)) {
+ err = PTR_ERR(bitmap_bh);
+ bitmap_bh = NULL;
+ break;
+ }
- ext4_lock_group(sb, group);
- already = 0;
- for (i = 0; i < clen; i++)
- if (!mb_test_bit(blkoff + i, bitmap_bh->b_data) == !state)
- already++;
+ err = -EIO;
+ gdp = ext4_get_group_desc(sb, group, &gdp_bh);
+ if (!gdp)
+ break;
- if (state)
- ext4_set_bits(bitmap_bh->b_data, blkoff, clen);
- else
- mb_test_and_clear_bits(bitmap_bh->b_data, blkoff, clen);
- if (ext4_has_group_desc_csum(sb) &&
- (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
- gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
- ext4_free_group_clusters_set(sb, gdp,
- ext4_free_clusters_after_init(sb,
- group, gdp));
- }
- if (state)
- clen = ext4_free_group_clusters(sb, gdp) - clen + already;
- else
- clen = ext4_free_group_clusters(sb, gdp) + clen - already;
+ ext4_lock_group(sb, group);
+ already = 0;
+ for (i = 0; i < clen; i++)
+ if (!mb_test_bit(blkoff + i, bitmap_bh->b_data) ==
+ !state)
+ already++;
+
+ clen_changed = clen - already;
+ if (state)
+ mb_set_bits(bitmap_bh->b_data, blkoff, clen);
+ else
+ mb_clear_bits(bitmap_bh->b_data, blkoff, clen);
+ if (ext4_has_group_desc_csum(sb) &&
+ (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
+ gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
+ ext4_free_group_clusters_set(sb, gdp,
+ ext4_free_clusters_after_init(sb, group, gdp));
+ }
+ if (state)
+ clen = ext4_free_group_clusters(sb, gdp) - clen_changed;
+ else
+ clen = ext4_free_group_clusters(sb, gdp) + clen_changed;
- ext4_free_group_clusters_set(sb, gdp, clen);
- ext4_block_bitmap_csum_set(sb, group, gdp, bitmap_bh);
- ext4_group_desc_csum_set(sb, group, gdp);
+ ext4_free_group_clusters_set(sb, gdp, clen);
+ ext4_block_bitmap_csum_set(sb, group, gdp, bitmap_bh);
+ ext4_group_desc_csum_set(sb, group, gdp);
- ext4_unlock_group(sb, group);
+ ext4_unlock_group(sb, group);
- if (sbi->s_log_groups_per_flex) {
- ext4_group_t flex_group = ext4_flex_group(sbi, group);
+ if (sbi->s_log_groups_per_flex) {
+ ext4_group_t flex_group = ext4_flex_group(sbi, group);
+ struct flex_groups *fg = sbi_array_rcu_deref(sbi,
+ s_flex_groups, flex_group);
- atomic64_sub(len,
- &sbi_array_rcu_deref(sbi, s_flex_groups,
- flex_group)->free_clusters);
+ if (state)
+ atomic64_sub(clen_changed, &fg->free_clusters);
+ else
+ atomic64_add(clen_changed, &fg->free_clusters);
+
+ }
+
+ err = ext4_handle_dirty_metadata(NULL, NULL, bitmap_bh);
+ if (err)
+ break;
+ sync_dirty_buffer(bitmap_bh);
+ err = ext4_handle_dirty_metadata(NULL, NULL, gdp_bh);
+ sync_dirty_buffer(gdp_bh);
+ if (err)
+ break;
+
+ block += thisgrp_len;
+ len -= thisgrp_len;
+ brelse(bitmap_bh);
+ BUG_ON(len < 0);
}
- err = ext4_handle_dirty_metadata(NULL, NULL, bitmap_bh);
if (err)
- goto out_err;
- sync_dirty_buffer(bitmap_bh);
- err = ext4_handle_dirty_metadata(NULL, NULL, gdp_bh);
- sync_dirty_buffer(gdp_bh);
-
-out_err:
- brelse(bitmap_bh);
+ brelse(bitmap_bh);
}
/*
@@ -4433,7 +4467,7 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
while (n) {
entry = rb_entry(n, struct ext4_free_data, efd_node);
- ext4_set_bits(bitmap, entry->efd_start_cluster, entry->efd_count);
+ mb_set_bits(bitmap, entry->efd_start_cluster, entry->efd_count);
n = rb_next(n);
}
return;
@@ -4474,7 +4508,7 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
if (unlikely(len == 0))
continue;
BUG_ON(groupnr != group);
- ext4_set_bits(bitmap, start, len);
+ mb_set_bits(bitmap, start, len);
preallocated += len;
}
mb_debug(sb, "preallocated %d for group %u\n", preallocated, group);
@@ -5753,7 +5787,8 @@ static ext4_fsblk_t ext4_mb_new_blocks_simple(handle_t *handle,
struct super_block *sb = ar->inode->i_sb;
ext4_group_t group;
ext4_grpblk_t blkoff;
- int i = sb->s_blocksize;
+ ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb);
+ ext4_grpblk_t i = 0;
ext4_fsblk_t goal, block;
struct ext4_super_block *es = EXT4_SB(sb)->s_es;
@@ -5775,19 +5810,26 @@ static ext4_fsblk_t ext4_mb_new_blocks_simple(handle_t *handle,
ext4_get_group_no_and_offset(sb,
max(ext4_group_first_block_no(sb, group), goal),
NULL, &blkoff);
- i = mb_find_next_zero_bit(bitmap_bh->b_data, sb->s_blocksize,
+ while (1) {
+ i = mb_find_next_zero_bit(bitmap_bh->b_data, max,
blkoff);
+ if (i >= max)
+ break;
+ if (ext4_fc_replay_check_excluded(sb,
+ ext4_group_first_block_no(sb, group) + i)) {
+ blkoff = i + 1;
+ } else
+ break;
+ }
brelse(bitmap_bh);
- if (i >= sb->s_blocksize)
- continue;
- if (ext4_fc_replay_check_excluded(sb,
- ext4_group_first_block_no(sb, group) + i))
- continue;
- break;
+ if (i < max)
+ break;
}
- if (group >= ext4_get_groups_count(sb) && i >= sb->s_blocksize)
+ if (group >= ext4_get_groups_count(sb) || i >= max) {
+ *errp = -ENOSPC;
return 0;
+ }
block = ext4_group_first_block_no(sb, group) + i;
ext4_mb_mark_bb(sb, block, 1, 1);
@@ -5838,17 +5880,17 @@ static void ext4_free_blocks_simple(struct inode *inode, ext4_fsblk_t block,
}
/**
- * ext4_free_blocks() -- Free given blocks and update quota
+ * ext4_mb_clear_bb() -- helper function for freeing blocks.
+ * Used by ext4_free_blocks()
* @handle: handle for this transaction
* @inode: inode
- * @bh: optional buffer of the block to be freed
* @block: starting physical block to be freed
* @count: number of blocks to be freed
* @flags: flags used by ext4_free_blocks
*/
-void ext4_free_blocks(handle_t *handle, struct inode *inode,
- struct buffer_head *bh, ext4_fsblk_t block,
- unsigned long count, int flags)
+static void ext4_mb_clear_bb(handle_t *handle, struct inode *inode,
+ ext4_fsblk_t block, unsigned long count,
+ int flags)
{
struct buffer_head *bitmap_bh = NULL;
struct super_block *sb = inode->i_sb;
@@ -5865,80 +5907,6 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
sbi = EXT4_SB(sb);
- if (sbi->s_mount_state & EXT4_FC_REPLAY) {
- ext4_free_blocks_simple(inode, block, count);
- return;
- }
-
- might_sleep();
- if (bh) {
- if (block)
- BUG_ON(block != bh->b_blocknr);
- else
- block = bh->b_blocknr;
- }
-
- if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
- !ext4_inode_block_valid(inode, block, count)) {
- ext4_error(sb, "Freeing blocks not in datazone - "
- "block = %llu, count = %lu", block, count);
- goto error_return;
- }
-
- ext4_debug("freeing block %llu\n", block);
- trace_ext4_free_blocks(inode, block, count, flags);
-
- if (bh && (flags & EXT4_FREE_BLOCKS_FORGET)) {
- BUG_ON(count > 1);
-
- ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
- inode, bh, block);
- }
-
- /*
- * If the extent to be freed does not begin on a cluster
- * boundary, we need to deal with partial clusters at the
- * beginning and end of the extent. Normally we will free
- * blocks at the beginning or the end unless we are explicitly
- * requested to avoid doing so.
- */
- overflow = EXT4_PBLK_COFF(sbi, block);
- if (overflow) {
- if (flags & EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER) {
- overflow = sbi->s_cluster_ratio - overflow;
- block += overflow;
- if (count > overflow)
- count -= overflow;
- else
- return;
- } else {
- block -= overflow;
- count += overflow;
- }
- }
- overflow = EXT4_LBLK_COFF(sbi, count);
- if (overflow) {
- if (flags & EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER) {
- if (count > overflow)
- count -= overflow;
- else
- return;
- } else
- count += sbi->s_cluster_ratio - overflow;
- }
-
- if (!bh && (flags & EXT4_FREE_BLOCKS_FORGET)) {
- int i;
- int is_metadata = flags & EXT4_FREE_BLOCKS_METADATA;
-
- for (i = 0; i < count; i++) {
- cond_resched();
- if (is_metadata)
- bh = sb_find_get_block(inode->i_sb, block + i);
- ext4_forget(handle, is_metadata, inode, bh, block + i);
- }
- }
-
do_more:
overflow = 0;
ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
@@ -5969,13 +5937,7 @@ do_more:
goto error_return;
}
- if (in_range(ext4_block_bitmap(sb, gdp), block, count) ||
- in_range(ext4_inode_bitmap(sb, gdp), block, count) ||
- in_range(block, ext4_inode_table(sb, gdp),
- sbi->s_itb_per_group) ||
- in_range(block + count - 1, ext4_inode_table(sb, gdp),
- sbi->s_itb_per_group)) {
-
+ if (!ext4_inode_block_valid(inode, block, count)) {
ext4_error(sb, "Freeing blocks in system zone - "
"Block = %llu, count = %lu", block, count);
/* err = 0. ext4_std_error should be a no op */
@@ -6046,7 +6008,7 @@ do_more:
NULL);
if (err && err != -EOPNOTSUPP)
ext4_msg(sb, KERN_WARNING, "discard request in"
- " group:%d block:%d count:%lu failed"
+ " group:%u block:%d count:%lu failed"
" with %d", block_group, bit, count,
err);
} else
@@ -6107,6 +6069,103 @@ error_return:
}
/**
+ * ext4_free_blocks() -- Free given blocks and update quota
+ * @handle: handle for this transaction
+ * @inode: inode
+ * @bh: optional buffer of the block to be freed
+ * @block: starting physical block to be freed
+ * @count: number of blocks to be freed
+ * @flags: flags used by ext4_free_blocks
+ */
+void ext4_free_blocks(handle_t *handle, struct inode *inode,
+ struct buffer_head *bh, ext4_fsblk_t block,
+ unsigned long count, int flags)
+{
+ struct super_block *sb = inode->i_sb;
+ unsigned int overflow;
+ struct ext4_sb_info *sbi;
+
+ sbi = EXT4_SB(sb);
+
+ if (sbi->s_mount_state & EXT4_FC_REPLAY) {
+ ext4_free_blocks_simple(inode, block, count);
+ return;
+ }
+
+ might_sleep();
+ if (bh) {
+ if (block)
+ BUG_ON(block != bh->b_blocknr);
+ else
+ block = bh->b_blocknr;
+ }
+
+ if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
+ !ext4_inode_block_valid(inode, block, count)) {
+ ext4_error(sb, "Freeing blocks not in datazone - "
+ "block = %llu, count = %lu", block, count);
+ return;
+ }
+
+ ext4_debug("freeing block %llu\n", block);
+ trace_ext4_free_blocks(inode, block, count, flags);
+
+ if (bh && (flags & EXT4_FREE_BLOCKS_FORGET)) {
+ BUG_ON(count > 1);
+
+ ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
+ inode, bh, block);
+ }
+
+ /*
+ * If the extent to be freed does not begin on a cluster
+ * boundary, we need to deal with partial clusters at the
+ * beginning and end of the extent. Normally we will free
+ * blocks at the beginning or the end unless we are explicitly
+ * requested to avoid doing so.
+ */
+ overflow = EXT4_PBLK_COFF(sbi, block);
+ if (overflow) {
+ if (flags & EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER) {
+ overflow = sbi->s_cluster_ratio - overflow;
+ block += overflow;
+ if (count > overflow)
+ count -= overflow;
+ else
+ return;
+ } else {
+ block -= overflow;
+ count += overflow;
+ }
+ }
+ overflow = EXT4_LBLK_COFF(sbi, count);
+ if (overflow) {
+ if (flags & EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER) {
+ if (count > overflow)
+ count -= overflow;
+ else
+ return;
+ } else
+ count += sbi->s_cluster_ratio - overflow;
+ }
+
+ if (!bh && (flags & EXT4_FREE_BLOCKS_FORGET)) {
+ int i;
+ int is_metadata = flags & EXT4_FREE_BLOCKS_METADATA;
+
+ for (i = 0; i < count; i++) {
+ cond_resched();
+ if (is_metadata)
+ bh = sb_find_get_block(inode->i_sb, block + i);
+ ext4_forget(handle, is_metadata, inode, bh, block + i);
+ }
+ }
+
+ ext4_mb_clear_bb(handle, inode, block, count, flags);
+ return;
+}
+
+/**
* ext4_group_add_blocks() -- Add given blocks to an existing group
* @handle: handle to this transaction
* @sb: super block
@@ -6162,11 +6221,7 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
goto error_return;
}
- if (in_range(ext4_block_bitmap(sb, desc), block, count) ||
- in_range(ext4_inode_bitmap(sb, desc), block, count) ||
- in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
- in_range(block + count - 1, ext4_inode_table(sb, desc),
- sbi->s_itb_per_group)) {
+ if (!ext4_sb_block_valid(sb, NULL, block, count)) {
ext4_error(sb, "Adding blocks in system zones - "
"Block = %llu, count = %lu",
block, count);
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index ff8916e1d38e..7a5353a8cfd7 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -485,7 +485,7 @@ int ext4_ext_migrate(struct inode *inode)
* when we add extents we extent the journal
*/
/*
- * Even though we take i_mutex we can still cause block
+ * Even though we take i_rwsem we can still cause block
* allocation via mmap write to holes. If we have allocated
* new blocks we fail migrate. New block allocation will
* clear EXT4_STATE_EXT_MIGRATE flag. The flag is updated
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 52c9bd154122..e37da8d5cd0c 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1317,7 +1317,7 @@ static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block)
dx_set_count(entries, count + 1);
}
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
/*
* Test whether a case-insensitive directory entry matches the filename
* being searched for. If quick is set, assume the name being looked up
@@ -1428,7 +1428,7 @@ static bool ext4_match(struct inode *parent,
f.crypto_buf = fname->crypto_buf;
#endif
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
if (parent->i_sb->s_encoding && IS_CASEFOLDED(parent) &&
(!IS_ENCRYPTED(parent) || fscrypt_has_encryption_key(parent))) {
if (fname->cf_name.name) {
@@ -1800,7 +1800,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi
}
}
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
if (!inode && IS_CASEFOLDED(dir)) {
/* Eventually we want to call d_add_ci(dentry, NULL)
* for negative dentries in the encoding case as
@@ -2308,7 +2308,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
if (fscrypt_is_nokey_name(dentry))
return -ENOKEY;
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
if (sb_has_strict_encoding(sb) && IS_CASEFOLDED(dir) &&
sb->s_encoding && utf8_validate(sb->s_encoding, &dentry->d_name))
return -EINVAL;
@@ -2997,14 +2997,14 @@ bool ext4_empty_dir(struct inode *inode)
if (inode->i_size < ext4_dir_rec_len(1, NULL) +
ext4_dir_rec_len(2, NULL)) {
EXT4_ERROR_INODE(inode, "invalid size");
- return true;
+ return false;
}
/* The first directory block must not be a hole,
* so treat it as DIRENT_HTREE
*/
bh = ext4_read_dirblock(inode, 0, DIRENT_HTREE);
if (IS_ERR(bh))
- return true;
+ return false;
de = (struct ext4_dir_entry_2 *) bh->b_data;
if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data, bh->b_size,
@@ -3012,7 +3012,7 @@ bool ext4_empty_dir(struct inode *inode)
le32_to_cpu(de->inode) != inode->i_ino || strcmp(".", de->name)) {
ext4_warning_inode(inode, "directory missing '.'");
brelse(bh);
- return true;
+ return false;
}
offset = ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize);
de = ext4_next_entry(de, sb->s_blocksize);
@@ -3021,7 +3021,7 @@ bool ext4_empty_dir(struct inode *inode)
le32_to_cpu(de->inode) == 0 || strcmp("..", de->name)) {
ext4_warning_inode(inode, "directory missing '..'");
brelse(bh);
- return true;
+ return false;
}
offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize);
while (offset < inode->i_size) {
@@ -3035,7 +3035,7 @@ bool ext4_empty_dir(struct inode *inode)
continue;
}
if (IS_ERR(bh))
- return true;
+ return false;
}
de = (struct ext4_dir_entry_2 *) (bh->b_data +
(offset & (sb->s_blocksize - 1)));
@@ -3126,7 +3126,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
ext4_fc_track_unlink(handle, dentry);
retval = ext4_mark_inode_dirty(handle, dir);
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
/* VFS negative dentries are incompatible with Encoding and
* Case-insensitiveness. Eventually we'll want avoid
* invalidating the dentries here, alongside with returning the
@@ -3231,7 +3231,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
retval = __ext4_unlink(handle, dir, &dentry->d_name, d_inode(dentry));
if (!retval)
ext4_fc_track_unlink(handle, dentry);
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
/* VFS negative dentries are incompatible with Encoding and
* Case-insensitiveness. Eventually we'll want avoid
* invalidating the dentries here, alongside with returning the
@@ -3889,14 +3889,21 @@ static int ext4_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
* dirents in directories.
*/
ext4_fc_mark_ineligible(old.inode->i_sb,
- EXT4_FC_REASON_RENAME_DIR);
+ EXT4_FC_REASON_RENAME_DIR, handle);
} else {
+ struct super_block *sb = old.inode->i_sb;
+
if (new.inode)
ext4_fc_track_unlink(handle, new.dentry);
- __ext4_fc_track_link(handle, old.inode, new.dentry);
- __ext4_fc_track_unlink(handle, old.inode, old.dentry);
- if (whiteout)
- __ext4_fc_track_create(handle, whiteout, old.dentry);
+ if (test_opt2(sb, JOURNAL_FAST_COMMIT) &&
+ !(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY) &&
+ !(ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE))) {
+ __ext4_fc_track_link(handle, old.inode, new.dentry);
+ __ext4_fc_track_unlink(handle, old.inode, old.dentry);
+ if (whiteout)
+ __ext4_fc_track_create(handle, whiteout,
+ old.dentry);
+ }
}
if (new.inode) {
@@ -4049,7 +4056,7 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
if (unlikely(retval))
goto end_rename;
ext4_fc_mark_ineligible(new.inode->i_sb,
- EXT4_FC_REASON_CROSS_RENAME);
+ EXT4_FC_REASON_CROSS_RENAME, handle);
if (old.dir_bh) {
retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino);
if (retval)
diff --git a/fs/ext4/orphan.c b/fs/ext4/orphan.c
index 53adc8f570a3..7de0612eb42d 100644
--- a/fs/ext4/orphan.c
+++ b/fs/ext4/orphan.c
@@ -93,7 +93,7 @@ static int ext4_orphan_file_add(handle_t *handle, struct inode *inode)
* At filesystem recovery time, we walk this list deleting unlinked
* inodes and truncating linked inodes in ext4_orphan_cleanup().
*
- * Orphan list manipulation functions must be called under i_mutex unless
+ * Orphan list manipulation functions must be called under i_rwsem unless
* we are just creating the inode or deleting it.
*/
int ext4_orphan_add(handle_t *handle, struct inode *inode)
@@ -119,7 +119,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
/*
* Orphan handling is only valid for files with data blocks
* being truncated, or files being unlinked. Note that we either
- * hold i_mutex, or the inode can not be referenced from outside,
+ * hold i_rwsem, or the inode can not be referenced from outside,
* so i_nlink should not be bumped due to race
*/
ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 35ada7baf41e..495ce59fb4ad 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -323,10 +323,9 @@ static void ext4_end_bio(struct bio *bio)
{
ext4_io_end_t *io_end = bio->bi_private;
sector_t bi_sector = bio->bi_iter.bi_sector;
- char b[BDEVNAME_SIZE];
- if (WARN_ONCE(!io_end, "io_end is NULL: %s: sector %Lu len %u err %d\n",
- bio_devname(bio, b),
+ if (WARN_ONCE(!io_end, "io_end is NULL: %pg: sector %Lu len %u err %d\n",
+ bio->bi_bdev,
(long long) bio->bi_iter.bi_sector,
(unsigned) bio_sectors(bio),
bio->bi_status)) {
@@ -374,7 +373,6 @@ void ext4_io_submit(struct ext4_io_submit *io)
if (bio) {
if (io->io_wbc->sync_mode == WB_SYNC_ALL)
io->io_bio->bi_opf |= REQ_SYNC;
- io->io_bio->bi_write_hint = io->io_end->inode->i_write_hint;
submit_bio(io->io_bio);
}
io->io_bio = NULL;
@@ -419,10 +417,8 @@ static void io_submit_add_bh(struct ext4_io_submit *io,
submit_and_retry:
ext4_io_submit(io);
}
- if (io->io_bio == NULL) {
+ if (io->io_bio == NULL)
io_submit_init_bio(io, bh);
- io->io_bio->bi_write_hint = inode->i_write_hint;
- }
ret = bio_add_page(io->io_bio, page, bh->b_size, bh_offset(bh));
if (ret != bh->b_size)
goto submit_and_retry;
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index ee8f02f406cb..90a941d20dff 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -14,6 +14,7 @@
#include <linux/errno.h>
#include <linux/slab.h>
+#include <linux/jiffies.h>
#include "ext4_jbd2.h"
@@ -483,7 +484,7 @@ static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle,
}
ext4_debug("mark block bitmap %#04llx (+%llu/%u)\n",
first_cluster, first_cluster - start, count2);
- ext4_set_bits(bh->b_data, first_cluster - start, count2);
+ mb_set_bits(bh->b_data, first_cluster - start, count2);
err = ext4_handle_dirty_metadata(handle, NULL, bh);
brelse(bh);
@@ -632,7 +633,7 @@ handle_bb:
if (overhead != 0) {
ext4_debug("mark backup superblock %#04llx (+0)\n",
start);
- ext4_set_bits(bh->b_data, 0,
+ mb_set_bits(bh->b_data, 0,
EXT4_NUM_B2C(sbi, overhead));
}
ext4_mark_bitmap_end(EXT4_B2C(sbi, group_data[i].blocks_count),
@@ -2100,7 +2101,7 @@ retry:
*/
while (ext4_setup_next_flex_gd(sb, flex_gd, n_blocks_count,
flexbg_size)) {
- if (jiffies - last_update_time > HZ * 10) {
+ if (time_is_before_jiffies(last_update_time + HZ * 10)) {
if (last_update_time)
ext4_msg(sb, KERN_INFO,
"resized to %llu blocks",
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index eee0d9ebfa6c..81749eaddf4c 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1301,7 +1301,7 @@ static void ext4_put_super(struct super_block *sb)
kfree(sbi->s_blockgroup_lock);
fs_put_dax(sbi->s_daxdev);
fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy);
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
utf8_unload(sb->s_encoding);
#endif
kfree(sbi);
@@ -1316,7 +1316,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
{
struct ext4_inode_info *ei;
- ei = kmem_cache_alloc(ext4_inode_cachep, GFP_NOFS);
+ ei = alloc_inode_sb(sb, ext4_inode_cachep, GFP_NOFS);
if (!ei)
return NULL;
@@ -1961,7 +1961,7 @@ static const struct mount_opts {
{Opt_err, 0, 0}
};
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
static const struct ext4_sb_encodings {
__u16 magic;
char *name;
@@ -2021,12 +2021,12 @@ static int ext4_set_test_dummy_encryption(struct super_block *sb, char *arg)
#define EXT4_SPEC_s_commit_interval (1 << 16)
#define EXT4_SPEC_s_fc_debug_max_replay (1 << 17)
#define EXT4_SPEC_s_sb_block (1 << 18)
+#define EXT4_SPEC_mb_optimize_scan (1 << 19)
struct ext4_fs_context {
char *s_qf_names[EXT4_MAXQUOTAS];
char *test_dummy_enc_arg;
int s_jquota_fmt; /* Format of quota to use */
- int mb_optimize_scan;
#ifdef CONFIG_EXT4_DEBUG
int s_fc_debug_max_replay;
#endif
@@ -2045,8 +2045,8 @@ struct ext4_fs_context {
unsigned int mask_s_mount_opt;
unsigned int vals_s_mount_opt2;
unsigned int mask_s_mount_opt2;
- unsigned int vals_s_mount_flags;
- unsigned int mask_s_mount_flags;
+ unsigned long vals_s_mount_flags;
+ unsigned long mask_s_mount_flags;
unsigned int opt_flags; /* MOPT flags */
unsigned int spec;
u32 s_max_batch_time;
@@ -2149,23 +2149,36 @@ static inline void ctx_set_##name(struct ext4_fs_context *ctx, \
{ \
ctx->mask_s_##name |= flag; \
ctx->vals_s_##name |= flag; \
-} \
+}
+
+#define EXT4_CLEAR_CTX(name) \
static inline void ctx_clear_##name(struct ext4_fs_context *ctx, \
unsigned long flag) \
{ \
ctx->mask_s_##name |= flag; \
ctx->vals_s_##name &= ~flag; \
-} \
+}
+
+#define EXT4_TEST_CTX(name) \
static inline unsigned long \
ctx_test_##name(struct ext4_fs_context *ctx, unsigned long flag) \
{ \
return (ctx->vals_s_##name & flag); \
-} \
+}
-EXT4_SET_CTX(flags);
+EXT4_SET_CTX(flags); /* set only */
EXT4_SET_CTX(mount_opt);
+EXT4_CLEAR_CTX(mount_opt);
+EXT4_TEST_CTX(mount_opt);
EXT4_SET_CTX(mount_opt2);
-EXT4_SET_CTX(mount_flags);
+EXT4_CLEAR_CTX(mount_opt2);
+EXT4_TEST_CTX(mount_opt2);
+
+static inline void ctx_set_mount_flag(struct ext4_fs_context *ctx, int bit)
+{
+ set_bit(bit, &ctx->mask_s_mount_flags);
+ set_bit(bit, &ctx->vals_s_mount_flags);
+}
static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
@@ -2235,7 +2248,7 @@ static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param)
param->key);
return 0;
case Opt_abort:
- ctx_set_mount_flags(ctx, EXT4_MF_FS_ABORTED);
+ ctx_set_mount_flag(ctx, EXT4_MF_FS_ABORTED);
return 0;
case Opt_i_version:
ext4_msg(NULL, KERN_WARNING, deprecated_msg, param->key, "5.20");
@@ -2451,12 +2464,17 @@ static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param)
ctx_clear_mount_opt(ctx, m->mount_opt);
return 0;
case Opt_mb_optimize_scan:
- if (result.int_32 != 0 && result.int_32 != 1) {
+ if (result.int_32 == 1) {
+ ctx_set_mount_opt2(ctx, EXT4_MOUNT2_MB_OPTIMIZE_SCAN);
+ ctx->spec |= EXT4_SPEC_mb_optimize_scan;
+ } else if (result.int_32 == 0) {
+ ctx_clear_mount_opt2(ctx, EXT4_MOUNT2_MB_OPTIMIZE_SCAN);
+ ctx->spec |= EXT4_SPEC_mb_optimize_scan;
+ } else {
ext4_msg(NULL, KERN_WARNING,
"mb_optimize_scan should be set to 0 or 1.");
return -EINVAL;
}
- ctx->mb_optimize_scan = result.int_32;
return 0;
}
@@ -3468,8 +3486,9 @@ static loff_t ext4_max_size(int blkbits, int has_huge_files)
*/
static loff_t ext4_max_bitmap_size(int bits, int has_huge_files)
{
- unsigned long long upper_limit, res = EXT4_NDIR_BLOCKS;
+ loff_t upper_limit, res = EXT4_NDIR_BLOCKS;
int meta_blocks;
+ unsigned int ppb = 1 << (bits - 2);
/*
* This is calculated to be the largest file size for a dense, block
@@ -3501,27 +3520,42 @@ static loff_t ext4_max_bitmap_size(int bits, int has_huge_files)
}
+ /* Compute how many blocks we can address by block tree */
+ res += ppb;
+ res += ppb * ppb;
+ res += ((loff_t)ppb) * ppb * ppb;
+ /* Compute how many metadata blocks are needed */
+ meta_blocks = 1;
+ meta_blocks += 1 + ppb;
+ meta_blocks += 1 + ppb + ppb * ppb;
+ /* Does block tree limit file size? */
+ if (res + meta_blocks <= upper_limit)
+ goto check_lfs;
+
+ res = upper_limit;
+ /* How many metadata blocks are needed for addressing upper_limit? */
+ upper_limit -= EXT4_NDIR_BLOCKS;
/* indirect blocks */
meta_blocks = 1;
+ upper_limit -= ppb;
/* double indirect blocks */
- meta_blocks += 1 + (1LL << (bits-2));
- /* tripple indirect blocks */
- meta_blocks += 1 + (1LL << (bits-2)) + (1LL << (2*(bits-2)));
-
- upper_limit -= meta_blocks;
- upper_limit <<= bits;
-
- res += 1LL << (bits-2);
- res += 1LL << (2*(bits-2));
- res += 1LL << (3*(bits-2));
+ if (upper_limit < ppb * ppb) {
+ meta_blocks += 1 + DIV_ROUND_UP_ULL(upper_limit, ppb);
+ res -= meta_blocks;
+ goto check_lfs;
+ }
+ meta_blocks += 1 + ppb;
+ upper_limit -= ppb * ppb;
+ /* tripple indirect blocks for the rest */
+ meta_blocks += 1 + DIV_ROUND_UP_ULL(upper_limit, ppb) +
+ DIV_ROUND_UP_ULL(upper_limit, ppb*ppb);
+ res -= meta_blocks;
+check_lfs:
res <<= bits;
- if (res > upper_limit)
- res = upper_limit;
-
if (res > MAX_LFS_FILESIZE)
res = MAX_LFS_FILESIZE;
- return (loff_t)res;
+ return res;
}
static ext4_fsblk_t descriptor_loc(struct super_block *sb,
@@ -3606,7 +3640,7 @@ int ext4_feature_set_ok(struct super_block *sb, int readonly)
return 0;
}
-#ifndef CONFIG_UNICODE
+#if !IS_ENABLED(CONFIG_UNICODE)
if (ext4_has_feature_casefold(sb)) {
ext4_msg(sb, KERN_ERR,
"Filesystem with casefold feature cannot be "
@@ -4369,7 +4403,6 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
/* Set defaults for the variables that will be set during parsing */
ctx->journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
- ctx->mb_optimize_scan = DEFAULT_MB_OPTIMIZE_SCAN;
sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
sbi->s_sectors_written_start =
@@ -4610,7 +4643,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
if (err < 0)
goto failed_mount;
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
if (ext4_has_feature_casefold(sb) && !sb->s_encoding) {
const struct ext4_sb_encodings *encoding_info;
struct unicode_map *encoding;
@@ -5082,7 +5115,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_STAGING]);
sbi->s_fc_bytes = 0;
ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
- ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING);
+ sbi->s_fc_ineligible_tid = 0;
spin_lock_init(&sbi->s_fc_lock);
memset(&sbi->s_fc_stats, 0, sizeof(sbi->s_fc_stats));
sbi->s_fc_replay_state.fc_regions = NULL;
@@ -5320,12 +5353,12 @@ no_journal:
* turned off by passing "mb_optimize_scan=0". This can also be
* turned on forcefully by passing "mb_optimize_scan=1".
*/
- if (ctx->mb_optimize_scan == 1)
- set_opt2(sb, MB_OPTIMIZE_SCAN);
- else if (ctx->mb_optimize_scan == 0)
- clear_opt2(sb, MB_OPTIMIZE_SCAN);
- else if (sbi->s_groups_count >= MB_DEFAULT_LINEAR_SCAN_THRESHOLD)
- set_opt2(sb, MB_OPTIMIZE_SCAN);
+ if (!(ctx->spec & EXT4_SPEC_mb_optimize_scan)) {
+ if (sbi->s_groups_count >= MB_DEFAULT_LINEAR_SCAN_THRESHOLD)
+ set_opt2(sb, MB_OPTIMIZE_SCAN);
+ else
+ clear_opt2(sb, MB_OPTIMIZE_SCAN);
+ }
err = ext4_mb_init(sb);
if (err) {
@@ -5514,7 +5547,7 @@ failed_mount:
if (sbi->s_chksum_driver)
crypto_free_shash(sbi->s_chksum_driver);
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
utf8_unload(sb->s_encoding);
#endif
@@ -5540,7 +5573,7 @@ static int ext4_fill_super(struct super_block *sb, struct fs_context *fc)
sbi = ext4_alloc_sbi(sb);
if (!sbi)
- ret = -ENOMEM;
+ return -ENOMEM;
fc->s_fs_info = sbi;
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index f61e65ae27d8..d233c24ea342 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -309,7 +309,7 @@ EXT4_ATTR_FEATURE(meta_bg_resize);
EXT4_ATTR_FEATURE(encryption);
EXT4_ATTR_FEATURE(test_dummy_encryption_v2);
#endif
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
EXT4_ATTR_FEATURE(casefold);
#endif
#ifdef CONFIG_FS_VERITY
@@ -317,7 +317,7 @@ EXT4_ATTR_FEATURE(verity);
#endif
EXT4_ATTR_FEATURE(metadata_csum_seed);
EXT4_ATTR_FEATURE(fast_commit);
-#if defined(CONFIG_UNICODE) && defined(CONFIG_FS_ENCRYPTION)
+#if IS_ENABLED(CONFIG_UNICODE) && defined(CONFIG_FS_ENCRYPTION)
EXT4_ATTR_FEATURE(encrypted_casefold);
#endif
@@ -329,7 +329,7 @@ static struct attribute *ext4_feat_attrs[] = {
ATTR_LIST(encryption),
ATTR_LIST(test_dummy_encryption_v2),
#endif
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
ATTR_LIST(casefold),
#endif
#ifdef CONFIG_FS_VERITY
@@ -337,7 +337,7 @@ static struct attribute *ext4_feat_attrs[] = {
#endif
ATTR_LIST(metadata_csum_seed),
ATTR_LIST(fast_commit),
-#if defined(CONFIG_UNICODE) && defined(CONFIG_FS_ENCRYPTION)
+#if IS_ENABLED(CONFIG_UNICODE) && defined(CONFIG_FS_ENCRYPTION)
ATTR_LIST(encrypted_casefold),
#endif
NULL,
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 1e0fc1ed845b..042325349098 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -2408,7 +2408,7 @@ retry_inode:
if (IS_SYNC(inode))
ext4_handle_sync(handle);
}
- ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR);
+ ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR, handle);
cleanup:
brelse(is.iloc.bh);
@@ -2486,7 +2486,7 @@ retry:
if (error == 0)
error = error2;
}
- ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR);
+ ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR, NULL);
return error;
}
@@ -2920,7 +2920,7 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
error);
goto cleanup;
}
- ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR);
+ ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR, handle);
}
error = 0;
cleanup:
diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig
index f46a7339d6cf..03ef087537c7 100644
--- a/fs/f2fs/Kconfig
+++ b/fs/f2fs/Kconfig
@@ -143,3 +143,10 @@ config F2FS_IOSTAT
Support getting IO statistics through sysfs and printing out periodic
IO statistics tracepoint events. You have to turn on "iostat_enable"
sysfs node to enable this feature.
+
+config F2FS_UNFAIR_RWSEM
+ bool "F2FS unfair rw_semaphore"
+ depends on F2FS_FS && BLK_CGROUP
+ help
+ Use unfair rw_semaphore, if system configured IO priority by block
+ cgroup.
diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c
index 16e826e01f09..eaa240b21f07 100644
--- a/fs/f2fs/acl.c
+++ b/fs/f2fs/acl.c
@@ -204,8 +204,9 @@ struct posix_acl *f2fs_get_acl(struct inode *inode, int type, bool rcu)
return __f2fs_get_acl(inode, type, NULL);
}
-static int f2fs_acl_update_mode(struct inode *inode, umode_t *mode_p,
- struct posix_acl **acl)
+static int f2fs_acl_update_mode(struct user_namespace *mnt_userns,
+ struct inode *inode, umode_t *mode_p,
+ struct posix_acl **acl)
{
umode_t mode = inode->i_mode;
int error;
@@ -218,14 +219,15 @@ static int f2fs_acl_update_mode(struct inode *inode, umode_t *mode_p,
return error;
if (error == 0)
*acl = NULL;
- if (!in_group_p(i_gid_into_mnt(&init_user_ns, inode)) &&
- !capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_FSETID))
+ if (!in_group_p(i_gid_into_mnt(mnt_userns, inode)) &&
+ !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID))
mode &= ~S_ISGID;
*mode_p = mode;
return 0;
}
-static int __f2fs_set_acl(struct inode *inode, int type,
+static int __f2fs_set_acl(struct user_namespace *mnt_userns,
+ struct inode *inode, int type,
struct posix_acl *acl, struct page *ipage)
{
int name_index;
@@ -238,7 +240,8 @@ static int __f2fs_set_acl(struct inode *inode, int type,
case ACL_TYPE_ACCESS:
name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS;
if (acl && !ipage) {
- error = f2fs_acl_update_mode(inode, &mode, &acl);
+ error = f2fs_acl_update_mode(mnt_userns, inode,
+ &mode, &acl);
if (error)
return error;
set_acl_inode(inode, mode);
@@ -279,7 +282,7 @@ int f2fs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
if (unlikely(f2fs_cp_error(F2FS_I_SB(inode))))
return -EIO;
- return __f2fs_set_acl(inode, type, acl, NULL);
+ return __f2fs_set_acl(mnt_userns, inode, type, acl, NULL);
}
/*
@@ -419,7 +422,7 @@ int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage,
f2fs_mark_inode_dirty_sync(inode, true);
if (default_acl) {
- error = __f2fs_set_acl(inode, ACL_TYPE_DEFAULT, default_acl,
+ error = __f2fs_set_acl(NULL, inode, ACL_TYPE_DEFAULT, default_acl,
ipage);
posix_acl_release(default_acl);
} else {
@@ -427,7 +430,7 @@ int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage,
}
if (acl) {
if (!error)
- error = __f2fs_set_acl(inode, ACL_TYPE_ACCESS, acl,
+ error = __f2fs_set_acl(NULL, inode, ACL_TYPE_ACCESS, acl,
ipage);
posix_acl_release(acl);
} else {
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 982f0170639f..a8fc4fa511a8 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -98,6 +98,13 @@ repeat:
}
if (unlikely(!PageUptodate(page))) {
+ if (page->index == sbi->metapage_eio_ofs &&
+ sbi->metapage_eio_cnt++ == MAX_RETRY_META_PAGE_EIO) {
+ set_ckpt_flags(sbi, CP_ERROR_FLAG);
+ } else {
+ sbi->metapage_eio_ofs = page->index;
+ sbi->metapage_eio_cnt = 0;
+ }
f2fs_put_page(page, 1);
return ERR_PTR(-EIO);
}
@@ -282,18 +289,22 @@ out:
return blkno - start;
}
-void f2fs_ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index)
+void f2fs_ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index,
+ unsigned int ra_blocks)
{
struct page *page;
bool readahead = false;
+ if (ra_blocks == RECOVERY_MIN_RA_BLOCKS)
+ return;
+
page = find_get_page(META_MAPPING(sbi), index);
if (!page || !PageUptodate(page))
readahead = true;
f2fs_put_page(page, 0);
if (readahead)
- f2fs_ra_meta_pages(sbi, index, BIO_MAX_VECS, META_POR, true);
+ f2fs_ra_meta_pages(sbi, index, ra_blocks, META_POR, true);
}
static int __f2fs_write_meta_page(struct page *page,
@@ -351,13 +362,13 @@ static int f2fs_write_meta_pages(struct address_space *mapping,
goto skip_write;
/* if locked failed, cp will flush dirty pages instead */
- if (!down_write_trylock(&sbi->cp_global_sem))
+ if (!f2fs_down_write_trylock(&sbi->cp_global_sem))
goto skip_write;
trace_f2fs_writepages(mapping->host, wbc, META);
diff = nr_pages_to_write(sbi, META, wbc);
written = f2fs_sync_meta_pages(sbi, META, wbc->nr_to_write, FS_META_IO);
- up_write(&sbi->cp_global_sem);
+ f2fs_up_write(&sbi->cp_global_sem);
wbc->nr_to_write = max((long)0, wbc->nr_to_write - written - diff);
return 0;
@@ -436,26 +447,27 @@ stop:
return nwritten;
}
-static int f2fs_set_meta_page_dirty(struct page *page)
+static bool f2fs_dirty_meta_folio(struct address_space *mapping,
+ struct folio *folio)
{
- trace_f2fs_set_page_dirty(page, META);
-
- if (!PageUptodate(page))
- SetPageUptodate(page);
- if (!PageDirty(page)) {
- __set_page_dirty_nobuffers(page);
- inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_META);
- set_page_private_reference(page);
- return 1;
+ trace_f2fs_set_page_dirty(&folio->page, META);
+
+ if (!folio_test_uptodate(folio))
+ folio_mark_uptodate(folio);
+ if (!folio_test_dirty(folio)) {
+ filemap_dirty_folio(mapping, folio);
+ inc_page_count(F2FS_P_SB(&folio->page), F2FS_DIRTY_META);
+ set_page_private_reference(&folio->page);
+ return true;
}
- return 0;
+ return false;
}
const struct address_space_operations f2fs_meta_aops = {
.writepage = f2fs_write_meta_page,
.writepages = f2fs_write_meta_pages,
- .set_page_dirty = f2fs_set_meta_page_dirty,
- .invalidatepage = f2fs_invalidate_page,
+ .dirty_folio = f2fs_dirty_meta_folio,
+ .invalidate_folio = f2fs_invalidate_folio,
.releasepage = f2fs_release_page,
#ifdef CONFIG_MIGRATION
.migratepage = f2fs_migrate_page,
@@ -864,6 +876,7 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
struct page *cp_page_1 = NULL, *cp_page_2 = NULL;
struct f2fs_checkpoint *cp_block = NULL;
unsigned long long cur_version = 0, pre_version = 0;
+ unsigned int cp_blocks;
int err;
err = get_checkpoint_version(sbi, cp_addr, &cp_block,
@@ -871,15 +884,16 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
if (err)
return NULL;
- if (le32_to_cpu(cp_block->cp_pack_total_block_count) >
- sbi->blocks_per_seg) {
+ cp_blocks = le32_to_cpu(cp_block->cp_pack_total_block_count);
+
+ if (cp_blocks > sbi->blocks_per_seg || cp_blocks <= F2FS_CP_PACKS) {
f2fs_warn(sbi, "invalid cp_pack_total_block_count:%u",
le32_to_cpu(cp_block->cp_pack_total_block_count));
goto invalid_cp;
}
pre_version = *version;
- cp_addr += le32_to_cpu(cp_block->cp_pack_total_block_count) - 1;
+ cp_addr += cp_blocks - 1;
err = get_checkpoint_version(sbi, cp_addr, &cp_block,
&cp_page_2, version);
if (err)
@@ -1014,7 +1028,7 @@ static void __remove_dirty_inode(struct inode *inode, enum inode_type type)
stat_dec_dirty_inode(F2FS_I_SB(inode), type);
}
-void f2fs_update_dirty_page(struct inode *inode, struct page *page)
+void f2fs_update_dirty_folio(struct inode *inode, struct folio *folio)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
enum inode_type type = S_ISDIR(inode->i_mode) ? DIR_INODE : FILE_INODE;
@@ -1029,7 +1043,7 @@ void f2fs_update_dirty_page(struct inode *inode, struct page *page)
inode_inc_dirty_pages(inode);
spin_unlock(&sbi->inode_lock[type]);
- set_page_private_reference(page);
+ set_page_private_reference(&folio->page);
}
void f2fs_remove_dirty_inode(struct inode *inode)
@@ -1159,7 +1173,7 @@ static bool __need_flush_quota(struct f2fs_sb_info *sbi)
if (!is_journalled_quota(sbi))
return false;
- if (!down_write_trylock(&sbi->quota_sem))
+ if (!f2fs_down_write_trylock(&sbi->quota_sem))
return true;
if (is_sbi_flag_set(sbi, SBI_QUOTA_SKIP_FLUSH)) {
ret = false;
@@ -1171,7 +1185,7 @@ static bool __need_flush_quota(struct f2fs_sb_info *sbi)
} else if (get_pages(sbi, F2FS_DIRTY_QDATA)) {
ret = true;
}
- up_write(&sbi->quota_sem);
+ f2fs_up_write(&sbi->quota_sem);
return ret;
}
@@ -1228,10 +1242,10 @@ retry_flush_dents:
* POR: we should ensure that there are no dirty node pages
* until finishing nat/sit flush. inode->i_blocks can be updated.
*/
- down_write(&sbi->node_change);
+ f2fs_down_write(&sbi->node_change);
if (get_pages(sbi, F2FS_DIRTY_IMETA)) {
- up_write(&sbi->node_change);
+ f2fs_up_write(&sbi->node_change);
f2fs_unlock_all(sbi);
err = f2fs_sync_inode_meta(sbi);
if (err)
@@ -1241,15 +1255,15 @@ retry_flush_dents:
}
retry_flush_nodes:
- down_write(&sbi->node_write);
+ f2fs_down_write(&sbi->node_write);
if (get_pages(sbi, F2FS_DIRTY_NODES)) {
- up_write(&sbi->node_write);
+ f2fs_up_write(&sbi->node_write);
atomic_inc(&sbi->wb_sync_req[NODE]);
err = f2fs_sync_node_pages(sbi, &wbc, false, FS_CP_NODE_IO);
atomic_dec(&sbi->wb_sync_req[NODE]);
if (err) {
- up_write(&sbi->node_change);
+ f2fs_up_write(&sbi->node_change);
f2fs_unlock_all(sbi);
return err;
}
@@ -1262,13 +1276,13 @@ retry_flush_nodes:
* dirty node blocks and some checkpoint values by block allocation.
*/
__prepare_cp_block(sbi);
- up_write(&sbi->node_change);
+ f2fs_up_write(&sbi->node_change);
return err;
}
static void unblock_operations(struct f2fs_sb_info *sbi)
{
- up_write(&sbi->node_write);
+ f2fs_up_write(&sbi->node_write);
f2fs_unlock_all(sbi);
}
@@ -1543,6 +1557,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
/* update user_block_counts */
sbi->last_valid_block_count = sbi->total_valid_block_count;
percpu_counter_set(&sbi->alloc_valid_block_count, 0);
+ percpu_counter_set(&sbi->rf_node_block_count, 0);
/* Here, we have one bio having CP pack except cp pack 2 page */
f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO);
@@ -1612,7 +1627,7 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
f2fs_warn(sbi, "Start checkpoint disabled!");
}
if (cpc->reason != CP_RESIZE)
- down_write(&sbi->cp_global_sem);
+ f2fs_down_write(&sbi->cp_global_sem);
if (!is_sbi_flag_set(sbi, SBI_IS_DIRTY) &&
((cpc->reason & CP_FASTBOOT) || (cpc->reason & CP_SYNC) ||
@@ -1693,7 +1708,7 @@ stop:
trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish checkpoint");
out:
if (cpc->reason != CP_RESIZE)
- up_write(&sbi->cp_global_sem);
+ f2fs_up_write(&sbi->cp_global_sem);
return err;
}
@@ -1741,9 +1756,9 @@ static int __write_checkpoint_sync(struct f2fs_sb_info *sbi)
struct cp_control cpc = { .reason = CP_SYNC, };
int err;
- down_write(&sbi->gc_lock);
+ f2fs_down_write(&sbi->gc_lock);
err = f2fs_write_checkpoint(sbi, &cpc);
- up_write(&sbi->gc_lock);
+ f2fs_up_write(&sbi->gc_lock);
return err;
}
@@ -1831,9 +1846,9 @@ int f2fs_issue_checkpoint(struct f2fs_sb_info *sbi)
if (!test_opt(sbi, MERGE_CHECKPOINT) || cpc.reason != CP_SYNC) {
int ret;
- down_write(&sbi->gc_lock);
+ f2fs_down_write(&sbi->gc_lock);
ret = f2fs_write_checkpoint(sbi, &cpc);
- up_write(&sbi->gc_lock);
+ f2fs_up_write(&sbi->gc_lock);
return ret;
}
diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c
index d0c3aeba5945..12a56f9e1572 100644
--- a/fs/f2fs/compress.c
+++ b/fs/f2fs/compress.c
@@ -314,10 +314,9 @@ static int lz4_decompress_pages(struct decompress_io_ctx *dic)
}
if (ret != PAGE_SIZE << dic->log_cluster_size) {
- printk_ratelimited("%sF2FS-fs (%s): lz4 invalid rlen:%zu, "
+ printk_ratelimited("%sF2FS-fs (%s): lz4 invalid ret:%d, "
"expected:%lu\n", KERN_ERR,
- F2FS_I_SB(dic->inode)->sb->s_id,
- dic->rlen,
+ F2FS_I_SB(dic->inode)->sb->s_id, ret,
PAGE_SIZE << dic->log_cluster_size);
return -EIO;
}
@@ -1267,7 +1266,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc,
* checkpoint. This can only happen to quota writes which can cause
* the below discard race condition.
*/
- down_read(&sbi->node_write);
+ f2fs_down_read(&sbi->node_write);
} else if (!f2fs_trylock_op(sbi)) {
goto out_free;
}
@@ -1384,7 +1383,7 @@ unlock_continue:
f2fs_put_dnode(&dn);
if (IS_NOQUOTA(inode))
- up_read(&sbi->node_write);
+ f2fs_up_read(&sbi->node_write);
else
f2fs_unlock_op(sbi);
@@ -1410,7 +1409,7 @@ out_put_dnode:
f2fs_put_dnode(&dn);
out_unlock_op:
if (IS_NOQUOTA(inode))
- up_read(&sbi->node_write);
+ f2fs_up_read(&sbi->node_write);
else
f2fs_unlock_op(sbi);
out_free:
@@ -1505,9 +1504,7 @@ continue_unlock:
if (IS_NOQUOTA(cc->inode))
return 0;
ret = 0;
- cond_resched();
- congestion_wait(BLK_RW_ASYNC,
- DEFAULT_IO_TIMEOUT);
+ f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT);
goto retry_write;
}
return ret;
@@ -1750,7 +1747,7 @@ unsigned int f2fs_cluster_blocks_are_contiguous(struct dnode_of_data *dn)
const struct address_space_operations f2fs_compress_aops = {
.releasepage = f2fs_release_page,
- .invalidatepage = f2fs_invalidate_page,
+ .invalidate_folio = f2fs_invalidate_folio,
};
struct address_space *COMPRESS_MAPPING(struct f2fs_sb_info *sbi)
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index fc077bce679d..f8fcbe91059b 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -428,8 +428,6 @@ static struct bio *__bio_alloc(struct f2fs_io_info *fio, int npages)
} else {
bio->bi_end_io = f2fs_write_end_io;
bio->bi_private = sbi;
- bio->bi_write_hint = f2fs_io_type_to_rw_hint(sbi,
- fio->type, fio->temp);
}
iostat_alloc_and_bind_ctx(sbi, bio, NULL);
@@ -584,7 +582,7 @@ static void __f2fs_submit_merged_write(struct f2fs_sb_info *sbi,
enum page_type btype = PAGE_TYPE_OF_BIO(type);
struct f2fs_bio_info *io = sbi->write_io[btype] + temp;
- down_write(&io->io_rwsem);
+ f2fs_down_write(&io->io_rwsem);
/* change META to META_FLUSH in the checkpoint procedure */
if (type >= META_FLUSH) {
@@ -594,7 +592,7 @@ static void __f2fs_submit_merged_write(struct f2fs_sb_info *sbi,
io->bio->bi_opf |= REQ_PREFLUSH | REQ_FUA;
}
__submit_merged_bio(io);
- up_write(&io->io_rwsem);
+ f2fs_up_write(&io->io_rwsem);
}
static void __submit_merged_write_cond(struct f2fs_sb_info *sbi,
@@ -609,9 +607,9 @@ static void __submit_merged_write_cond(struct f2fs_sb_info *sbi,
enum page_type btype = PAGE_TYPE_OF_BIO(type);
struct f2fs_bio_info *io = sbi->write_io[btype] + temp;
- down_read(&io->io_rwsem);
+ f2fs_down_read(&io->io_rwsem);
ret = __has_merged_page(io->bio, inode, page, ino);
- up_read(&io->io_rwsem);
+ f2fs_up_read(&io->io_rwsem);
}
if (ret)
__f2fs_submit_merged_write(sbi, type, temp);
@@ -732,9 +730,9 @@ static void add_bio_entry(struct f2fs_sb_info *sbi, struct bio *bio,
if (bio_add_page(bio, page, PAGE_SIZE, 0) != PAGE_SIZE)
f2fs_bug_on(sbi, 1);
- down_write(&io->bio_list_lock);
+ f2fs_down_write(&io->bio_list_lock);
list_add_tail(&be->list, &io->bio_list);
- up_write(&io->bio_list_lock);
+ f2fs_up_write(&io->bio_list_lock);
}
static void del_bio_entry(struct bio_entry *be)
@@ -756,7 +754,7 @@ static int add_ipu_page(struct f2fs_io_info *fio, struct bio **bio,
struct list_head *head = &io->bio_list;
struct bio_entry *be;
- down_write(&io->bio_list_lock);
+ f2fs_down_write(&io->bio_list_lock);
list_for_each_entry(be, head, list) {
if (be->bio != *bio)
continue;
@@ -780,7 +778,7 @@ static int add_ipu_page(struct f2fs_io_info *fio, struct bio **bio,
__submit_bio(sbi, *bio, DATA);
break;
}
- up_write(&io->bio_list_lock);
+ f2fs_up_write(&io->bio_list_lock);
}
if (ret) {
@@ -806,7 +804,7 @@ void f2fs_submit_merged_ipu_write(struct f2fs_sb_info *sbi,
if (list_empty(head))
continue;
- down_read(&io->bio_list_lock);
+ f2fs_down_read(&io->bio_list_lock);
list_for_each_entry(be, head, list) {
if (target)
found = (target == be->bio);
@@ -816,14 +814,14 @@ void f2fs_submit_merged_ipu_write(struct f2fs_sb_info *sbi,
if (found)
break;
}
- up_read(&io->bio_list_lock);
+ f2fs_up_read(&io->bio_list_lock);
if (!found)
continue;
found = false;
- down_write(&io->bio_list_lock);
+ f2fs_down_write(&io->bio_list_lock);
list_for_each_entry(be, head, list) {
if (target)
found = (target == be->bio);
@@ -836,7 +834,7 @@ void f2fs_submit_merged_ipu_write(struct f2fs_sb_info *sbi,
break;
}
}
- up_write(&io->bio_list_lock);
+ f2fs_up_write(&io->bio_list_lock);
}
if (found)
@@ -894,7 +892,7 @@ void f2fs_submit_page_write(struct f2fs_io_info *fio)
f2fs_bug_on(sbi, is_read_io(fio->op));
- down_write(&io->io_rwsem);
+ f2fs_down_write(&io->io_rwsem);
next:
if (fio->in_list) {
spin_lock(&io->io_lock);
@@ -961,7 +959,7 @@ out:
if (is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN) ||
!f2fs_is_checkpoint_ready(sbi))
__submit_merged_bio(io);
- up_write(&io->io_rwsem);
+ f2fs_up_write(&io->io_rwsem);
}
static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr,
@@ -1371,9 +1369,9 @@ void f2fs_do_map_lock(struct f2fs_sb_info *sbi, int flag, bool lock)
{
if (flag == F2FS_GET_BLOCK_PRE_AIO) {
if (lock)
- down_read(&sbi->node_change);
+ f2fs_down_read(&sbi->node_change);
else
- up_read(&sbi->node_change);
+ f2fs_up_read(&sbi->node_change);
} else {
if (lock)
f2fs_lock_op(sbi);
@@ -2448,6 +2446,9 @@ static inline bool check_inplace_update_policy(struct inode *inode,
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
unsigned int policy = SM_I(sbi)->ipu_policy;
+ if (policy & (0x1 << F2FS_IPU_HONOR_OPU_WRITE) &&
+ is_inode_flag_set(inode, FI_OPU_WRITE))
+ return false;
if (policy & (0x1 << F2FS_IPU_FORCE))
return true;
if (policy & (0x1 << F2FS_IPU_SSR) && f2fs_need_SSR(sbi))
@@ -2518,6 +2519,9 @@ bool f2fs_should_update_outplace(struct inode *inode, struct f2fs_io_info *fio)
if (is_inode_flag_set(inode, FI_ALIGNED_WRITE))
return true;
+ if (is_inode_flag_set(inode, FI_OPU_WRITE))
+ return true;
+
if (fio) {
if (page_private_gcing(fio->page))
return true;
@@ -2737,13 +2741,13 @@ write:
* the below discard race condition.
*/
if (IS_NOQUOTA(inode))
- down_read(&sbi->node_write);
+ f2fs_down_read(&sbi->node_write);
fio.need_lock = LOCK_DONE;
err = f2fs_do_write_data_page(&fio);
if (IS_NOQUOTA(inode))
- up_read(&sbi->node_write);
+ f2fs_up_read(&sbi->node_write);
goto done;
}
@@ -3035,8 +3039,7 @@ result:
} else if (ret == -EAGAIN) {
ret = 0;
if (wbc->sync_mode == WB_SYNC_ALL) {
- cond_resched();
- congestion_wait(BLK_RW_ASYNC,
+ f2fs_io_schedule_timeout(
DEFAULT_IO_TIMEOUT);
goto retry_write;
}
@@ -3142,8 +3145,8 @@ static int __f2fs_write_data_pages(struct address_space *mapping,
f2fs_available_free_memory(sbi, DIRTY_DENTS))
goto skip_write;
- /* skip writing during file defragment */
- if (is_inode_flag_set(inode, FI_DO_DEFRAG))
+ /* skip writing in file defragment preparing stage */
+ if (is_inode_flag_set(inode, FI_SKIP_WRITES))
goto skip_write;
trace_f2fs_writepages(mapping->host, wbc, DATA);
@@ -3151,8 +3154,12 @@ static int __f2fs_write_data_pages(struct address_space *mapping,
/* to avoid spliting IOs due to mixed WB_SYNC_ALL and WB_SYNC_NONE */
if (wbc->sync_mode == WB_SYNC_ALL)
atomic_inc(&sbi->wb_sync_req[DATA]);
- else if (atomic_read(&sbi->wb_sync_req[DATA]))
+ else if (atomic_read(&sbi->wb_sync_req[DATA])) {
+ /* to avoid potential deadlock */
+ if (current->plug)
+ blk_finish_plug(current->plug);
goto skip_write;
+ }
if (__should_serialize_io(inode, wbc)) {
mutex_lock(&sbi->writepages);
@@ -3201,14 +3208,14 @@ void f2fs_write_failed(struct inode *inode, loff_t to)
/* In the fs-verity case, f2fs_end_enable_verity() does the truncate */
if (to > i_size && !f2fs_verity_in_progress(inode)) {
- down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
filemap_invalidate_lock(inode->i_mapping);
truncate_pagecache(inode, i_size);
f2fs_truncate_blocks(inode, i_size, true);
filemap_invalidate_unlock(inode->i_mapping);
- up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
}
}
@@ -3341,7 +3348,7 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping,
*fsdata = NULL;
- if (len == PAGE_SIZE)
+ if (len == PAGE_SIZE && !(f2fs_is_atomic_file(inode)))
goto repeat;
ret = f2fs_prepare_compress_overwrite(inode, pagep,
@@ -3480,17 +3487,16 @@ unlock_out:
return copied;
}
-void f2fs_invalidate_page(struct page *page, unsigned int offset,
- unsigned int length)
+void f2fs_invalidate_folio(struct folio *folio, size_t offset, size_t length)
{
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
if (inode->i_ino >= F2FS_ROOT_INO(sbi) &&
- (offset % PAGE_SIZE || length != PAGE_SIZE))
+ (offset || length != folio_size(folio)))
return;
- if (PageDirty(page)) {
+ if (folio_test_dirty(folio)) {
if (inode->i_ino == F2FS_META_INO(sbi)) {
dec_page_count(sbi, F2FS_DIRTY_META);
} else if (inode->i_ino == F2FS_NODE_INO(sbi)) {
@@ -3501,17 +3507,16 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset,
}
}
- clear_page_private_gcing(page);
+ clear_page_private_gcing(&folio->page);
if (test_opt(sbi, COMPRESS_CACHE) &&
inode->i_ino == F2FS_COMPRESS_INO(sbi))
- clear_page_private_data(page);
+ clear_page_private_data(&folio->page);
- if (page_private_atomic(page))
- return f2fs_drop_inmem_page(inode, page);
+ if (page_private_atomic(&folio->page))
+ return f2fs_drop_inmem_page(inode, &folio->page);
- detach_page_private(page);
- set_page_private(page, 0);
+ folio_detach_private(folio);
}
int f2fs_release_page(struct page *page, gfp_t wait)
@@ -3538,35 +3543,35 @@ int f2fs_release_page(struct page *page, gfp_t wait)
return 1;
}
-static int f2fs_set_data_page_dirty(struct page *page)
+static bool f2fs_dirty_data_folio(struct address_space *mapping,
+ struct folio *folio)
{
- struct inode *inode = page_file_mapping(page)->host;
+ struct inode *inode = mapping->host;
- trace_f2fs_set_page_dirty(page, DATA);
+ trace_f2fs_set_page_dirty(&folio->page, DATA);
- if (!PageUptodate(page))
- SetPageUptodate(page);
- if (PageSwapCache(page))
- return __set_page_dirty_nobuffers(page);
+ if (!folio_test_uptodate(folio))
+ folio_mark_uptodate(folio);
+ BUG_ON(folio_test_swapcache(folio));
if (f2fs_is_atomic_file(inode) && !f2fs_is_commit_atomic_write(inode)) {
- if (!page_private_atomic(page)) {
- f2fs_register_inmem_page(inode, page);
- return 1;
+ if (!page_private_atomic(&folio->page)) {
+ f2fs_register_inmem_page(inode, &folio->page);
+ return true;
}
/*
* Previously, this page has been registered, we just
* return here.
*/
- return 0;
+ return false;
}
- if (!PageDirty(page)) {
- __set_page_dirty_nobuffers(page);
- f2fs_update_dirty_page(inode, page);
- return 1;
+ if (!folio_test_dirty(folio)) {
+ filemap_dirty_folio(mapping, folio);
+ f2fs_update_dirty_folio(inode, folio);
+ return true;
}
- return 0;
+ return true;
}
@@ -3709,19 +3714,20 @@ static int f2fs_migrate_blocks(struct inode *inode, block_t start_blk,
unsigned int end_sec = secidx + blkcnt / blk_per_sec;
int ret = 0;
- down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
filemap_invalidate_lock(inode->i_mapping);
set_inode_flag(inode, FI_ALIGNED_WRITE);
+ set_inode_flag(inode, FI_OPU_WRITE);
for (; secidx < end_sec; secidx++) {
- down_write(&sbi->pin_sem);
+ f2fs_down_write(&sbi->pin_sem);
f2fs_lock_op(sbi);
f2fs_allocate_new_section(sbi, CURSEG_COLD_DATA_PINNED, false);
f2fs_unlock_op(sbi);
- set_inode_flag(inode, FI_DO_DEFRAG);
+ set_inode_flag(inode, FI_SKIP_WRITES);
for (blkofs = 0; blkofs < blk_per_sec; blkofs++) {
struct page *page;
@@ -3729,7 +3735,7 @@ static int f2fs_migrate_blocks(struct inode *inode, block_t start_blk,
page = f2fs_get_lock_data_page(inode, blkidx, true);
if (IS_ERR(page)) {
- up_write(&sbi->pin_sem);
+ f2fs_up_write(&sbi->pin_sem);
ret = PTR_ERR(page);
goto done;
}
@@ -3738,22 +3744,23 @@ static int f2fs_migrate_blocks(struct inode *inode, block_t start_blk,
f2fs_put_page(page, 1);
}
- clear_inode_flag(inode, FI_DO_DEFRAG);
+ clear_inode_flag(inode, FI_SKIP_WRITES);
ret = filemap_fdatawrite(inode->i_mapping);
- up_write(&sbi->pin_sem);
+ f2fs_up_write(&sbi->pin_sem);
if (ret)
break;
}
done:
- clear_inode_flag(inode, FI_DO_DEFRAG);
+ clear_inode_flag(inode, FI_SKIP_WRITES);
+ clear_inode_flag(inode, FI_OPU_WRITE);
clear_inode_flag(inode, FI_ALIGNED_WRITE);
filemap_invalidate_unlock(inode->i_mapping);
- up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
return ret;
}
@@ -3926,8 +3933,8 @@ const struct address_space_operations f2fs_dblock_aops = {
.writepages = f2fs_write_data_pages,
.write_begin = f2fs_write_begin,
.write_end = f2fs_write_end,
- .set_page_dirty = f2fs_set_data_page_dirty,
- .invalidatepage = f2fs_invalidate_page,
+ .dirty_folio = f2fs_dirty_data_folio,
+ .invalidate_folio = f2fs_invalidate_folio,
.releasepage = f2fs_release_page,
.direct_IO = noop_direct_IO,
.bmap = f2fs_bmap,
@@ -4032,6 +4039,13 @@ static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
iomap->offset = blks_to_bytes(inode, map.m_lblk);
+ /*
+ * When inline encryption is enabled, sometimes I/O to an encrypted file
+ * has to be broken up to guarantee DUN contiguity. Handle this by
+ * limiting the length of the mapping returned.
+ */
+ map.m_len = fscrypt_limit_io_blocks(inode, map.m_lblk, map.m_len);
+
if (map.m_flags & (F2FS_MAP_MAPPED | F2FS_MAP_UNWRITTEN)) {
iomap->length = blks_to_bytes(inode, map.m_len);
if (map.m_flags & F2FS_MAP_MAPPED) {
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index 8c50518475a9..fcdf253cd211 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -21,7 +21,7 @@
#include "gc.h"
static LIST_HEAD(f2fs_stat_list);
-static DEFINE_MUTEX(f2fs_stat_mutex);
+static DEFINE_RAW_SPINLOCK(f2fs_stat_lock);
#ifdef CONFIG_DEBUG_FS
static struct dentry *f2fs_debugfs_root;
#endif
@@ -338,14 +338,16 @@ static char *s_flag[] = {
[SBI_QUOTA_SKIP_FLUSH] = " quota_skip_flush",
[SBI_QUOTA_NEED_REPAIR] = " quota_need_repair",
[SBI_IS_RESIZEFS] = " resizefs",
+ [SBI_IS_FREEZING] = " freezefs",
};
static int stat_show(struct seq_file *s, void *v)
{
struct f2fs_stat_info *si;
int i = 0, j = 0;
+ unsigned long flags;
- mutex_lock(&f2fs_stat_mutex);
+ raw_spin_lock_irqsave(&f2fs_stat_lock, flags);
list_for_each_entry(si, &f2fs_stat_list, stat_list) {
update_general_status(si->sbi);
@@ -474,12 +476,14 @@ static int stat_show(struct seq_file *s, void *v)
si->node_segs, si->bg_node_segs);
seq_printf(s, " - Reclaimed segs : Normal (%d), Idle CB (%d), "
"Idle Greedy (%d), Idle AT (%d), "
- "Urgent High (%d), Urgent Low (%d)\n",
+ "Urgent High (%d), Urgent Mid (%d), "
+ "Urgent Low (%d)\n",
si->sbi->gc_reclaimed_segs[GC_NORMAL],
si->sbi->gc_reclaimed_segs[GC_IDLE_CB],
si->sbi->gc_reclaimed_segs[GC_IDLE_GREEDY],
si->sbi->gc_reclaimed_segs[GC_IDLE_AT],
si->sbi->gc_reclaimed_segs[GC_URGENT_HIGH],
+ si->sbi->gc_reclaimed_segs[GC_URGENT_MID],
si->sbi->gc_reclaimed_segs[GC_URGENT_LOW]);
seq_printf(s, "Try to move %d blocks (BG: %d)\n", si->tot_blks,
si->bg_data_blks + si->bg_node_blks);
@@ -532,6 +536,9 @@ static int stat_show(struct seq_file *s, void *v)
si->ndirty_meta, si->meta_pages);
seq_printf(s, " - imeta: %4d\n",
si->ndirty_imeta);
+ seq_printf(s, " - fsync mark: %4lld\n",
+ percpu_counter_sum_positive(
+ &si->sbi->rf_node_block_count));
seq_printf(s, " - NATs: %9d/%9d\n - SITs: %9d/%9d\n",
si->dirty_nats, si->nats, si->dirty_sits, si->sits);
seq_printf(s, " - free_nids: %9d/%9d\n - alloc_nids: %9d\n",
@@ -573,7 +580,7 @@ static int stat_show(struct seq_file *s, void *v)
seq_printf(s, " - paged : %llu KB\n",
si->page_mem >> 10);
}
- mutex_unlock(&f2fs_stat_mutex);
+ raw_spin_unlock_irqrestore(&f2fs_stat_lock, flags);
return 0;
}
@@ -584,6 +591,7 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi)
{
struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi);
struct f2fs_stat_info *si;
+ unsigned long flags;
int i;
si = f2fs_kzalloc(sbi, sizeof(struct f2fs_stat_info), GFP_KERNEL);
@@ -619,9 +627,9 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi)
atomic_set(&sbi->max_aw_cnt, 0);
atomic_set(&sbi->max_vw_cnt, 0);
- mutex_lock(&f2fs_stat_mutex);
+ raw_spin_lock_irqsave(&f2fs_stat_lock, flags);
list_add_tail(&si->stat_list, &f2fs_stat_list);
- mutex_unlock(&f2fs_stat_mutex);
+ raw_spin_unlock_irqrestore(&f2fs_stat_lock, flags);
return 0;
}
@@ -629,10 +637,11 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi)
void f2fs_destroy_stats(struct f2fs_sb_info *sbi)
{
struct f2fs_stat_info *si = F2FS_STAT(sbi);
+ unsigned long flags;
- mutex_lock(&f2fs_stat_mutex);
+ raw_spin_lock_irqsave(&f2fs_stat_lock, flags);
list_del(&si->stat_list);
- mutex_unlock(&f2fs_stat_mutex);
+ raw_spin_unlock_irqrestore(&f2fs_stat_lock, flags);
kfree(si);
}
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index 1820e9c106f7..a0e51937d92e 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -16,7 +16,7 @@
#include "xattr.h"
#include <trace/events/f2fs.h>
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
extern struct kmem_cache *f2fs_cf_name_slab;
#endif
@@ -79,7 +79,7 @@ unsigned char f2fs_get_de_type(struct f2fs_dir_entry *de)
int f2fs_init_casefolded_name(const struct inode *dir,
struct f2fs_filename *fname)
{
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
struct super_block *sb = dir->i_sb;
if (IS_CASEFOLDED(dir)) {
@@ -174,7 +174,7 @@ void f2fs_free_filename(struct f2fs_filename *fname)
kfree(fname->crypto_buf.name);
fname->crypto_buf.name = NULL;
#endif
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
if (fname->cf_name.name) {
kmem_cache_free(f2fs_cf_name_slab, fname->cf_name.name);
fname->cf_name.name = NULL;
@@ -208,7 +208,7 @@ static struct f2fs_dir_entry *find_in_block(struct inode *dir,
return f2fs_find_target_dentry(&d, fname, max_slots);
}
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
/*
* Test whether a case-insensitive directory entry matches the filename
* being searched for.
@@ -266,7 +266,7 @@ static inline int f2fs_match_name(const struct inode *dir,
{
struct fscrypt_name f;
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
if (fname->cf_name.name) {
struct qstr cf = FSTR_TO_QSTR(&fname->cf_name);
@@ -766,7 +766,7 @@ add_dentry:
f2fs_wait_on_page_writeback(dentry_page, DATA, true, true);
if (inode) {
- down_write(&F2FS_I(inode)->i_sem);
+ f2fs_down_write(&F2FS_I(inode)->i_sem);
page = f2fs_init_inode_metadata(inode, dir, fname, NULL);
if (IS_ERR(page)) {
err = PTR_ERR(page);
@@ -793,7 +793,7 @@ add_dentry:
f2fs_update_parent_metadata(dir, inode, current_depth);
fail:
if (inode)
- up_write(&F2FS_I(inode)->i_sem);
+ f2fs_up_write(&F2FS_I(inode)->i_sem);
f2fs_put_page(dentry_page, 1);
@@ -858,7 +858,7 @@ int f2fs_do_tmpfile(struct inode *inode, struct inode *dir)
struct page *page;
int err = 0;
- down_write(&F2FS_I(inode)->i_sem);
+ f2fs_down_write(&F2FS_I(inode)->i_sem);
page = f2fs_init_inode_metadata(inode, dir, NULL, NULL);
if (IS_ERR(page)) {
err = PTR_ERR(page);
@@ -869,7 +869,7 @@ int f2fs_do_tmpfile(struct inode *inode, struct inode *dir)
clear_inode_flag(inode, FI_NEW_INODE);
f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
fail:
- up_write(&F2FS_I(inode)->i_sem);
+ f2fs_up_write(&F2FS_I(inode)->i_sem);
return err;
}
@@ -877,7 +877,7 @@ void f2fs_drop_nlink(struct inode *dir, struct inode *inode)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
- down_write(&F2FS_I(inode)->i_sem);
+ f2fs_down_write(&F2FS_I(inode)->i_sem);
if (S_ISDIR(inode->i_mode))
f2fs_i_links_write(dir, false);
@@ -888,7 +888,7 @@ void f2fs_drop_nlink(struct inode *dir, struct inode *inode)
f2fs_i_links_write(inode, false);
f2fs_i_size_write(inode, 0);
}
- up_write(&F2FS_I(inode)->i_sem);
+ f2fs_up_write(&F2FS_I(inode)->i_sem);
if (inode->i_nlink == 0)
f2fs_add_orphan_inode(inode);
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 37faae388fe0..cd1e65bcf0b0 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -123,6 +123,20 @@ typedef u32 nid_t;
#define COMPRESS_EXT_NUM 16
+/*
+ * An implementation of an rwsem that is explicitly unfair to readers. This
+ * prevents priority inversion when a low-priority reader acquires the read lock
+ * while sleeping on the write lock but the write lock is needed by
+ * higher-priority clients.
+ */
+
+struct f2fs_rwsem {
+ struct rw_semaphore internal_rwsem;
+#ifdef CONFIG_F2FS_UNFAIR_RWSEM
+ wait_queue_head_t read_waiters;
+#endif
+};
+
struct f2fs_mount_info {
unsigned int opt;
int write_io_size_bits; /* Write IO size bits */
@@ -386,6 +400,10 @@ struct discard_cmd_control {
struct mutex cmd_lock;
unsigned int nr_discards; /* # of discards in the list */
unsigned int max_discards; /* max. discards to be issued */
+ unsigned int max_discard_request; /* max. discard request per round */
+ unsigned int min_discard_issue_time; /* min. interval between discard issue */
+ unsigned int mid_discard_issue_time; /* mid. interval between discard issue */
+ unsigned int max_discard_issue_time; /* max. interval between discard issue */
unsigned int discard_granularity; /* discard granularity */
unsigned int undiscard_blks; /* # of undiscard blocks */
unsigned int next_pos; /* next discard position */
@@ -488,7 +506,7 @@ struct f2fs_filename {
*/
struct fscrypt_str crypto_buf;
#endif
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
/*
* For casefolded directories: the casefolded name, but it's left NULL
* if the original name is not valid Unicode, if the directory is both
@@ -561,6 +579,9 @@ enum {
/* maximum retry quota flush count */
#define DEFAULT_RETRY_QUOTA_FLUSH_COUNT 8
+/* maximum retry of EIO'ed meta page */
+#define MAX_RETRY_META_PAGE_EIO 100
+
#define F2FS_LINK_MAX 0xffffffff /* maximum link count per file */
#define MAX_DIR_RA_PAGES 4 /* maximum ra pages of dir */
@@ -574,6 +595,9 @@ enum {
/* number of extent info in extent cache we try to shrink */
#define EXTENT_CACHE_SHRINK_NUMBER 128
+#define RECOVERY_MAX_RA_BLOCKS BIO_MAX_VECS
+#define RECOVERY_MIN_RA_BLOCKS 1
+
struct rb_entry {
struct rb_node rb_node; /* rb node located in rb-tree */
union {
@@ -721,7 +745,8 @@ enum {
FI_DROP_CACHE, /* drop dirty page cache */
FI_DATA_EXIST, /* indicate data exists */
FI_INLINE_DOTS, /* indicate inline dot dentries */
- FI_DO_DEFRAG, /* indicate defragment is running */
+ FI_SKIP_WRITES, /* should skip data page writeback */
+ FI_OPU_WRITE, /* used for opu per file */
FI_DIRTY_FILE, /* indicate regular/symlink has dirty pages */
FI_PREALLOCATED_ALL, /* all blocks for write were preallocated */
FI_HOT_DATA, /* indicate file is hot */
@@ -752,7 +777,7 @@ struct f2fs_inode_info {
/* Use below internally in f2fs*/
unsigned long flags[BITS_TO_LONGS(FI_MAX)]; /* use to pass per-file flags */
- struct rw_semaphore i_sem; /* protect fi info */
+ struct f2fs_rwsem i_sem; /* protect fi info */
atomic_t dirty_pages; /* # of dirty pages */
f2fs_hash_t chash; /* hash value of given file name */
unsigned int clevel; /* maximum level of given file name */
@@ -777,8 +802,8 @@ struct f2fs_inode_info {
struct extent_tree *extent_tree; /* cached extent_tree entry */
/* avoid racing between foreground op and gc */
- struct rw_semaphore i_gc_rwsem[2];
- struct rw_semaphore i_xattr_sem; /* avoid racing between reading and changing EAs */
+ struct f2fs_rwsem i_gc_rwsem[2];
+ struct f2fs_rwsem i_xattr_sem; /* avoid racing between reading and changing EAs */
int i_extra_isize; /* size of extra space located in i_addr */
kprojid_t i_projid; /* id for project quota */
@@ -897,6 +922,7 @@ struct f2fs_nm_info {
nid_t max_nid; /* maximum possible node ids */
nid_t available_nids; /* # of available node ids */
nid_t next_scan_nid; /* the next nid to be scanned */
+ nid_t max_rf_node_blocks; /* max # of nodes for recovery */
unsigned int ram_thresh; /* control the memory footprint */
unsigned int ra_nid_pages; /* # of nid pages to be readaheaded */
unsigned int dirty_nats_ratio; /* control dirty nats ratio threshold */
@@ -904,7 +930,7 @@ struct f2fs_nm_info {
/* NAT cache management */
struct radix_tree_root nat_root;/* root of the nat entry cache */
struct radix_tree_root nat_set_root;/* root of the nat set cache */
- struct rw_semaphore nat_tree_lock; /* protect nat entry tree */
+ struct f2fs_rwsem nat_tree_lock; /* protect nat entry tree */
struct list_head nat_entries; /* cached nat entry list (clean) */
spinlock_t nat_list_lock; /* protect clean nat entry list */
unsigned int nat_cnt[MAX_NAT_STATE]; /* the # of cached nat entries */
@@ -1017,7 +1043,7 @@ struct f2fs_sm_info {
struct dirty_seglist_info *dirty_info; /* dirty segment information */
struct curseg_info *curseg_array; /* active segment information */
- struct rw_semaphore curseg_lock; /* for preventing curseg change */
+ struct f2fs_rwsem curseg_lock; /* for preventing curseg change */
block_t seg0_blkaddr; /* block address of 0'th segment */
block_t main_blkaddr; /* start block address of main area */
@@ -1201,11 +1227,11 @@ struct f2fs_bio_info {
struct bio *bio; /* bios to merge */
sector_t last_block_in_bio; /* last block number */
struct f2fs_io_info fio; /* store buffered io info. */
- struct rw_semaphore io_rwsem; /* blocking op for bio */
+ struct f2fs_rwsem io_rwsem; /* blocking op for bio */
spinlock_t io_lock; /* serialize DATA/NODE IOs */
struct list_head io_list; /* track fios */
struct list_head bio_list; /* bio entry list head */
- struct rw_semaphore bio_list_lock; /* lock to protect bio entry list */
+ struct f2fs_rwsem bio_list_lock; /* lock to protect bio entry list */
};
#define FDEV(i) (sbi->devs[i])
@@ -1267,6 +1293,7 @@ enum {
SBI_QUOTA_SKIP_FLUSH, /* skip flushing quota in current CP */
SBI_QUOTA_NEED_REPAIR, /* quota file may be corrupted */
SBI_IS_RESIZEFS, /* resizefs is in process */
+ SBI_IS_FREEZING, /* freezefs is in process */
};
enum {
@@ -1286,6 +1313,7 @@ enum {
GC_IDLE_AT,
GC_URGENT_HIGH,
GC_URGENT_LOW,
+ GC_URGENT_MID,
MAX_GC_MODE,
};
@@ -1571,7 +1599,7 @@ struct f2fs_sb_info {
struct super_block *sb; /* pointer to VFS super block */
struct proc_dir_entry *s_proc; /* proc entry */
struct f2fs_super_block *raw_super; /* raw super block pointer */
- struct rw_semaphore sb_lock; /* lock for raw super block */
+ struct f2fs_rwsem sb_lock; /* lock for raw super block */
int valid_super_block; /* valid super block no */
unsigned long s_flag; /* flags for sbi */
struct mutex writepages; /* mutex for writepages() */
@@ -1591,18 +1619,20 @@ struct f2fs_sb_info {
/* for bio operations */
struct f2fs_bio_info *write_io[NR_PAGE_TYPE]; /* for write bios */
/* keep migration IO order for LFS mode */
- struct rw_semaphore io_order_lock;
+ struct f2fs_rwsem io_order_lock;
mempool_t *write_io_dummy; /* Dummy pages */
+ pgoff_t metapage_eio_ofs; /* EIO page offset */
+ int metapage_eio_cnt; /* EIO count */
/* for checkpoint */
struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */
int cur_cp_pack; /* remain current cp pack */
spinlock_t cp_lock; /* for flag in ckpt */
struct inode *meta_inode; /* cache meta blocks */
- struct rw_semaphore cp_global_sem; /* checkpoint procedure lock */
- struct rw_semaphore cp_rwsem; /* blocking FS operations */
- struct rw_semaphore node_write; /* locking node writes */
- struct rw_semaphore node_change; /* locking node change */
+ struct f2fs_rwsem cp_global_sem; /* checkpoint procedure lock */
+ struct f2fs_rwsem cp_rwsem; /* blocking FS operations */
+ struct f2fs_rwsem node_write; /* locking node writes */
+ struct f2fs_rwsem node_change; /* locking node change */
wait_queue_head_t cp_wait;
unsigned long last_time[MAX_TIME]; /* to store time in jiffies */
long interval_time[MAX_TIME]; /* to store thresholds */
@@ -1662,12 +1692,14 @@ struct f2fs_sb_info {
block_t unusable_block_count; /* # of blocks saved by last cp */
unsigned int nquota_files; /* # of quota sysfile */
- struct rw_semaphore quota_sem; /* blocking cp for flags */
+ struct f2fs_rwsem quota_sem; /* blocking cp for flags */
/* # of pages, see count_type */
atomic_t nr_pages[NR_COUNT_TYPE];
/* # of allocated blocks */
struct percpu_counter alloc_valid_block_count;
+ /* # of node block writes as roll forward recovery */
+ struct percpu_counter rf_node_block_count;
/* writeback control */
atomic_t wb_sync_req[META]; /* count # of WB_SYNC threads */
@@ -1678,7 +1710,7 @@ struct f2fs_sb_info {
struct f2fs_mount_info mount_opt; /* mount options */
/* for cleaning operations */
- struct rw_semaphore gc_lock; /*
+ struct f2fs_rwsem gc_lock; /*
* semaphore for GC, avoid
* race between GC and GC or CP
*/
@@ -1698,7 +1730,7 @@ struct f2fs_sb_info {
/* threshold for gc trials on pinned files */
u64 gc_pin_file_threshold;
- struct rw_semaphore pin_sem;
+ struct f2fs_rwsem pin_sem;
/* maximum # of trials to find a victim segment for SSR and GC */
unsigned int max_victim_search;
@@ -2092,9 +2124,81 @@ static inline void clear_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f)
spin_unlock_irqrestore(&sbi->cp_lock, flags);
}
+#define init_f2fs_rwsem(sem) \
+do { \
+ static struct lock_class_key __key; \
+ \
+ __init_f2fs_rwsem((sem), #sem, &__key); \
+} while (0)
+
+static inline void __init_f2fs_rwsem(struct f2fs_rwsem *sem,
+ const char *sem_name, struct lock_class_key *key)
+{
+ __init_rwsem(&sem->internal_rwsem, sem_name, key);
+#ifdef CONFIG_F2FS_UNFAIR_RWSEM
+ init_waitqueue_head(&sem->read_waiters);
+#endif
+}
+
+static inline int f2fs_rwsem_is_locked(struct f2fs_rwsem *sem)
+{
+ return rwsem_is_locked(&sem->internal_rwsem);
+}
+
+static inline int f2fs_rwsem_is_contended(struct f2fs_rwsem *sem)
+{
+ return rwsem_is_contended(&sem->internal_rwsem);
+}
+
+static inline void f2fs_down_read(struct f2fs_rwsem *sem)
+{
+#ifdef CONFIG_F2FS_UNFAIR_RWSEM
+ wait_event(sem->read_waiters, down_read_trylock(&sem->internal_rwsem));
+#else
+ down_read(&sem->internal_rwsem);
+#endif
+}
+
+static inline int f2fs_down_read_trylock(struct f2fs_rwsem *sem)
+{
+ return down_read_trylock(&sem->internal_rwsem);
+}
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+static inline void f2fs_down_read_nested(struct f2fs_rwsem *sem, int subclass)
+{
+ down_read_nested(&sem->internal_rwsem, subclass);
+}
+#else
+#define f2fs_down_read_nested(sem, subclass) f2fs_down_read(sem)
+#endif
+
+static inline void f2fs_up_read(struct f2fs_rwsem *sem)
+{
+ up_read(&sem->internal_rwsem);
+}
+
+static inline void f2fs_down_write(struct f2fs_rwsem *sem)
+{
+ down_write(&sem->internal_rwsem);
+}
+
+static inline int f2fs_down_write_trylock(struct f2fs_rwsem *sem)
+{
+ return down_write_trylock(&sem->internal_rwsem);
+}
+
+static inline void f2fs_up_write(struct f2fs_rwsem *sem)
+{
+ up_write(&sem->internal_rwsem);
+#ifdef CONFIG_F2FS_UNFAIR_RWSEM
+ wake_up_all(&sem->read_waiters);
+#endif
+}
+
static inline void f2fs_lock_op(struct f2fs_sb_info *sbi)
{
- down_read(&sbi->cp_rwsem);
+ f2fs_down_read(&sbi->cp_rwsem);
}
static inline int f2fs_trylock_op(struct f2fs_sb_info *sbi)
@@ -2103,22 +2207,22 @@ static inline int f2fs_trylock_op(struct f2fs_sb_info *sbi)
f2fs_show_injection_info(sbi, FAULT_LOCK_OP);
return 0;
}
- return down_read_trylock(&sbi->cp_rwsem);
+ return f2fs_down_read_trylock(&sbi->cp_rwsem);
}
static inline void f2fs_unlock_op(struct f2fs_sb_info *sbi)
{
- up_read(&sbi->cp_rwsem);
+ f2fs_up_read(&sbi->cp_rwsem);
}
static inline void f2fs_lock_all(struct f2fs_sb_info *sbi)
{
- down_write(&sbi->cp_rwsem);
+ f2fs_down_write(&sbi->cp_rwsem);
}
static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi)
{
- up_write(&sbi->cp_rwsem);
+ f2fs_up_write(&sbi->cp_rwsem);
}
static inline int __get_cp_reason(struct f2fs_sb_info *sbi)
@@ -2681,6 +2785,9 @@ static inline bool is_idle(struct f2fs_sb_info *sbi, int type)
if (is_inflight_io(sbi, type))
return false;
+ if (sbi->gc_mode == GC_URGENT_MID)
+ return true;
+
if (sbi->gc_mode == GC_URGENT_LOW &&
(type == DISCARD_TIME || type == GC_TIME))
return true;
@@ -3579,7 +3686,8 @@ bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi,
block_t blkaddr, int type);
int f2fs_ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages,
int type, bool sync);
-void f2fs_ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index);
+void f2fs_ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index,
+ unsigned int ra_blocks);
long f2fs_sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type,
long nr_to_write, enum iostat_type io_type);
void f2fs_add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type);
@@ -3597,7 +3705,7 @@ void f2fs_add_orphan_inode(struct inode *inode);
void f2fs_remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino);
int f2fs_recover_orphan_inodes(struct f2fs_sb_info *sbi);
int f2fs_get_valid_checkpoint(struct f2fs_sb_info *sbi);
-void f2fs_update_dirty_page(struct inode *inode, struct page *page);
+void f2fs_update_dirty_folio(struct inode *inode, struct folio *folio);
void f2fs_remove_dirty_inode(struct inode *inode);
int f2fs_sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type);
void f2fs_wait_on_all_pages(struct f2fs_sb_info *sbi, int type);
@@ -3661,8 +3769,7 @@ int f2fs_write_single_data_page(struct page *page, int *submitted,
enum iostat_type io_type,
int compr_blocks, bool allow_balance);
void f2fs_write_failed(struct inode *inode, loff_t to);
-void f2fs_invalidate_page(struct page *page, unsigned int offset,
- unsigned int length);
+void f2fs_invalidate_folio(struct folio *folio, size_t offset, size_t length);
int f2fs_release_page(struct page *page, gfp_t wait);
#ifdef CONFIG_MIGRATION
int f2fs_migrate_page(struct address_space *mapping, struct page *newpage,
@@ -4371,7 +4478,11 @@ static inline bool f2fs_force_buffered_io(struct inode *inode,
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
int rw = iov_iter_rw(iter);
- if (f2fs_post_read_required(inode))
+ if (!fscrypt_dio_supported(iocb, iter))
+ return true;
+ if (fsverity_active(inode))
+ return true;
+ if (f2fs_compressed_file(inode))
return true;
/* disallow direct IO if any of devices has unaligned blksize */
@@ -4426,6 +4537,12 @@ static inline bool f2fs_block_unit_discard(struct f2fs_sb_info *sbi)
return F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_BLOCK;
}
+static inline void f2fs_io_schedule_timeout(long timeout)
+{
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ io_schedule_timeout(timeout);
+}
+
#define EFSBADCRC EBADMSG /* Bad CRC detected */
#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 3c98ef6af97d..d3f39a704b8b 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -237,13 +237,13 @@ static void try_to_fix_pino(struct inode *inode)
struct f2fs_inode_info *fi = F2FS_I(inode);
nid_t pino;
- down_write(&fi->i_sem);
+ f2fs_down_write(&fi->i_sem);
if (file_wrong_pino(inode) && inode->i_nlink == 1 &&
get_parent_ino(inode, &pino)) {
f2fs_i_pino_write(inode, pino);
file_got_pino(inode);
}
- up_write(&fi->i_sem);
+ f2fs_up_write(&fi->i_sem);
}
static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end,
@@ -318,9 +318,9 @@ go_write:
* Both of fdatasync() and fsync() are able to be recovered from
* sudden-power-off.
*/
- down_read(&F2FS_I(inode)->i_sem);
+ f2fs_down_read(&F2FS_I(inode)->i_sem);
cp_reason = need_do_checkpoint(inode);
- up_read(&F2FS_I(inode)->i_sem);
+ f2fs_up_read(&F2FS_I(inode)->i_sem);
if (cp_reason) {
/* all the dirty node pages should be flushed for POR */
@@ -812,7 +812,7 @@ int f2fs_getattr(struct user_namespace *mnt_userns, const struct path *path,
{
struct inode *inode = d_inode(path->dentry);
struct f2fs_inode_info *fi = F2FS_I(inode);
- struct f2fs_inode *ri;
+ struct f2fs_inode *ri = NULL;
unsigned int flags;
if (f2fs_has_extra_attr(inode) &&
@@ -844,7 +844,7 @@ int f2fs_getattr(struct user_namespace *mnt_userns, const struct path *path,
STATX_ATTR_NODUMP |
STATX_ATTR_VERITY);
- generic_fillattr(&init_user_ns, inode, stat);
+ generic_fillattr(mnt_userns, inode, stat);
/* we need to show initial sectors used for inline_data/dentries */
if ((S_ISREG(inode->i_mode) && f2fs_has_inline_data(inode)) ||
@@ -904,7 +904,7 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
!f2fs_is_compress_backend_ready(inode))
return -EOPNOTSUPP;
- err = setattr_prepare(&init_user_ns, dentry, attr);
+ err = setattr_prepare(mnt_userns, dentry, attr);
if (err)
return err;
@@ -958,7 +958,7 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
return err;
}
- down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
filemap_invalidate_lock(inode->i_mapping);
truncate_setsize(inode, attr->ia_size);
@@ -970,7 +970,7 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
* larger than i_size.
*/
filemap_invalidate_unlock(inode->i_mapping);
- up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
if (err)
return err;
@@ -980,10 +980,10 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
spin_unlock(&F2FS_I(inode)->i_size_lock);
}
- __setattr_copy(&init_user_ns, inode, attr);
+ __setattr_copy(mnt_userns, inode, attr);
if (attr->ia_valid & ATTR_MODE) {
- err = posix_acl_chmod(&init_user_ns, inode, f2fs_get_inode_mode(inode));
+ err = posix_acl_chmod(mnt_userns, inode, f2fs_get_inode_mode(inode));
if (is_inode_flag_set(inode, FI_ACL_MODE)) {
if (!err)
@@ -1112,7 +1112,7 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len)
blk_start = (loff_t)pg_start << PAGE_SHIFT;
blk_end = (loff_t)pg_end << PAGE_SHIFT;
- down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
filemap_invalidate_lock(inode->i_mapping);
truncate_pagecache_range(inode, blk_start, blk_end - 1);
@@ -1122,7 +1122,7 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len)
f2fs_unlock_op(sbi);
filemap_invalidate_unlock(inode->i_mapping);
- up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
}
}
@@ -1355,7 +1355,7 @@ static int f2fs_do_collapse(struct inode *inode, loff_t offset, loff_t len)
f2fs_balance_fs(sbi, true);
/* avoid gc operation during block exchange */
- down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
filemap_invalidate_lock(inode->i_mapping);
f2fs_lock_op(sbi);
@@ -1365,7 +1365,7 @@ static int f2fs_do_collapse(struct inode *inode, loff_t offset, loff_t len)
f2fs_unlock_op(sbi);
filemap_invalidate_unlock(inode->i_mapping);
- up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
return ret;
}
@@ -1500,7 +1500,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
unsigned int end_offset;
pgoff_t end;
- down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
filemap_invalidate_lock(mapping);
truncate_pagecache_range(inode,
@@ -1514,7 +1514,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
if (ret) {
f2fs_unlock_op(sbi);
filemap_invalidate_unlock(mapping);
- up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
goto out;
}
@@ -1526,7 +1526,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
f2fs_unlock_op(sbi);
filemap_invalidate_unlock(mapping);
- up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
f2fs_balance_fs(sbi, dn.node_changed);
@@ -1600,7 +1600,7 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len)
idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
/* avoid gc operation during block exchange */
- down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
filemap_invalidate_lock(mapping);
truncate_pagecache(inode, offset);
@@ -1618,7 +1618,7 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len)
f2fs_unlock_op(sbi);
}
filemap_invalidate_unlock(mapping);
- up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
/* write out all moved pages, if possible */
filemap_invalidate_lock(mapping);
@@ -1674,13 +1674,13 @@ static int expand_inode_data(struct inode *inode, loff_t offset,
next_alloc:
if (has_not_enough_free_secs(sbi, 0,
GET_SEC_FROM_SEG(sbi, overprovision_segments(sbi)))) {
- down_write(&sbi->gc_lock);
+ f2fs_down_write(&sbi->gc_lock);
err = f2fs_gc(sbi, true, false, false, NULL_SEGNO);
if (err && err != -ENODATA && err != -EAGAIN)
goto out_err;
}
- down_write(&sbi->pin_sem);
+ f2fs_down_write(&sbi->pin_sem);
f2fs_lock_op(sbi);
f2fs_allocate_new_section(sbi, CURSEG_COLD_DATA_PINNED, false);
@@ -1690,7 +1690,7 @@ next_alloc:
err = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_DIO);
file_dont_truncate(inode);
- up_write(&sbi->pin_sem);
+ f2fs_up_write(&sbi->pin_sem);
expanded += map.m_len;
sec_len -= map.m_len;
@@ -1989,11 +1989,12 @@ static int f2fs_ioc_getversion(struct file *filp, unsigned long arg)
static int f2fs_ioc_start_atomic_write(struct file *filp)
{
struct inode *inode = file_inode(filp);
+ struct user_namespace *mnt_userns = file_mnt_user_ns(filp);
struct f2fs_inode_info *fi = F2FS_I(inode);
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
int ret;
- if (!inode_owner_or_capable(&init_user_ns, inode))
+ if (!inode_owner_or_capable(mnt_userns, inode))
return -EACCES;
if (!S_ISREG(inode->i_mode))
@@ -2008,7 +2009,10 @@ static int f2fs_ioc_start_atomic_write(struct file *filp)
inode_lock(inode);
- f2fs_disable_compressed_file(inode);
+ if (!f2fs_disable_compressed_file(inode)) {
+ ret = -EINVAL;
+ goto out;
+ }
if (f2fs_is_atomic_file(inode)) {
if (is_inode_flag_set(inode, FI_ATOMIC_REVOKE_REQUEST))
@@ -2020,7 +2024,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp)
if (ret)
goto out;
- down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
/*
* Should wait end_io to count F2FS_WB_CP_DATA correctly by
@@ -2031,7 +2035,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp)
inode->i_ino, get_dirty_pages(inode));
ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX);
if (ret) {
- up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
goto out;
}
@@ -2044,7 +2048,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp)
/* add inode in inmem_list first and set atomic_file */
set_inode_flag(inode, FI_ATOMIC_FILE);
clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST);
- up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
F2FS_I(inode)->inmem_task = current;
@@ -2058,9 +2062,10 @@ out:
static int f2fs_ioc_commit_atomic_write(struct file *filp)
{
struct inode *inode = file_inode(filp);
+ struct user_namespace *mnt_userns = file_mnt_user_ns(filp);
int ret;
- if (!inode_owner_or_capable(&init_user_ns, inode))
+ if (!inode_owner_or_capable(mnt_userns, inode))
return -EACCES;
ret = mnt_want_write_file(filp);
@@ -2100,9 +2105,10 @@ err_out:
static int f2fs_ioc_start_volatile_write(struct file *filp)
{
struct inode *inode = file_inode(filp);
+ struct user_namespace *mnt_userns = file_mnt_user_ns(filp);
int ret;
- if (!inode_owner_or_capable(&init_user_ns, inode))
+ if (!inode_owner_or_capable(mnt_userns, inode))
return -EACCES;
if (!S_ISREG(inode->i_mode))
@@ -2135,9 +2141,10 @@ out:
static int f2fs_ioc_release_volatile_write(struct file *filp)
{
struct inode *inode = file_inode(filp);
+ struct user_namespace *mnt_userns = file_mnt_user_ns(filp);
int ret;
- if (!inode_owner_or_capable(&init_user_ns, inode))
+ if (!inode_owner_or_capable(mnt_userns, inode))
return -EACCES;
ret = mnt_want_write_file(filp);
@@ -2164,9 +2171,10 @@ out:
static int f2fs_ioc_abort_volatile_write(struct file *filp)
{
struct inode *inode = file_inode(filp);
+ struct user_namespace *mnt_userns = file_mnt_user_ns(filp);
int ret;
- if (!inode_owner_or_capable(&init_user_ns, inode))
+ if (!inode_owner_or_capable(mnt_userns, inode))
return -EACCES;
ret = mnt_want_write_file(filp);
@@ -2351,7 +2359,7 @@ static int f2fs_ioc_get_encryption_pwsalt(struct file *filp, unsigned long arg)
if (err)
return err;
- down_write(&sbi->sb_lock);
+ f2fs_down_write(&sbi->sb_lock);
if (uuid_is_nonzero(sbi->raw_super->encrypt_pw_salt))
goto got_it;
@@ -2370,7 +2378,7 @@ got_it:
16))
err = -EFAULT;
out_err:
- up_write(&sbi->sb_lock);
+ f2fs_up_write(&sbi->sb_lock);
mnt_drop_write_file(filp);
return err;
}
@@ -2447,12 +2455,12 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg)
return ret;
if (!sync) {
- if (!down_write_trylock(&sbi->gc_lock)) {
+ if (!f2fs_down_write_trylock(&sbi->gc_lock)) {
ret = -EBUSY;
goto out;
}
} else {
- down_write(&sbi->gc_lock);
+ f2fs_down_write(&sbi->gc_lock);
}
ret = f2fs_gc(sbi, sync, true, false, NULL_SEGNO);
@@ -2483,12 +2491,12 @@ static int __f2fs_ioc_gc_range(struct file *filp, struct f2fs_gc_range *range)
do_more:
if (!range->sync) {
- if (!down_write_trylock(&sbi->gc_lock)) {
+ if (!f2fs_down_write_trylock(&sbi->gc_lock)) {
ret = -EBUSY;
goto out;
}
} else {
- down_write(&sbi->gc_lock);
+ f2fs_down_write(&sbi->gc_lock);
}
ret = f2fs_gc(sbi, range->sync, true, false,
@@ -2559,10 +2567,6 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi,
bool fragmented = false;
int err;
- /* if in-place-update policy is enabled, don't waste time here */
- if (f2fs_should_update_inplace(inode, NULL))
- return -EINVAL;
-
pg_start = range->start >> PAGE_SHIFT;
pg_end = (range->start + range->len) >> PAGE_SHIFT;
@@ -2570,6 +2574,13 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi,
inode_lock(inode);
+ /* if in-place-update policy is enabled, don't waste time here */
+ set_inode_flag(inode, FI_OPU_WRITE);
+ if (f2fs_should_update_inplace(inode, NULL)) {
+ err = -EINVAL;
+ goto out;
+ }
+
/* writeback all dirty pages in the range */
err = filemap_write_and_wait_range(inode->i_mapping, range->start,
range->start + range->len - 1);
@@ -2651,7 +2662,7 @@ do_map:
goto check;
}
- set_inode_flag(inode, FI_DO_DEFRAG);
+ set_inode_flag(inode, FI_SKIP_WRITES);
idx = map.m_lblk;
while (idx < map.m_lblk + map.m_len && cnt < blk_per_seg) {
@@ -2676,15 +2687,16 @@ check:
if (map.m_lblk < pg_end && cnt < blk_per_seg)
goto do_map;
- clear_inode_flag(inode, FI_DO_DEFRAG);
+ clear_inode_flag(inode, FI_SKIP_WRITES);
err = filemap_fdatawrite(inode->i_mapping);
if (err)
goto out;
}
clear_out:
- clear_inode_flag(inode, FI_DO_DEFRAG);
+ clear_inode_flag(inode, FI_SKIP_WRITES);
out:
+ clear_inode_flag(inode, FI_OPU_WRITE);
inode_unlock(inode);
if (!err)
range->len = (u64)total << PAGE_SHIFT;
@@ -2820,10 +2832,10 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in,
f2fs_balance_fs(sbi, true);
- down_write(&F2FS_I(src)->i_gc_rwsem[WRITE]);
+ f2fs_down_write(&F2FS_I(src)->i_gc_rwsem[WRITE]);
if (src != dst) {
ret = -EBUSY;
- if (!down_write_trylock(&F2FS_I(dst)->i_gc_rwsem[WRITE]))
+ if (!f2fs_down_write_trylock(&F2FS_I(dst)->i_gc_rwsem[WRITE]))
goto out_src;
}
@@ -2841,9 +2853,9 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in,
f2fs_unlock_op(sbi);
if (src != dst)
- up_write(&F2FS_I(dst)->i_gc_rwsem[WRITE]);
+ f2fs_up_write(&F2FS_I(dst)->i_gc_rwsem[WRITE]);
out_src:
- up_write(&F2FS_I(src)->i_gc_rwsem[WRITE]);
+ f2fs_up_write(&F2FS_I(src)->i_gc_rwsem[WRITE]);
out_unlock:
if (src != dst)
inode_unlock(dst);
@@ -2938,7 +2950,7 @@ static int f2fs_ioc_flush_device(struct file *filp, unsigned long arg)
end_segno = min(start_segno + range.segments, dev_end_segno);
while (start_segno < end_segno) {
- if (!down_write_trylock(&sbi->gc_lock)) {
+ if (!f2fs_down_write_trylock(&sbi->gc_lock)) {
ret = -EBUSY;
goto out;
}
@@ -2990,7 +3002,7 @@ static int f2fs_ioc_setproject(struct inode *inode, __u32 projid)
{
struct f2fs_inode_info *fi = F2FS_I(inode);
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- struct page *ipage;
+ struct f2fs_inode *ri = NULL;
kprojid_t kprojid;
int err;
@@ -3014,17 +3026,8 @@ static int f2fs_ioc_setproject(struct inode *inode, __u32 projid)
if (IS_NOQUOTA(inode))
return err;
- ipage = f2fs_get_node_page(sbi, inode->i_ino);
- if (IS_ERR(ipage))
- return PTR_ERR(ipage);
-
- if (!F2FS_FITS_IN_INODE(F2FS_INODE(ipage), fi->i_extra_isize,
- i_projid)) {
- err = -EOVERFLOW;
- f2fs_put_page(ipage, 1);
- return err;
- }
- f2fs_put_page(ipage, 1);
+ if (!F2FS_FITS_IN_INODE(ri, fi->i_extra_isize, i_projid))
+ return -EOVERFLOW;
err = f2fs_dquot_initialize(inode);
if (err)
@@ -3215,9 +3218,9 @@ int f2fs_precache_extents(struct inode *inode)
while (map.m_lblk < end) {
map.m_len = end - map.m_lblk;
- down_write(&fi->i_gc_rwsem[WRITE]);
+ f2fs_down_write(&fi->i_gc_rwsem[WRITE]);
err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_PRECACHE);
- up_write(&fi->i_gc_rwsem[WRITE]);
+ f2fs_up_write(&fi->i_gc_rwsem[WRITE]);
if (err)
return err;
@@ -3294,11 +3297,11 @@ static int f2fs_ioc_getfslabel(struct file *filp, unsigned long arg)
if (!vbuf)
return -ENOMEM;
- down_read(&sbi->sb_lock);
+ f2fs_down_read(&sbi->sb_lock);
count = utf16s_to_utf8s(sbi->raw_super->volume_name,
ARRAY_SIZE(sbi->raw_super->volume_name),
UTF16_LITTLE_ENDIAN, vbuf, MAX_VOLUME_NAME);
- up_read(&sbi->sb_lock);
+ f2fs_up_read(&sbi->sb_lock);
if (copy_to_user((char __user *)arg, vbuf,
min(FSLABEL_MAX, count)))
@@ -3326,7 +3329,7 @@ static int f2fs_ioc_setfslabel(struct file *filp, unsigned long arg)
if (err)
goto out;
- down_write(&sbi->sb_lock);
+ f2fs_down_write(&sbi->sb_lock);
memset(sbi->raw_super->volume_name, 0,
sizeof(sbi->raw_super->volume_name));
@@ -3336,7 +3339,7 @@ static int f2fs_ioc_setfslabel(struct file *filp, unsigned long arg)
err = f2fs_commit_super(sbi, false);
- up_write(&sbi->sb_lock);
+ f2fs_up_write(&sbi->sb_lock);
mnt_drop_write_file(filp);
out:
@@ -3462,7 +3465,7 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg)
if (!atomic_read(&F2FS_I(inode)->i_compr_blocks))
goto out;
- down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
filemap_invalidate_lock(inode->i_mapping);
last_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
@@ -3499,7 +3502,7 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg)
}
filemap_invalidate_unlock(inode->i_mapping);
- up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
out:
inode_unlock(inode);
@@ -3615,7 +3618,7 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg)
goto unlock_inode;
}
- down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
filemap_invalidate_lock(inode->i_mapping);
last_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
@@ -3652,7 +3655,7 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg)
}
filemap_invalidate_unlock(inode->i_mapping);
- up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
if (ret >= 0) {
clear_inode_flag(inode, FI_COMPRESS_RELEASED);
@@ -3770,7 +3773,7 @@ static int f2fs_sec_trim_file(struct file *filp, unsigned long arg)
if (ret)
goto err;
- down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
filemap_invalidate_lock(mapping);
ret = filemap_write_and_wait_range(mapping, range.start,
@@ -3859,7 +3862,7 @@ static int f2fs_sec_trim_file(struct file *filp, unsigned long arg)
prev_block, len, range.flags);
out:
filemap_invalidate_unlock(mapping);
- up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
err:
inode_unlock(inode);
file_end_write(filp);
@@ -4291,12 +4294,12 @@ static ssize_t f2fs_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
trace_f2fs_direct_IO_enter(inode, iocb, count, READ);
if (iocb->ki_flags & IOCB_NOWAIT) {
- if (!down_read_trylock(&fi->i_gc_rwsem[READ])) {
+ if (!f2fs_down_read_trylock(&fi->i_gc_rwsem[READ])) {
ret = -EAGAIN;
goto out;
}
} else {
- down_read(&fi->i_gc_rwsem[READ]);
+ f2fs_down_read(&fi->i_gc_rwsem[READ]);
}
/*
@@ -4315,7 +4318,7 @@ static ssize_t f2fs_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
ret = iomap_dio_complete(dio);
}
- up_read(&fi->i_gc_rwsem[READ]);
+ f2fs_up_read(&fi->i_gc_rwsem[READ]);
file_accessed(file);
out:
@@ -4479,10 +4482,8 @@ static ssize_t f2fs_dio_write_iter(struct kiocb *iocb, struct iov_iter *from,
struct f2fs_inode_info *fi = F2FS_I(inode);
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
const bool do_opu = f2fs_lfs_mode(sbi);
- const int whint_mode = F2FS_OPTION(sbi).whint_mode;
const loff_t pos = iocb->ki_pos;
const ssize_t count = iov_iter_count(from);
- const enum rw_hint hint = iocb->ki_hint;
unsigned int dio_flags;
struct iomap_dio *dio;
ssize_t ret;
@@ -4497,12 +4498,12 @@ static ssize_t f2fs_dio_write_iter(struct kiocb *iocb, struct iov_iter *from,
goto out;
}
- if (!down_read_trylock(&fi->i_gc_rwsem[WRITE])) {
+ if (!f2fs_down_read_trylock(&fi->i_gc_rwsem[WRITE])) {
ret = -EAGAIN;
goto out;
}
- if (do_opu && !down_read_trylock(&fi->i_gc_rwsem[READ])) {
- up_read(&fi->i_gc_rwsem[WRITE]);
+ if (do_opu && !f2fs_down_read_trylock(&fi->i_gc_rwsem[READ])) {
+ f2fs_up_read(&fi->i_gc_rwsem[WRITE]);
ret = -EAGAIN;
goto out;
}
@@ -4511,12 +4512,10 @@ static ssize_t f2fs_dio_write_iter(struct kiocb *iocb, struct iov_iter *from,
if (ret)
goto out;
- down_read(&fi->i_gc_rwsem[WRITE]);
+ f2fs_down_read(&fi->i_gc_rwsem[WRITE]);
if (do_opu)
- down_read(&fi->i_gc_rwsem[READ]);
+ f2fs_down_read(&fi->i_gc_rwsem[READ]);
}
- if (whint_mode == WHINT_MODE_OFF)
- iocb->ki_hint = WRITE_LIFE_NOT_SET;
/*
* We have to use __iomap_dio_rw() and iomap_dio_complete() instead of
@@ -4539,11 +4538,9 @@ static ssize_t f2fs_dio_write_iter(struct kiocb *iocb, struct iov_iter *from,
ret = iomap_dio_complete(dio);
}
- if (whint_mode == WHINT_MODE_OFF)
- iocb->ki_hint = hint;
if (do_opu)
- up_read(&fi->i_gc_rwsem[READ]);
- up_read(&fi->i_gc_rwsem[WRITE]);
+ f2fs_up_read(&fi->i_gc_rwsem[READ]);
+ f2fs_up_read(&fi->i_gc_rwsem[WRITE]);
if (ret < 0)
goto out;
@@ -4644,12 +4641,12 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
/* Don't leave any preallocated blocks around past i_size. */
if (preallocated && i_size_read(inode) < target_size) {
- down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
filemap_invalidate_lock(inode->i_mapping);
if (!f2fs_truncate(inode))
file_dont_truncate(inode);
filemap_invalidate_unlock(inode->i_mapping);
- up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
} else {
file_dont_truncate(inode);
}
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index ee308a8de432..ea5b93b689cd 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -103,23 +103,26 @@ static int gc_thread_func(void *data)
sbi->gc_urgent_high_remaining--;
}
spin_unlock(&sbi->gc_urgent_high_lock);
+ }
+ if (sbi->gc_mode == GC_URGENT_HIGH ||
+ sbi->gc_mode == GC_URGENT_MID) {
wait_ms = gc_th->urgent_sleep_time;
- down_write(&sbi->gc_lock);
+ f2fs_down_write(&sbi->gc_lock);
goto do_gc;
}
if (foreground) {
- down_write(&sbi->gc_lock);
+ f2fs_down_write(&sbi->gc_lock);
goto do_gc;
- } else if (!down_write_trylock(&sbi->gc_lock)) {
+ } else if (!f2fs_down_write_trylock(&sbi->gc_lock)) {
stat_other_skip_bggc_count(sbi);
goto next;
}
if (!is_idle(sbi, GC_TIME)) {
increase_sleep_time(gc_th, &wait_ms);
- up_write(&sbi->gc_lock);
+ f2fs_up_write(&sbi->gc_lock);
stat_io_skip_bggc_count(sbi);
goto next;
}
@@ -1038,8 +1041,10 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
set_sbi_flag(sbi, SBI_NEED_FSCK);
}
- if (f2fs_check_nid_range(sbi, dni->ino))
+ if (f2fs_check_nid_range(sbi, dni->ino)) {
+ f2fs_put_page(node_page, 1);
return false;
+ }
*nofs = ofs_of_node(node_page);
source_blkaddr = data_blkaddr(NULL, node_page, ofs_in_node);
@@ -1230,7 +1235,7 @@ static int move_data_block(struct inode *inode, block_t bidx,
fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr;
if (lfs_mode)
- down_write(&fio.sbi->io_order_lock);
+ f2fs_down_write(&fio.sbi->io_order_lock);
mpage = f2fs_grab_cache_page(META_MAPPING(fio.sbi),
fio.old_blkaddr, false);
@@ -1316,7 +1321,7 @@ recover_block:
true, true, true);
up_out:
if (lfs_mode)
- up_write(&fio.sbi->io_order_lock);
+ f2fs_up_write(&fio.sbi->io_order_lock);
put_out:
f2fs_put_dnode(&dn);
out:
@@ -1475,7 +1480,7 @@ next_step:
special_file(inode->i_mode))
continue;
- if (!down_write_trylock(
+ if (!f2fs_down_write_trylock(
&F2FS_I(inode)->i_gc_rwsem[WRITE])) {
iput(inode);
sbi->skipped_gc_rwsem++;
@@ -1488,7 +1493,7 @@ next_step:
if (f2fs_post_read_required(inode)) {
int err = ra_data_block(inode, start_bidx);
- up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
if (err) {
iput(inode);
continue;
@@ -1499,7 +1504,7 @@ next_step:
data_page = f2fs_get_read_data_page(inode,
start_bidx, REQ_RAHEAD, true);
- up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
if (IS_ERR(data_page)) {
iput(inode);
continue;
@@ -1518,14 +1523,14 @@ next_step:
int err;
if (S_ISREG(inode->i_mode)) {
- if (!down_write_trylock(&fi->i_gc_rwsem[READ])) {
+ if (!f2fs_down_write_trylock(&fi->i_gc_rwsem[READ])) {
sbi->skipped_gc_rwsem++;
continue;
}
- if (!down_write_trylock(
+ if (!f2fs_down_write_trylock(
&fi->i_gc_rwsem[WRITE])) {
sbi->skipped_gc_rwsem++;
- up_write(&fi->i_gc_rwsem[READ]);
+ f2fs_up_write(&fi->i_gc_rwsem[READ]);
continue;
}
locked = true;
@@ -1548,8 +1553,8 @@ next_step:
submitted++;
if (locked) {
- up_write(&fi->i_gc_rwsem[WRITE]);
- up_write(&fi->i_gc_rwsem[READ]);
+ f2fs_up_write(&fi->i_gc_rwsem[WRITE]);
+ f2fs_up_write(&fi->i_gc_rwsem[READ]);
}
stat_inc_data_blk_count(sbi, 1, gc_type);
@@ -1807,7 +1812,7 @@ stop:
reserved_segments(sbi),
prefree_segments(sbi));
- up_write(&sbi->gc_lock);
+ f2fs_up_write(&sbi->gc_lock);
put_gc_inode(&gc_list);
@@ -1936,7 +1941,7 @@ static void update_sb_metadata(struct f2fs_sb_info *sbi, int secs)
long long block_count;
int segs = secs * sbi->segs_per_sec;
- down_write(&sbi->sb_lock);
+ f2fs_down_write(&sbi->sb_lock);
section_count = le32_to_cpu(raw_sb->section_count);
segment_count = le32_to_cpu(raw_sb->segment_count);
@@ -1957,7 +1962,7 @@ static void update_sb_metadata(struct f2fs_sb_info *sbi, int secs)
cpu_to_le32(dev_segs + segs);
}
- up_write(&sbi->sb_lock);
+ f2fs_up_write(&sbi->sb_lock);
}
static void update_fs_metadata(struct f2fs_sb_info *sbi, int secs)
@@ -2031,7 +2036,7 @@ int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count)
secs = div_u64(shrunk_blocks, BLKS_PER_SEC(sbi));
/* stop other GC */
- if (!down_write_trylock(&sbi->gc_lock))
+ if (!f2fs_down_write_trylock(&sbi->gc_lock))
return -EAGAIN;
/* stop CP to protect MAIN_SEC in free_segment_range */
@@ -2051,15 +2056,15 @@ int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count)
out_unlock:
f2fs_unlock_op(sbi);
- up_write(&sbi->gc_lock);
+ f2fs_up_write(&sbi->gc_lock);
if (err)
return err;
set_sbi_flag(sbi, SBI_IS_RESIZEFS);
freeze_super(sbi->sb);
- down_write(&sbi->gc_lock);
- down_write(&sbi->cp_global_sem);
+ f2fs_down_write(&sbi->gc_lock);
+ f2fs_down_write(&sbi->cp_global_sem);
spin_lock(&sbi->stat_lock);
if (shrunk_blocks + valid_user_blocks(sbi) +
@@ -2104,8 +2109,8 @@ recover_out:
spin_unlock(&sbi->stat_lock);
}
out_err:
- up_write(&sbi->cp_global_sem);
- up_write(&sbi->gc_lock);
+ f2fs_up_write(&sbi->cp_global_sem);
+ f2fs_up_write(&sbi->gc_lock);
thaw_super(sbi->sb);
clear_sbi_flag(sbi, SBI_IS_RESIZEFS);
return err;
diff --git a/fs/f2fs/hash.c b/fs/f2fs/hash.c
index e3beac546c63..3cb1e7a24740 100644
--- a/fs/f2fs/hash.c
+++ b/fs/f2fs/hash.c
@@ -105,7 +105,7 @@ void f2fs_hash_filename(const struct inode *dir, struct f2fs_filename *fname)
return;
}
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
if (IS_CASEFOLDED(dir)) {
/*
* If the casefolded name is provided, hash it instead of the
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index 4b5cefa3f90c..a578bf83b803 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -629,7 +629,7 @@ int f2fs_add_inline_entry(struct inode *dir, const struct f2fs_filename *fname,
}
if (inode) {
- down_write(&F2FS_I(inode)->i_sem);
+ f2fs_down_write(&F2FS_I(inode)->i_sem);
page = f2fs_init_inode_metadata(inode, dir, fname, ipage);
if (IS_ERR(page)) {
err = PTR_ERR(page);
@@ -658,7 +658,7 @@ int f2fs_add_inline_entry(struct inode *dir, const struct f2fs_filename *fname,
f2fs_update_parent_metadata(dir, inode, 0);
fail:
if (inode)
- up_write(&F2FS_I(inode)->i_sem);
+ f2fs_up_write(&F2FS_I(inode)->i_sem);
out:
f2fs_put_page(ipage, 1);
return err;
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 0ec8e32a00b4..71f232dcf3c2 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -778,7 +778,8 @@ void f2fs_evict_inode(struct inode *inode)
f2fs_remove_ino_entry(sbi, inode->i_ino, UPDATE_INO);
f2fs_remove_ino_entry(sbi, inode->i_ino, FLUSH_INO);
- sb_start_intwrite(inode->i_sb);
+ if (!is_sbi_flag_set(sbi, SBI_IS_FREEZING))
+ sb_start_intwrite(inode->i_sb);
set_inode_flag(inode, FI_NO_ALLOC);
i_size_write(inode, 0);
retry:
@@ -809,7 +810,8 @@ retry:
if (dquot_initialize_needed(inode))
set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR);
}
- sb_end_intwrite(inode->i_sb);
+ if (!is_sbi_flag_set(sbi, SBI_IS_FREEZING))
+ sb_end_intwrite(inode->i_sb);
no_delete:
dquot_drop(inode);
@@ -885,6 +887,7 @@ void f2fs_handle_failed_inode(struct inode *inode)
err = f2fs_get_node_info(sbi, inode->i_ino, &ni, false);
if (err) {
set_sbi_flag(sbi, SBI_NEED_FSCK);
+ set_inode_flag(inode, FI_FREE_NID);
f2fs_warn(sbi, "May loss orphan inode, run fsck to fix.");
goto out;
}
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index a728a0af9ce0..5ed79b29999f 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -22,7 +22,8 @@
#include "acl.h"
#include <trace/events/f2fs.h>
-static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
+static struct inode *f2fs_new_inode(struct user_namespace *mnt_userns,
+ struct inode *dir, umode_t mode)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
nid_t ino;
@@ -46,7 +47,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
nid_free = true;
- inode_init_owner(&init_user_ns, inode, dir, mode);
+ inode_init_owner(mnt_userns, inode, dir, mode);
inode->i_ino = ino;
inode->i_blocks = 0;
@@ -67,7 +68,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
(F2FS_I(dir)->i_flags & F2FS_PROJINHERIT_FL))
F2FS_I(inode)->i_projid = F2FS_I(dir)->i_projid;
else
- F2FS_I(inode)->i_projid = make_kprojid(&init_user_ns,
+ F2FS_I(inode)->i_projid = make_kprojid(mnt_userns,
F2FS_DEF_PROJID);
err = fscrypt_prepare_new_inode(dir, inode, &encrypt);
@@ -196,7 +197,7 @@ static inline void set_file_temperature(struct f2fs_sb_info *sbi, struct inode *
__u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list;
int i, cold_count, hot_count;
- down_read(&sbi->sb_lock);
+ f2fs_down_read(&sbi->sb_lock);
cold_count = le32_to_cpu(sbi->raw_super->extension_count);
hot_count = sbi->raw_super->hot_ext_count;
@@ -206,7 +207,7 @@ static inline void set_file_temperature(struct f2fs_sb_info *sbi, struct inode *
break;
}
- up_read(&sbi->sb_lock);
+ f2fs_up_read(&sbi->sb_lock);
if (i == cold_count + hot_count)
return;
@@ -299,19 +300,19 @@ static void set_compress_inode(struct f2fs_sb_info *sbi, struct inode *inode,
(!ext_cnt && !noext_cnt))
return;
- down_read(&sbi->sb_lock);
+ f2fs_down_read(&sbi->sb_lock);
cold_count = le32_to_cpu(sbi->raw_super->extension_count);
hot_count = sbi->raw_super->hot_ext_count;
for (i = cold_count; i < cold_count + hot_count; i++) {
if (is_extension_exist(name, extlist[i], false)) {
- up_read(&sbi->sb_lock);
+ f2fs_up_read(&sbi->sb_lock);
return;
}
}
- up_read(&sbi->sb_lock);
+ f2fs_up_read(&sbi->sb_lock);
for (i = 0; i < noext_cnt; i++) {
if (is_extension_exist(name, noext[i], false)) {
@@ -349,7 +350,7 @@ static int f2fs_create(struct user_namespace *mnt_userns, struct inode *dir,
if (err)
return err;
- inode = f2fs_new_inode(dir, mode);
+ inode = f2fs_new_inode(mnt_userns, dir, mode);
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -561,7 +562,7 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry,
goto out_iput;
}
out_splice:
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
if (!inode && IS_CASEFOLDED(dir)) {
/* Eventually we want to call d_add_ci(dentry, NULL)
* for negative dentries in the encoding case as
@@ -622,7 +623,7 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry)
goto fail;
}
f2fs_delete_entry(de, page, dir, inode);
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
/* VFS negative dentries are incompatible with Encoding and
* Case-insensitiveness. Eventually we'll want avoid
* invalidating the dentries here, alongside with returning the
@@ -679,7 +680,7 @@ static int f2fs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
if (err)
return err;
- inode = f2fs_new_inode(dir, S_IFLNK | S_IRWXUGO);
+ inode = f2fs_new_inode(mnt_userns, dir, S_IFLNK | S_IRWXUGO);
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -750,7 +751,7 @@ static int f2fs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
if (err)
return err;
- inode = f2fs_new_inode(dir, S_IFDIR | mode);
+ inode = f2fs_new_inode(mnt_userns, dir, S_IFDIR | mode);
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -807,7 +808,7 @@ static int f2fs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
if (err)
return err;
- inode = f2fs_new_inode(dir, mode);
+ inode = f2fs_new_inode(mnt_userns, dir, mode);
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -834,8 +835,9 @@ out:
return err;
}
-static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry,
- umode_t mode, struct inode **whiteout)
+static int __f2fs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *dentry, umode_t mode,
+ struct inode **whiteout)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
struct inode *inode;
@@ -845,7 +847,7 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry,
if (err)
return err;
- inode = f2fs_new_inode(dir, mode);
+ inode = f2fs_new_inode(mnt_userns, dir, mode);
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -909,20 +911,22 @@ static int f2fs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
if (!f2fs_is_checkpoint_ready(sbi))
return -ENOSPC;
- return __f2fs_tmpfile(dir, dentry, mode, NULL);
+ return __f2fs_tmpfile(mnt_userns, dir, dentry, mode, NULL);
}
-static int f2fs_create_whiteout(struct inode *dir, struct inode **whiteout)
+static int f2fs_create_whiteout(struct user_namespace *mnt_userns,
+ struct inode *dir, struct inode **whiteout)
{
if (unlikely(f2fs_cp_error(F2FS_I_SB(dir))))
return -EIO;
- return __f2fs_tmpfile(dir, NULL, S_IFCHR | WHITEOUT_MODE, whiteout);
+ return __f2fs_tmpfile(mnt_userns, dir, NULL,
+ S_IFCHR | WHITEOUT_MODE, whiteout);
}
-static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
- struct inode *new_dir, struct dentry *new_dentry,
- unsigned int flags)
+static int f2fs_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
+ struct dentry *old_dentry, struct inode *new_dir,
+ struct dentry *new_dentry, unsigned int flags)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(old_dir);
struct inode *old_inode = d_inode(old_dentry);
@@ -960,7 +964,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
}
if (flags & RENAME_WHITEOUT) {
- err = f2fs_create_whiteout(old_dir, &whiteout);
+ err = f2fs_create_whiteout(mnt_userns, old_dir, &whiteout);
if (err)
return err;
}
@@ -1023,11 +1027,11 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
new_page = NULL;
new_inode->i_ctime = current_time(new_inode);
- down_write(&F2FS_I(new_inode)->i_sem);
+ f2fs_down_write(&F2FS_I(new_inode)->i_sem);
if (old_dir_entry)
f2fs_i_links_write(new_inode, false);
f2fs_i_links_write(new_inode, false);
- up_write(&F2FS_I(new_inode)->i_sem);
+ f2fs_up_write(&F2FS_I(new_inode)->i_sem);
if (!new_inode->i_nlink)
f2fs_add_orphan_inode(new_inode);
@@ -1048,13 +1052,13 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
f2fs_i_links_write(new_dir, true);
}
- down_write(&F2FS_I(old_inode)->i_sem);
+ f2fs_down_write(&F2FS_I(old_inode)->i_sem);
if (!old_dir_entry || whiteout)
file_lost_pino(old_inode);
else
/* adjust dir's i_pino to pass fsck check */
f2fs_i_pino_write(old_inode, new_dir->i_ino);
- up_write(&F2FS_I(old_inode)->i_sem);
+ f2fs_up_write(&F2FS_I(old_inode)->i_sem);
old_inode->i_ctime = current_time(old_inode);
f2fs_mark_inode_dirty_sync(old_inode, false);
@@ -1107,8 +1111,7 @@ out_dir:
out_old:
f2fs_put_page(old_page, 0);
out:
- if (whiteout)
- iput(whiteout);
+ iput(whiteout);
return err;
}
@@ -1214,38 +1217,38 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
/* update directory entry info of old dir inode */
f2fs_set_link(old_dir, old_entry, old_page, new_inode);
- down_write(&F2FS_I(old_inode)->i_sem);
+ f2fs_down_write(&F2FS_I(old_inode)->i_sem);
if (!old_dir_entry)
file_lost_pino(old_inode);
else
/* adjust dir's i_pino to pass fsck check */
f2fs_i_pino_write(old_inode, new_dir->i_ino);
- up_write(&F2FS_I(old_inode)->i_sem);
+ f2fs_up_write(&F2FS_I(old_inode)->i_sem);
old_dir->i_ctime = current_time(old_dir);
if (old_nlink) {
- down_write(&F2FS_I(old_dir)->i_sem);
+ f2fs_down_write(&F2FS_I(old_dir)->i_sem);
f2fs_i_links_write(old_dir, old_nlink > 0);
- up_write(&F2FS_I(old_dir)->i_sem);
+ f2fs_up_write(&F2FS_I(old_dir)->i_sem);
}
f2fs_mark_inode_dirty_sync(old_dir, false);
/* update directory entry info of new dir inode */
f2fs_set_link(new_dir, new_entry, new_page, old_inode);
- down_write(&F2FS_I(new_inode)->i_sem);
+ f2fs_down_write(&F2FS_I(new_inode)->i_sem);
if (!new_dir_entry)
file_lost_pino(new_inode);
else
/* adjust dir's i_pino to pass fsck check */
f2fs_i_pino_write(new_inode, old_dir->i_ino);
- up_write(&F2FS_I(new_inode)->i_sem);
+ f2fs_up_write(&F2FS_I(new_inode)->i_sem);
new_dir->i_ctime = current_time(new_dir);
if (new_nlink) {
- down_write(&F2FS_I(new_dir)->i_sem);
+ f2fs_down_write(&F2FS_I(new_dir)->i_sem);
f2fs_i_links_write(new_dir, new_nlink > 0);
- up_write(&F2FS_I(new_dir)->i_sem);
+ f2fs_up_write(&F2FS_I(new_dir)->i_sem);
}
f2fs_mark_inode_dirty_sync(new_dir, false);
@@ -1300,7 +1303,8 @@ static int f2fs_rename2(struct user_namespace *mnt_userns,
* VFS has already handled the new dentry existence case,
* here, we just deal with "RENAME_NOREPLACE" as regular rename.
*/
- return f2fs_rename(old_dir, old_dentry, new_dir, new_dentry, flags);
+ return f2fs_rename(mnt_userns, old_dir, old_dentry,
+ new_dir, new_dentry, flags);
}
static const char *f2fs_encrypted_get_link(struct dentry *dentry,
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 50b2874e758c..0b6e741e94a0 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -382,14 +382,14 @@ int f2fs_need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid)
struct nat_entry *e;
bool need = false;
- down_read(&nm_i->nat_tree_lock);
+ f2fs_down_read(&nm_i->nat_tree_lock);
e = __lookup_nat_cache(nm_i, nid);
if (e) {
if (!get_nat_flag(e, IS_CHECKPOINTED) &&
!get_nat_flag(e, HAS_FSYNCED_INODE))
need = true;
}
- up_read(&nm_i->nat_tree_lock);
+ f2fs_up_read(&nm_i->nat_tree_lock);
return need;
}
@@ -399,11 +399,11 @@ bool f2fs_is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid)
struct nat_entry *e;
bool is_cp = true;
- down_read(&nm_i->nat_tree_lock);
+ f2fs_down_read(&nm_i->nat_tree_lock);
e = __lookup_nat_cache(nm_i, nid);
if (e && !get_nat_flag(e, IS_CHECKPOINTED))
is_cp = false;
- up_read(&nm_i->nat_tree_lock);
+ f2fs_up_read(&nm_i->nat_tree_lock);
return is_cp;
}
@@ -413,13 +413,13 @@ bool f2fs_need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino)
struct nat_entry *e;
bool need_update = true;
- down_read(&nm_i->nat_tree_lock);
+ f2fs_down_read(&nm_i->nat_tree_lock);
e = __lookup_nat_cache(nm_i, ino);
if (e && get_nat_flag(e, HAS_LAST_FSYNC) &&
(get_nat_flag(e, IS_CHECKPOINTED) ||
get_nat_flag(e, HAS_FSYNCED_INODE)))
need_update = false;
- up_read(&nm_i->nat_tree_lock);
+ f2fs_up_read(&nm_i->nat_tree_lock);
return need_update;
}
@@ -431,14 +431,14 @@ static void cache_nat_entry(struct f2fs_sb_info *sbi, nid_t nid,
struct nat_entry *new, *e;
/* Let's mitigate lock contention of nat_tree_lock during checkpoint */
- if (rwsem_is_locked(&sbi->cp_global_sem))
+ if (f2fs_rwsem_is_locked(&sbi->cp_global_sem))
return;
new = __alloc_nat_entry(sbi, nid, false);
if (!new)
return;
- down_write(&nm_i->nat_tree_lock);
+ f2fs_down_write(&nm_i->nat_tree_lock);
e = __lookup_nat_cache(nm_i, nid);
if (!e)
e = __init_nat_entry(nm_i, new, ne, false);
@@ -447,7 +447,7 @@ static void cache_nat_entry(struct f2fs_sb_info *sbi, nid_t nid,
nat_get_blkaddr(e) !=
le32_to_cpu(ne->block_addr) ||
nat_get_version(e) != ne->version);
- up_write(&nm_i->nat_tree_lock);
+ f2fs_up_write(&nm_i->nat_tree_lock);
if (e != new)
__free_nat_entry(new);
}
@@ -459,7 +459,7 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
struct nat_entry *e;
struct nat_entry *new = __alloc_nat_entry(sbi, ni->nid, true);
- down_write(&nm_i->nat_tree_lock);
+ f2fs_down_write(&nm_i->nat_tree_lock);
e = __lookup_nat_cache(nm_i, ni->nid);
if (!e) {
e = __init_nat_entry(nm_i, new, NULL, true);
@@ -508,7 +508,7 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
set_nat_flag(e, HAS_FSYNCED_INODE, true);
set_nat_flag(e, HAS_LAST_FSYNC, fsync_done);
}
- up_write(&nm_i->nat_tree_lock);
+ f2fs_up_write(&nm_i->nat_tree_lock);
}
int f2fs_try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink)
@@ -516,7 +516,7 @@ int f2fs_try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink)
struct f2fs_nm_info *nm_i = NM_I(sbi);
int nr = nr_shrink;
- if (!down_write_trylock(&nm_i->nat_tree_lock))
+ if (!f2fs_down_write_trylock(&nm_i->nat_tree_lock))
return 0;
spin_lock(&nm_i->nat_list_lock);
@@ -538,7 +538,7 @@ int f2fs_try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink)
}
spin_unlock(&nm_i->nat_list_lock);
- up_write(&nm_i->nat_tree_lock);
+ f2fs_up_write(&nm_i->nat_tree_lock);
return nr - nr_shrink;
}
@@ -560,13 +560,13 @@ int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid,
ni->nid = nid;
retry:
/* Check nat cache */
- down_read(&nm_i->nat_tree_lock);
+ f2fs_down_read(&nm_i->nat_tree_lock);
e = __lookup_nat_cache(nm_i, nid);
if (e) {
ni->ino = nat_get_ino(e);
ni->blk_addr = nat_get_blkaddr(e);
ni->version = nat_get_version(e);
- up_read(&nm_i->nat_tree_lock);
+ f2fs_up_read(&nm_i->nat_tree_lock);
return 0;
}
@@ -576,11 +576,11 @@ retry:
* nat_tree_lock. Therefore, we should retry, if we failed to grab here
* while not bothering checkpoint.
*/
- if (!rwsem_is_locked(&sbi->cp_global_sem) || checkpoint_context) {
+ if (!f2fs_rwsem_is_locked(&sbi->cp_global_sem) || checkpoint_context) {
down_read(&curseg->journal_rwsem);
- } else if (rwsem_is_contended(&nm_i->nat_tree_lock) ||
+ } else if (f2fs_rwsem_is_contended(&nm_i->nat_tree_lock) ||
!down_read_trylock(&curseg->journal_rwsem)) {
- up_read(&nm_i->nat_tree_lock);
+ f2fs_up_read(&nm_i->nat_tree_lock);
goto retry;
}
@@ -589,15 +589,15 @@ retry:
ne = nat_in_journal(journal, i);
node_info_from_raw_nat(ni, &ne);
}
- up_read(&curseg->journal_rwsem);
+ up_read(&curseg->journal_rwsem);
if (i >= 0) {
- up_read(&nm_i->nat_tree_lock);
+ f2fs_up_read(&nm_i->nat_tree_lock);
goto cache;
}
/* Fill node_info from nat page */
index = current_nat_addr(sbi, nid);
- up_read(&nm_i->nat_tree_lock);
+ f2fs_up_read(&nm_i->nat_tree_lock);
page = f2fs_get_meta_page(sbi, index);
if (IS_ERR(page))
@@ -1609,17 +1609,17 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted,
goto redirty_out;
if (wbc->for_reclaim) {
- if (!down_read_trylock(&sbi->node_write))
+ if (!f2fs_down_read_trylock(&sbi->node_write))
goto redirty_out;
} else {
- down_read(&sbi->node_write);
+ f2fs_down_read(&sbi->node_write);
}
/* This page is already truncated */
if (unlikely(ni.blk_addr == NULL_ADDR)) {
ClearPageUptodate(page);
dec_page_count(sbi, F2FS_DIRTY_NODES);
- up_read(&sbi->node_write);
+ f2fs_up_read(&sbi->node_write);
unlock_page(page);
return 0;
}
@@ -1627,7 +1627,7 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted,
if (__is_valid_data_blkaddr(ni.blk_addr) &&
!f2fs_is_valid_blkaddr(sbi, ni.blk_addr,
DATA_GENERIC_ENHANCE)) {
- up_read(&sbi->node_write);
+ f2fs_up_read(&sbi->node_write);
goto redirty_out;
}
@@ -1648,7 +1648,7 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted,
f2fs_do_write_node_page(nid, &fio);
set_node_addr(sbi, &ni, fio.new_blkaddr, is_fsync_dnode(page));
dec_page_count(sbi, F2FS_DIRTY_NODES);
- up_read(&sbi->node_write);
+ f2fs_up_read(&sbi->node_write);
if (wbc->for_reclaim) {
f2fs_submit_merged_write_cond(sbi, NULL, page, 0, NODE);
@@ -1782,6 +1782,7 @@ continue_unlock:
if (!atomic || page == last_page) {
set_fsync_mark(page, 1);
+ percpu_counter_inc(&sbi->rf_node_block_count);
if (IS_INODE(page)) {
if (is_inode_flag_set(inode,
FI_DIRTY_INODE))
@@ -2111,8 +2112,12 @@ static int f2fs_write_node_pages(struct address_space *mapping,
if (wbc->sync_mode == WB_SYNC_ALL)
atomic_inc(&sbi->wb_sync_req[NODE]);
- else if (atomic_read(&sbi->wb_sync_req[NODE]))
+ else if (atomic_read(&sbi->wb_sync_req[NODE])) {
+ /* to avoid potential deadlock */
+ if (current->plug)
+ blk_finish_plug(current->plug);
goto skip_write;
+ }
trace_f2fs_writepages(mapping->host, wbc, NODE);
@@ -2132,23 +2137,24 @@ skip_write:
return 0;
}
-static int f2fs_set_node_page_dirty(struct page *page)
+static bool f2fs_dirty_node_folio(struct address_space *mapping,
+ struct folio *folio)
{
- trace_f2fs_set_page_dirty(page, NODE);
+ trace_f2fs_set_page_dirty(&folio->page, NODE);
- if (!PageUptodate(page))
- SetPageUptodate(page);
+ if (!folio_test_uptodate(folio))
+ folio_mark_uptodate(folio);
#ifdef CONFIG_F2FS_CHECK_FS
- if (IS_INODE(page))
- f2fs_inode_chksum_set(F2FS_P_SB(page), page);
+ if (IS_INODE(&folio->page))
+ f2fs_inode_chksum_set(F2FS_P_SB(&folio->page), &folio->page);
#endif
- if (!PageDirty(page)) {
- __set_page_dirty_nobuffers(page);
- inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES);
- set_page_private_reference(page);
- return 1;
+ if (!folio_test_dirty(folio)) {
+ filemap_dirty_folio(mapping, folio);
+ inc_page_count(F2FS_P_SB(&folio->page), F2FS_DIRTY_NODES);
+ set_page_private_reference(&folio->page);
+ return true;
}
- return 0;
+ return false;
}
/*
@@ -2157,8 +2163,8 @@ static int f2fs_set_node_page_dirty(struct page *page)
const struct address_space_operations f2fs_node_aops = {
.writepage = f2fs_write_node_page,
.writepages = f2fs_write_node_pages,
- .set_page_dirty = f2fs_set_node_page_dirty,
- .invalidatepage = f2fs_invalidate_page,
+ .dirty_folio = f2fs_dirty_node_folio,
+ .invalidate_folio = f2fs_invalidate_folio,
.releasepage = f2fs_release_page,
#ifdef CONFIG_MIGRATION
.migratepage = f2fs_migrate_page,
@@ -2225,14 +2231,14 @@ bool f2fs_nat_bitmap_enabled(struct f2fs_sb_info *sbi)
unsigned int i;
bool ret = true;
- down_read(&nm_i->nat_tree_lock);
+ f2fs_down_read(&nm_i->nat_tree_lock);
for (i = 0; i < nm_i->nat_blocks; i++) {
if (!test_bit_le(i, nm_i->nat_block_bitmap)) {
ret = false;
break;
}
}
- up_read(&nm_i->nat_tree_lock);
+ f2fs_up_read(&nm_i->nat_tree_lock);
return ret;
}
@@ -2415,7 +2421,7 @@ static void scan_free_nid_bits(struct f2fs_sb_info *sbi)
unsigned int i, idx;
nid_t nid;
- down_read(&nm_i->nat_tree_lock);
+ f2fs_down_read(&nm_i->nat_tree_lock);
for (i = 0; i < nm_i->nat_blocks; i++) {
if (!test_bit_le(i, nm_i->nat_block_bitmap))
@@ -2438,7 +2444,7 @@ static void scan_free_nid_bits(struct f2fs_sb_info *sbi)
out:
scan_curseg_cache(sbi);
- up_read(&nm_i->nat_tree_lock);
+ f2fs_up_read(&nm_i->nat_tree_lock);
}
static int __f2fs_build_free_nids(struct f2fs_sb_info *sbi,
@@ -2473,7 +2479,7 @@ static int __f2fs_build_free_nids(struct f2fs_sb_info *sbi,
f2fs_ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES,
META_NAT, true);
- down_read(&nm_i->nat_tree_lock);
+ f2fs_down_read(&nm_i->nat_tree_lock);
while (1) {
if (!test_bit_le(NAT_BLOCK_OFFSET(nid),
@@ -2488,7 +2494,7 @@ static int __f2fs_build_free_nids(struct f2fs_sb_info *sbi,
}
if (ret) {
- up_read(&nm_i->nat_tree_lock);
+ f2fs_up_read(&nm_i->nat_tree_lock);
f2fs_err(sbi, "NAT is corrupt, run fsck to fix it");
return ret;
}
@@ -2508,7 +2514,7 @@ static int __f2fs_build_free_nids(struct f2fs_sb_info *sbi,
/* find free nids from current sum_pages */
scan_curseg_cache(sbi);
- up_read(&nm_i->nat_tree_lock);
+ f2fs_up_read(&nm_i->nat_tree_lock);
f2fs_ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nm_i->next_scan_nid),
nm_i->ra_nid_pages, META_NAT, false);
@@ -2953,7 +2959,7 @@ void f2fs_enable_nat_bits(struct f2fs_sb_info *sbi)
struct f2fs_nm_info *nm_i = NM_I(sbi);
unsigned int nat_ofs;
- down_read(&nm_i->nat_tree_lock);
+ f2fs_down_read(&nm_i->nat_tree_lock);
for (nat_ofs = 0; nat_ofs < nm_i->nat_blocks; nat_ofs++) {
unsigned int valid = 0, nid_ofs = 0;
@@ -2973,7 +2979,7 @@ void f2fs_enable_nat_bits(struct f2fs_sb_info *sbi)
__update_nat_bits(nm_i, nat_ofs, valid);
}
- up_read(&nm_i->nat_tree_lock);
+ f2fs_up_read(&nm_i->nat_tree_lock);
}
static int __flush_nat_entry_set(struct f2fs_sb_info *sbi,
@@ -3071,15 +3077,15 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
* nat_cnt[DIRTY_NAT].
*/
if (cpc->reason & CP_UMOUNT) {
- down_write(&nm_i->nat_tree_lock);
+ f2fs_down_write(&nm_i->nat_tree_lock);
remove_nats_in_journal(sbi);
- up_write(&nm_i->nat_tree_lock);
+ f2fs_up_write(&nm_i->nat_tree_lock);
}
if (!nm_i->nat_cnt[DIRTY_NAT])
return 0;
- down_write(&nm_i->nat_tree_lock);
+ f2fs_down_write(&nm_i->nat_tree_lock);
/*
* if there are no enough space in journal to store dirty nat
@@ -3108,7 +3114,7 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
break;
}
- up_write(&nm_i->nat_tree_lock);
+ f2fs_up_write(&nm_i->nat_tree_lock);
/* Allow dirty nats by node block allocation in write_begin */
return err;
@@ -3218,6 +3224,7 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
nm_i->ram_thresh = DEF_RAM_THRESHOLD;
nm_i->ra_nid_pages = DEF_RA_NID_PAGES;
nm_i->dirty_nats_ratio = DEF_DIRTY_NAT_RATIO_THRESHOLD;
+ nm_i->max_rf_node_blocks = DEF_RF_NODE_BLOCKS;
INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC);
INIT_LIST_HEAD(&nm_i->free_nid_list);
@@ -3228,7 +3235,7 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
mutex_init(&nm_i->build_lock);
spin_lock_init(&nm_i->nid_list_lock);
- init_rwsem(&nm_i->nat_tree_lock);
+ init_f2fs_rwsem(&nm_i->nat_tree_lock);
nm_i->next_scan_nid = le32_to_cpu(sbi->ckpt->next_free_nid);
nm_i->bitmap_size = __bitmap_size(sbi, NAT_BITMAP);
@@ -3334,7 +3341,7 @@ void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi)
spin_unlock(&nm_i->nid_list_lock);
/* destroy nat cache */
- down_write(&nm_i->nat_tree_lock);
+ f2fs_down_write(&nm_i->nat_tree_lock);
while ((found = __gang_lookup_nat_cache(nm_i,
nid, NATVEC_SIZE, natvec))) {
unsigned idx;
@@ -3364,7 +3371,7 @@ void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi)
kmem_cache_free(nat_entry_set_slab, setvec[idx]);
}
}
- up_write(&nm_i->nat_tree_lock);
+ f2fs_up_write(&nm_i->nat_tree_lock);
kvfree(nm_i->nat_block_bitmap);
if (nm_i->free_nid_bitmap) {
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index 18b98cf0465b..4c1d34bfea78 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -31,6 +31,9 @@
/* control total # of nats */
#define DEF_NAT_CACHE_THRESHOLD 100000
+/* control total # of node writes used for roll-fowrad recovery */
+#define DEF_RF_NODE_BLOCKS 0
+
/* vector size for gang look-up from nat cache that consists of radix tree */
#define NATVEC_SIZE 64
#define SETVEC_SIZE 32
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index 9683c80ff8c2..3cb7f8a43b4d 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -46,7 +46,7 @@
static struct kmem_cache *fsync_entry_slab;
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
extern struct kmem_cache *f2fs_cf_name_slab;
#endif
@@ -56,6 +56,10 @@ bool f2fs_space_for_roll_forward(struct f2fs_sb_info *sbi)
if (sbi->last_valid_block_count + nalloc > sbi->user_block_count)
return false;
+ if (NM_I(sbi)->max_rf_node_blocks &&
+ percpu_counter_sum_positive(&sbi->rf_node_block_count) >=
+ NM_I(sbi)->max_rf_node_blocks)
+ return false;
return true;
}
@@ -149,7 +153,7 @@ static int init_recovered_filename(const struct inode *dir,
if (err)
return err;
f2fs_hash_filename(dir, fname);
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
/* Case-sensitive match is fine for recovery */
kmem_cache_free(f2fs_cf_name_slab, fname->cf_name.name);
fname->cf_name.name = NULL;
@@ -343,6 +347,19 @@ static int recover_inode(struct inode *inode, struct page *page)
return 0;
}
+static unsigned int adjust_por_ra_blocks(struct f2fs_sb_info *sbi,
+ unsigned int ra_blocks, unsigned int blkaddr,
+ unsigned int next_blkaddr)
+{
+ if (blkaddr + 1 == next_blkaddr)
+ ra_blocks = min_t(unsigned int, RECOVERY_MAX_RA_BLOCKS,
+ ra_blocks * 2);
+ else if (next_blkaddr % sbi->blocks_per_seg)
+ ra_blocks = max_t(unsigned int, RECOVERY_MIN_RA_BLOCKS,
+ ra_blocks / 2);
+ return ra_blocks;
+}
+
static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head,
bool check_only)
{
@@ -350,6 +367,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head,
struct page *page = NULL;
block_t blkaddr;
unsigned int loop_cnt = 0;
+ unsigned int ra_blocks = RECOVERY_MAX_RA_BLOCKS;
unsigned int free_blocks = MAIN_SEGS(sbi) * sbi->blocks_per_seg -
valid_user_blocks(sbi);
int err = 0;
@@ -424,11 +442,14 @@ next:
break;
}
+ ra_blocks = adjust_por_ra_blocks(sbi, ra_blocks, blkaddr,
+ next_blkaddr_of_node(page));
+
/* check next segment */
blkaddr = next_blkaddr_of_node(page);
f2fs_put_page(page, 1);
- f2fs_ra_meta_pages_cond(sbi, blkaddr);
+ f2fs_ra_meta_pages_cond(sbi, blkaddr, ra_blocks);
}
return err;
}
@@ -704,6 +725,7 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list,
struct page *page = NULL;
int err = 0;
block_t blkaddr;
+ unsigned int ra_blocks = RECOVERY_MAX_RA_BLOCKS;
/* get node pages in the current segment */
curseg = CURSEG_I(sbi, CURSEG_WARM_NODE);
@@ -715,8 +737,6 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list,
if (!f2fs_is_valid_blkaddr(sbi, blkaddr, META_POR))
break;
- f2fs_ra_meta_pages_cond(sbi, blkaddr);
-
page = f2fs_get_tmp_page(sbi, blkaddr);
if (IS_ERR(page)) {
err = PTR_ERR(page);
@@ -759,9 +779,14 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list,
if (entry->blkaddr == blkaddr)
list_move_tail(&entry->list, tmp_inode_list);
next:
+ ra_blocks = adjust_por_ra_blocks(sbi, ra_blocks, blkaddr,
+ next_blkaddr_of_node(page));
+
/* check next segment */
blkaddr = next_blkaddr_of_node(page);
f2fs_put_page(page, 1);
+
+ f2fs_ra_meta_pages_cond(sbi, blkaddr, ra_blocks);
}
if (!err)
f2fs_allocate_new_segments(sbi);
@@ -796,7 +821,7 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only)
INIT_LIST_HEAD(&dir_list);
/* prevent checkpoint */
- down_write(&sbi->cp_global_sem);
+ f2fs_down_write(&sbi->cp_global_sem);
/* step #1: find fsynced inode numbers */
err = find_fsync_dnodes(sbi, &inode_list, check_only);
@@ -845,7 +870,7 @@ skip:
if (!err)
clear_sbi_flag(sbi, SBI_POR_DOING);
- up_write(&sbi->cp_global_sem);
+ f2fs_up_write(&sbi->cp_global_sem);
/* let's drop all the directory inodes for clean checkpoint */
destroy_fsync_dnodes(&dir_list, err);
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 1dabc8244083..22dfeb991529 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -313,8 +313,7 @@ next:
skip:
iput(inode);
}
- congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT);
- cond_resched();
+ f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT);
if (gc_failure) {
if (++looped >= count)
return;
@@ -471,7 +470,7 @@ int f2fs_commit_inmem_pages(struct inode *inode)
f2fs_balance_fs(sbi, true);
- down_write(&fi->i_gc_rwsem[WRITE]);
+ f2fs_down_write(&fi->i_gc_rwsem[WRITE]);
f2fs_lock_op(sbi);
set_inode_flag(inode, FI_ATOMIC_COMMIT);
@@ -483,7 +482,7 @@ int f2fs_commit_inmem_pages(struct inode *inode)
clear_inode_flag(inode, FI_ATOMIC_COMMIT);
f2fs_unlock_op(sbi);
- up_write(&fi->i_gc_rwsem[WRITE]);
+ f2fs_up_write(&fi->i_gc_rwsem[WRITE]);
return err;
}
@@ -521,7 +520,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need)
io_schedule();
finish_wait(&sbi->gc_thread->fggc_wq, &wait);
} else {
- down_write(&sbi->gc_lock);
+ f2fs_down_write(&sbi->gc_lock);
f2fs_gc(sbi, false, false, false, NULL_SEGNO);
}
}
@@ -529,7 +528,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need)
static inline bool excess_dirty_threshold(struct f2fs_sb_info *sbi)
{
- int factor = rwsem_is_locked(&sbi->cp_rwsem) ? 3 : 2;
+ int factor = f2fs_rwsem_is_locked(&sbi->cp_rwsem) ? 3 : 2;
unsigned int dents = get_pages(sbi, F2FS_DIRTY_DENTS);
unsigned int qdata = get_pages(sbi, F2FS_DIRTY_QDATA);
unsigned int nodes = get_pages(sbi, F2FS_DIRTY_NODES);
@@ -570,7 +569,7 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi, bool from_bg)
/* there is background inflight IO or foreground operation recently */
if (is_inflight_io(sbi, REQ_TIME) ||
- (!f2fs_time_over(sbi, REQ_TIME) && rwsem_is_locked(&sbi->cp_rwsem)))
+ (!f2fs_time_over(sbi, REQ_TIME) && f2fs_rwsem_is_locked(&sbi->cp_rwsem)))
return;
/* exceed periodical checkpoint timeout threshold */
@@ -803,8 +802,7 @@ int f2fs_flush_device_cache(struct f2fs_sb_info *sbi)
do {
ret = __submit_flush_wait(sbi, FDEV(i).bdev);
if (ret)
- congestion_wait(BLK_RW_ASYNC,
- DEFAULT_IO_TIMEOUT);
+ f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT);
} while (ret && --count);
if (ret) {
@@ -1156,14 +1154,14 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi,
dpolicy->ordered = false;
dpolicy->granularity = granularity;
- dpolicy->max_requests = DEF_MAX_DISCARD_REQUEST;
+ dpolicy->max_requests = dcc->max_discard_request;
dpolicy->io_aware_gran = MAX_PLIST_NUM;
dpolicy->timeout = false;
if (discard_type == DPOLICY_BG) {
- dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME;
- dpolicy->mid_interval = DEF_MID_DISCARD_ISSUE_TIME;
- dpolicy->max_interval = DEF_MAX_DISCARD_ISSUE_TIME;
+ dpolicy->min_interval = dcc->min_discard_issue_time;
+ dpolicy->mid_interval = dcc->mid_discard_issue_time;
+ dpolicy->max_interval = dcc->max_discard_issue_time;
dpolicy->io_aware = true;
dpolicy->sync = false;
dpolicy->ordered = true;
@@ -1171,12 +1169,12 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi,
dpolicy->granularity = 1;
if (atomic_read(&dcc->discard_cmd_cnt))
dpolicy->max_interval =
- DEF_MIN_DISCARD_ISSUE_TIME;
+ dcc->min_discard_issue_time;
}
} else if (discard_type == DPOLICY_FORCE) {
- dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME;
- dpolicy->mid_interval = DEF_MID_DISCARD_ISSUE_TIME;
- dpolicy->max_interval = DEF_MAX_DISCARD_ISSUE_TIME;
+ dpolicy->min_interval = dcc->min_discard_issue_time;
+ dpolicy->mid_interval = dcc->mid_discard_issue_time;
+ dpolicy->max_interval = dcc->max_discard_issue_time;
dpolicy->io_aware = false;
} else if (discard_type == DPOLICY_FSTRIM) {
dpolicy->io_aware = false;
@@ -1781,7 +1779,7 @@ static int issue_discard_thread(void *data)
struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
wait_queue_head_t *q = &dcc->discard_wait_queue;
struct discard_policy dpolicy;
- unsigned int wait_ms = DEF_MIN_DISCARD_ISSUE_TIME;
+ unsigned int wait_ms = dcc->min_discard_issue_time;
int issued;
set_freezable();
@@ -2180,6 +2178,10 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi)
atomic_set(&dcc->discard_cmd_cnt, 0);
dcc->nr_discards = 0;
dcc->max_discards = MAIN_SEGS(sbi) << sbi->log_blocks_per_seg;
+ dcc->max_discard_request = DEF_MAX_DISCARD_REQUEST;
+ dcc->min_discard_issue_time = DEF_MIN_DISCARD_ISSUE_TIME;
+ dcc->mid_discard_issue_time = DEF_MID_DISCARD_ISSUE_TIME;
+ dcc->max_discard_issue_time = DEF_MAX_DISCARD_ISSUE_TIME;
dcc->undiscard_blks = 0;
dcc->next_pos = 0;
dcc->root = RB_ROOT_CACHED;
@@ -2821,7 +2823,7 @@ static void __f2fs_init_atgc_curseg(struct f2fs_sb_info *sbi)
if (!sbi->am.atgc_enabled)
return;
- down_read(&SM_I(sbi)->curseg_lock);
+ f2fs_down_read(&SM_I(sbi)->curseg_lock);
mutex_lock(&curseg->curseg_mutex);
down_write(&SIT_I(sbi)->sentry_lock);
@@ -2831,7 +2833,7 @@ static void __f2fs_init_atgc_curseg(struct f2fs_sb_info *sbi)
up_write(&SIT_I(sbi)->sentry_lock);
mutex_unlock(&curseg->curseg_mutex);
- up_read(&SM_I(sbi)->curseg_lock);
+ f2fs_up_read(&SM_I(sbi)->curseg_lock);
}
void f2fs_init_inmem_curseg(struct f2fs_sb_info *sbi)
@@ -2982,7 +2984,7 @@ void f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type,
struct curseg_info *curseg = CURSEG_I(sbi, type);
unsigned int segno;
- down_read(&SM_I(sbi)->curseg_lock);
+ f2fs_down_read(&SM_I(sbi)->curseg_lock);
mutex_lock(&curseg->curseg_mutex);
down_write(&SIT_I(sbi)->sentry_lock);
@@ -3006,7 +3008,7 @@ unlock:
type, segno, curseg->segno);
mutex_unlock(&curseg->curseg_mutex);
- up_read(&SM_I(sbi)->curseg_lock);
+ f2fs_up_read(&SM_I(sbi)->curseg_lock);
}
static void __allocate_new_segment(struct f2fs_sb_info *sbi, int type,
@@ -3038,23 +3040,23 @@ static void __allocate_new_section(struct f2fs_sb_info *sbi,
void f2fs_allocate_new_section(struct f2fs_sb_info *sbi, int type, bool force)
{
- down_read(&SM_I(sbi)->curseg_lock);
+ f2fs_down_read(&SM_I(sbi)->curseg_lock);
down_write(&SIT_I(sbi)->sentry_lock);
__allocate_new_section(sbi, type, force);
up_write(&SIT_I(sbi)->sentry_lock);
- up_read(&SM_I(sbi)->curseg_lock);
+ f2fs_up_read(&SM_I(sbi)->curseg_lock);
}
void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi)
{
int i;
- down_read(&SM_I(sbi)->curseg_lock);
+ f2fs_down_read(&SM_I(sbi)->curseg_lock);
down_write(&SIT_I(sbi)->sentry_lock);
for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++)
__allocate_new_segment(sbi, i, false, false);
up_write(&SIT_I(sbi)->sentry_lock);
- up_read(&SM_I(sbi)->curseg_lock);
+ f2fs_up_read(&SM_I(sbi)->curseg_lock);
}
static const struct segment_allocation default_salloc_ops = {
@@ -3133,7 +3135,7 @@ next:
blk_finish_plug(&plug);
mutex_unlock(&dcc->cmd_lock);
trimmed += __wait_all_discard_cmd(sbi, NULL);
- congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT);
+ f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT);
goto next;
}
skip:
@@ -3192,9 +3194,9 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range)
if (sbi->discard_blks == 0)
goto out;
- down_write(&sbi->gc_lock);
+ f2fs_down_write(&sbi->gc_lock);
err = f2fs_write_checkpoint(sbi, &cpc);
- up_write(&sbi->gc_lock);
+ f2fs_up_write(&sbi->gc_lock);
if (err)
goto out;
@@ -3431,7 +3433,7 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
bool from_gc = (type == CURSEG_ALL_DATA_ATGC);
struct seg_entry *se = NULL;
- down_read(&SM_I(sbi)->curseg_lock);
+ f2fs_down_read(&SM_I(sbi)->curseg_lock);
mutex_lock(&curseg->curseg_mutex);
down_write(&sit_i->sentry_lock);
@@ -3514,7 +3516,7 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
mutex_unlock(&curseg->curseg_mutex);
- up_read(&SM_I(sbi)->curseg_lock);
+ f2fs_up_read(&SM_I(sbi)->curseg_lock);
}
void f2fs_update_device_state(struct f2fs_sb_info *sbi, nid_t ino,
@@ -3550,7 +3552,7 @@ static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio)
bool keep_order = (f2fs_lfs_mode(fio->sbi) && type == CURSEG_COLD_DATA);
if (keep_order)
- down_read(&fio->sbi->io_order_lock);
+ f2fs_down_read(&fio->sbi->io_order_lock);
reallocate:
f2fs_allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr,
&fio->new_blkaddr, sum, type, fio);
@@ -3570,7 +3572,7 @@ reallocate:
f2fs_update_device_state(fio->sbi, fio->ino, fio->new_blkaddr, 1);
if (keep_order)
- up_read(&fio->sbi->io_order_lock);
+ f2fs_up_read(&fio->sbi->io_order_lock);
}
void f2fs_do_write_meta_page(struct f2fs_sb_info *sbi, struct page *page,
@@ -3705,7 +3707,7 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
se = get_seg_entry(sbi, segno);
type = se->type;
- down_write(&SM_I(sbi)->curseg_lock);
+ f2fs_down_write(&SM_I(sbi)->curseg_lock);
if (!recover_curseg) {
/* for recovery flow */
@@ -3774,7 +3776,7 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
up_write(&sit_i->sentry_lock);
mutex_unlock(&curseg->curseg_mutex);
- up_write(&SM_I(sbi)->curseg_lock);
+ f2fs_up_write(&SM_I(sbi)->curseg_lock);
}
void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn,
@@ -4789,6 +4791,13 @@ static int sanity_check_curseg(struct f2fs_sb_info *sbi)
sanity_check_seg_type(sbi, curseg->seg_type);
+ if (curseg->alloc_type != LFS && curseg->alloc_type != SSR) {
+ f2fs_err(sbi,
+ "Current segment has invalid alloc_type:%d",
+ curseg->alloc_type);
+ return -EFSCORRUPTED;
+ }
+
if (f2fs_test_bit(blkofs, se->cur_valid_map))
goto out;
@@ -5258,7 +5267,7 @@ int f2fs_build_segment_manager(struct f2fs_sb_info *sbi)
INIT_LIST_HEAD(&sm_info->sit_entry_set);
- init_rwsem(&sm_info->curseg_lock);
+ init_f2fs_rwsem(&sm_info->curseg_lock);
if (!f2fs_readonly(sbi->sb)) {
err = f2fs_create_flush_cmd_control(sbi);
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 0291cd55cf09..5c94caf0c0a1 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -651,7 +651,9 @@ static inline int utilization(struct f2fs_sb_info *sbi)
* pages over min_fsync_blocks. (=default option)
* F2FS_IPU_ASYNC - do IPU given by asynchronous write requests.
* F2FS_IPU_NOCACHE - disable IPU bio cache.
- * F2FS_IPUT_DISABLE - disable IPU. (=default option in LFS mode)
+ * F2FS_IPU_HONOR_OPU_WRITE - use OPU write prior to IPU write if inode has
+ * FI_OPU_WRITE flag.
+ * F2FS_IPU_DISABLE - disable IPU. (=default option in LFS mode)
*/
#define DEF_MIN_IPU_UTIL 70
#define DEF_MIN_FSYNC_BLOCKS 8
@@ -667,6 +669,7 @@ enum {
F2FS_IPU_FSYNC,
F2FS_IPU_ASYNC,
F2FS_IPU_NOCACHE,
+ F2FS_IPU_HONOR_OPU_WRITE,
};
static inline unsigned int curseg_segno(struct f2fs_sb_info *sbi,
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 76e6a3df9aba..ea939db18f88 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -257,7 +257,7 @@ void f2fs_printk(struct f2fs_sb_info *sbi, const char *fmt, ...)
va_end(args);
}
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
static const struct f2fs_sb_encodings {
__u16 magic;
char *name;
@@ -1259,7 +1259,7 @@ default_check:
return -EINVAL;
}
#endif
-#ifndef CONFIG_UNICODE
+#if !IS_ENABLED(CONFIG_UNICODE)
if (f2fs_sb_has_casefold(sbi)) {
f2fs_err(sbi,
"Filesystem with casefold feature cannot be mounted without CONFIG_UNICODE");
@@ -1345,8 +1345,12 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
{
struct f2fs_inode_info *fi;
- fi = f2fs_kmem_cache_alloc(f2fs_inode_cachep,
- GFP_F2FS_ZERO, false, F2FS_SB(sb));
+ if (time_to_inject(F2FS_SB(sb), FAULT_SLAB_ALLOC)) {
+ f2fs_show_injection_info(F2FS_SB(sb), FAULT_SLAB_ALLOC);
+ return NULL;
+ }
+
+ fi = alloc_inode_sb(sb, f2fs_inode_cachep, GFP_F2FS_ZERO);
if (!fi)
return NULL;
@@ -1355,16 +1359,16 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
/* Initialize f2fs-specific inode info */
atomic_set(&fi->dirty_pages, 0);
atomic_set(&fi->i_compr_blocks, 0);
- init_rwsem(&fi->i_sem);
+ init_f2fs_rwsem(&fi->i_sem);
spin_lock_init(&fi->i_size_lock);
INIT_LIST_HEAD(&fi->dirty_list);
INIT_LIST_HEAD(&fi->gdirty_list);
INIT_LIST_HEAD(&fi->inmem_ilist);
INIT_LIST_HEAD(&fi->inmem_pages);
mutex_init(&fi->inmem_lock);
- init_rwsem(&fi->i_gc_rwsem[READ]);
- init_rwsem(&fi->i_gc_rwsem[WRITE]);
- init_rwsem(&fi->i_xattr_sem);
+ init_f2fs_rwsem(&fi->i_gc_rwsem[READ]);
+ init_f2fs_rwsem(&fi->i_gc_rwsem[WRITE]);
+ init_f2fs_rwsem(&fi->i_xattr_sem);
/* Will be used by directory only */
fi->i_dir_level = F2FS_SB(sb)->dir_level;
@@ -1501,8 +1505,9 @@ static void f2fs_free_inode(struct inode *inode)
static void destroy_percpu_info(struct f2fs_sb_info *sbi)
{
- percpu_counter_destroy(&sbi->alloc_valid_block_count);
percpu_counter_destroy(&sbi->total_valid_inode_count);
+ percpu_counter_destroy(&sbi->rf_node_block_count);
+ percpu_counter_destroy(&sbi->alloc_valid_block_count);
}
static void destroy_device_list(struct f2fs_sb_info *sbi)
@@ -1619,7 +1624,7 @@ static void f2fs_put_super(struct super_block *sb)
f2fs_destroy_iostat(sbi);
for (i = 0; i < NR_PAGE_TYPE; i++)
kvfree(sbi->write_io[i]);
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
utf8_unload(sb->s_encoding);
#endif
kfree(sbi);
@@ -1662,11 +1667,15 @@ static int f2fs_freeze(struct super_block *sb)
/* ensure no checkpoint required */
if (!llist_empty(&F2FS_SB(sb)->cprc_info.issue_list))
return -EINVAL;
+
+ /* to avoid deadlock on f2fs_evict_inode->SB_FREEZE_FS */
+ set_sbi_flag(F2FS_SB(sb), SBI_IS_FREEZING);
return 0;
}
static int f2fs_unfreeze(struct super_block *sb)
{
+ clear_sbi_flag(F2FS_SB(sb), SBI_IS_FREEZING);
return 0;
}
@@ -2075,6 +2084,7 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi)
{
unsigned int s_flags = sbi->sb->s_flags;
struct cp_control cpc;
+ unsigned int gc_mode;
int err = 0;
int ret;
block_t unusable;
@@ -2087,8 +2097,11 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi)
f2fs_update_time(sbi, DISABLE_TIME);
+ gc_mode = sbi->gc_mode;
+ sbi->gc_mode = GC_URGENT_HIGH;
+
while (!f2fs_time_over(sbi, DISABLE_TIME)) {
- down_write(&sbi->gc_lock);
+ f2fs_down_write(&sbi->gc_lock);
err = f2fs_gc(sbi, true, false, false, NULL_SEGNO);
if (err == -ENODATA) {
err = 0;
@@ -2110,7 +2123,7 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi)
goto restore_flag;
}
- down_write(&sbi->gc_lock);
+ f2fs_down_write(&sbi->gc_lock);
cpc.reason = CP_PAUSE;
set_sbi_flag(sbi, SBI_CP_DISABLED);
err = f2fs_write_checkpoint(sbi, &cpc);
@@ -2122,8 +2135,9 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi)
spin_unlock(&sbi->stat_lock);
out_unlock:
- up_write(&sbi->gc_lock);
+ f2fs_up_write(&sbi->gc_lock);
restore_flag:
+ sbi->gc_mode = gc_mode;
sbi->sb->s_flags = s_flags; /* Restore SB_RDONLY status */
return err;
}
@@ -2135,19 +2149,18 @@ static void f2fs_enable_checkpoint(struct f2fs_sb_info *sbi)
/* we should flush all the data to keep data consistency */
do {
sync_inodes_sb(sbi->sb);
- cond_resched();
- congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT);
+ f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT);
} while (get_pages(sbi, F2FS_DIRTY_DATA) && retry--);
if (unlikely(retry < 0))
f2fs_warn(sbi, "checkpoint=enable has some unwritten data.");
- down_write(&sbi->gc_lock);
+ f2fs_down_write(&sbi->gc_lock);
f2fs_dirty_to_prefree(sbi);
clear_sbi_flag(sbi, SBI_CP_DISABLED);
set_sbi_flag(sbi, SBI_IS_DIRTY);
- up_write(&sbi->gc_lock);
+ f2fs_up_write(&sbi->gc_lock);
f2fs_sync_fs(sbi->sb, 1);
}
@@ -2504,8 +2517,7 @@ retry:
&page, &fsdata);
if (unlikely(err)) {
if (err == -ENOMEM) {
- congestion_wait(BLK_RW_ASYNC,
- DEFAULT_IO_TIMEOUT);
+ f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT);
goto retry;
}
set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR);
@@ -2688,7 +2700,7 @@ int f2fs_quota_sync(struct super_block *sb, int type)
struct f2fs_sb_info *sbi = F2FS_SB(sb);
struct quota_info *dqopt = sb_dqopt(sb);
int cnt;
- int ret;
+ int ret = 0;
/*
* Now when everything is written we can discard the pagecache so
@@ -2699,26 +2711,26 @@ int f2fs_quota_sync(struct super_block *sb, int type)
if (type != -1 && cnt != type)
continue;
- if (!sb_has_quota_active(sb, type))
- return 0;
+ if (!sb_has_quota_active(sb, cnt))
+ continue;
inode_lock(dqopt->files[cnt]);
/*
* do_quotactl
* f2fs_quota_sync
- * down_read(quota_sem)
+ * f2fs_down_read(quota_sem)
* dquot_writeback_dquots()
* f2fs_dquot_commit
* block_operation
- * down_read(quota_sem)
+ * f2fs_down_read(quota_sem)
*/
f2fs_lock_op(sbi);
- down_read(&sbi->quota_sem);
+ f2fs_down_read(&sbi->quota_sem);
ret = f2fs_quota_sync_file(sbi, cnt);
- up_read(&sbi->quota_sem);
+ f2fs_up_read(&sbi->quota_sem);
f2fs_unlock_op(sbi);
inode_unlock(dqopt->files[cnt]);
@@ -2843,11 +2855,11 @@ static int f2fs_dquot_commit(struct dquot *dquot)
struct f2fs_sb_info *sbi = F2FS_SB(dquot->dq_sb);
int ret;
- down_read_nested(&sbi->quota_sem, SINGLE_DEPTH_NESTING);
+ f2fs_down_read_nested(&sbi->quota_sem, SINGLE_DEPTH_NESTING);
ret = dquot_commit(dquot);
if (ret < 0)
set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR);
- up_read(&sbi->quota_sem);
+ f2fs_up_read(&sbi->quota_sem);
return ret;
}
@@ -2856,11 +2868,11 @@ static int f2fs_dquot_acquire(struct dquot *dquot)
struct f2fs_sb_info *sbi = F2FS_SB(dquot->dq_sb);
int ret;
- down_read(&sbi->quota_sem);
+ f2fs_down_read(&sbi->quota_sem);
ret = dquot_acquire(dquot);
if (ret < 0)
set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR);
- up_read(&sbi->quota_sem);
+ f2fs_up_read(&sbi->quota_sem);
return ret;
}
@@ -3574,6 +3586,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
F2FS_NODE_INO(sbi) = le32_to_cpu(raw_super->node_ino);
F2FS_META_INO(sbi) = le32_to_cpu(raw_super->meta_ino);
sbi->cur_victim_sec = NULL_SECNO;
+ sbi->gc_mode = GC_NORMAL;
sbi->next_victim_seg[BG_GC] = NULL_SEGNO;
sbi->next_victim_seg[FG_GC] = NULL_SEGNO;
sbi->max_victim_search = DEF_MAX_VICTIM_SEARCH;
@@ -3601,14 +3614,14 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
INIT_LIST_HEAD(&sbi->s_list);
mutex_init(&sbi->umount_mutex);
- init_rwsem(&sbi->io_order_lock);
+ init_f2fs_rwsem(&sbi->io_order_lock);
spin_lock_init(&sbi->cp_lock);
sbi->dirty_device = 0;
spin_lock_init(&sbi->dev_lock);
- init_rwsem(&sbi->sb_lock);
- init_rwsem(&sbi->pin_sem);
+ init_f2fs_rwsem(&sbi->sb_lock);
+ init_f2fs_rwsem(&sbi->pin_sem);
}
static int init_percpu_info(struct f2fs_sb_info *sbi)
@@ -3619,11 +3632,20 @@ static int init_percpu_info(struct f2fs_sb_info *sbi)
if (err)
return err;
+ err = percpu_counter_init(&sbi->rf_node_block_count, 0, GFP_KERNEL);
+ if (err)
+ goto err_valid_block;
+
err = percpu_counter_init(&sbi->total_valid_inode_count, 0,
GFP_KERNEL);
if (err)
- percpu_counter_destroy(&sbi->alloc_valid_block_count);
+ goto err_node_block;
+ return 0;
+err_node_block:
+ percpu_counter_destroy(&sbi->rf_node_block_count);
+err_valid_block:
+ percpu_counter_destroy(&sbi->alloc_valid_block_count);
return err;
}
@@ -3903,7 +3925,7 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi)
static int f2fs_setup_casefold(struct f2fs_sb_info *sbi)
{
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
if (f2fs_sb_has_casefold(sbi) && !sbi->sb->s_encoding) {
const struct f2fs_sb_encodings *encoding_info;
struct unicode_map *encoding;
@@ -3957,7 +3979,8 @@ static void f2fs_tuning_parameters(struct f2fs_sb_info *sbi)
F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_REUSE;
if (f2fs_block_unit_discard(sbi))
sm_i->dcc_info->discard_granularity = 1;
- sm_i->ipu_policy = 1 << F2FS_IPU_FORCE;
+ sm_i->ipu_policy = 1 << F2FS_IPU_FORCE |
+ 1 << F2FS_IPU_HONOR_OPU_WRITE;
}
sbi->readdir_ra = 1;
@@ -4067,11 +4090,11 @@ try_onemore:
/* init f2fs-specific super block info */
sbi->valid_super_block = valid_super_block;
- init_rwsem(&sbi->gc_lock);
+ init_f2fs_rwsem(&sbi->gc_lock);
mutex_init(&sbi->writepages);
- init_rwsem(&sbi->cp_global_sem);
- init_rwsem(&sbi->node_write);
- init_rwsem(&sbi->node_change);
+ init_f2fs_rwsem(&sbi->cp_global_sem);
+ init_f2fs_rwsem(&sbi->node_write);
+ init_f2fs_rwsem(&sbi->node_change);
/* disallow all the data/node/meta page writes */
set_sbi_flag(sbi, SBI_POR_DOING);
@@ -4092,18 +4115,18 @@ try_onemore:
}
for (j = HOT; j < n; j++) {
- init_rwsem(&sbi->write_io[i][j].io_rwsem);
+ init_f2fs_rwsem(&sbi->write_io[i][j].io_rwsem);
sbi->write_io[i][j].sbi = sbi;
sbi->write_io[i][j].bio = NULL;
spin_lock_init(&sbi->write_io[i][j].io_lock);
INIT_LIST_HEAD(&sbi->write_io[i][j].io_list);
INIT_LIST_HEAD(&sbi->write_io[i][j].bio_list);
- init_rwsem(&sbi->write_io[i][j].bio_list_lock);
+ init_f2fs_rwsem(&sbi->write_io[i][j].bio_list_lock);
}
}
- init_rwsem(&sbi->cp_rwsem);
- init_rwsem(&sbi->quota_sem);
+ init_f2fs_rwsem(&sbi->cp_rwsem);
+ init_f2fs_rwsem(&sbi->quota_sem);
init_waitqueue_head(&sbi->cp_wait);
init_sb_info(sbi);
@@ -4458,7 +4481,7 @@ free_bio_info:
for (i = 0; i < NR_PAGE_TYPE; i++)
kvfree(sbi->write_io[i]);
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
utf8_unload(sb->s_encoding);
sb->s_encoding = NULL;
#endif
@@ -4528,7 +4551,7 @@ static struct file_system_type f2fs_fs_type = {
.name = "f2fs",
.mount = f2fs_mount,
.kill_sb = kill_f2fs_super,
- .fs_flags = FS_REQUIRES_DEV,
+ .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
};
MODULE_ALIAS_FS("f2fs");
diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c
index df406c16b2eb..4c50aedd5144 100644
--- a/fs/f2fs/sysfs.c
+++ b/fs/f2fs/sysfs.c
@@ -41,6 +41,16 @@ enum {
ATGC_INFO, /* struct atgc_management */
};
+static const char *gc_mode_names[MAX_GC_MODE] = {
+ "GC_NORMAL",
+ "GC_IDLE_CB",
+ "GC_IDLE_GREEDY",
+ "GC_IDLE_AT",
+ "GC_URGENT_HIGH",
+ "GC_URGENT_LOW",
+ "GC_URGENT_MID"
+};
+
struct f2fs_attr {
struct attribute attr;
ssize_t (*show)(struct f2fs_attr *, struct f2fs_sb_info *, char *);
@@ -201,7 +211,7 @@ static ssize_t unusable_show(struct f2fs_attr *a,
static ssize_t encoding_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
struct super_block *sb = sbi->sb;
if (f2fs_sb_has_casefold(sbi))
@@ -316,8 +326,13 @@ static ssize_t f2fs_sbi_show(struct f2fs_attr *a,
return sysfs_emit(buf, "%u\n", sbi->compr_new_inode);
#endif
+ if (!strcmp(a->attr.name, "gc_urgent"))
+ return sysfs_emit(buf, "%s\n",
+ gc_mode_names[sbi->gc_mode]);
+
if (!strcmp(a->attr.name, "gc_segment_mode"))
- return sysfs_emit(buf, "%u\n", sbi->gc_segment_mode);
+ return sysfs_emit(buf, "%s\n",
+ gc_mode_names[sbi->gc_segment_mode]);
if (!strcmp(a->attr.name, "gc_reclaimed_segments")) {
return sysfs_emit(buf, "%u\n",
@@ -363,7 +378,7 @@ static ssize_t __sbi_store(struct f2fs_attr *a,
if (!strlen(name) || strlen(name) >= F2FS_EXTENSION_LEN)
return -EINVAL;
- down_write(&sbi->sb_lock);
+ f2fs_down_write(&sbi->sb_lock);
ret = f2fs_update_extension_list(sbi, name, hot, set);
if (ret)
@@ -373,7 +388,7 @@ static ssize_t __sbi_store(struct f2fs_attr *a,
if (ret)
f2fs_update_extension_list(sbi, name, hot, !set);
out:
- up_write(&sbi->sb_lock);
+ f2fs_up_write(&sbi->sb_lock);
return ret ? ret : count;
}
@@ -468,6 +483,13 @@ out:
}
} else if (t == 2) {
sbi->gc_mode = GC_URGENT_LOW;
+ } else if (t == 3) {
+ sbi->gc_mode = GC_URGENT_MID;
+ if (sbi->gc_thread) {
+ sbi->gc_thread->gc_wake = 1;
+ wake_up_interruptible_all(
+ &sbi->gc_thread->gc_wait_queue_head);
+ }
} else {
return -EINVAL;
}
@@ -481,7 +503,7 @@ out:
} else if (t == GC_IDLE_AT) {
if (!sbi->am.atgc_enabled)
return -EINVAL;
- sbi->gc_mode = GC_AT;
+ sbi->gc_mode = GC_IDLE_AT;
} else {
sbi->gc_mode = GC_NORMAL;
}
@@ -716,6 +738,10 @@ F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_idle, gc_mode);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_urgent, gc_mode);
F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments);
F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_small_discards, max_discards);
+F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_discard_request, max_discard_request);
+F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, min_discard_issue_time, min_discard_issue_time);
+F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, mid_discard_issue_time, mid_discard_issue_time);
+F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_discard_issue_time, max_discard_issue_time);
F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, discard_granularity, discard_granularity);
F2FS_RW_ATTR(RESERVED_BLOCKS, f2fs_sb_info, reserved_blocks, reserved_blocks);
F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, batched_trim_sections, trim_sections);
@@ -728,6 +754,7 @@ F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ssr_sections, min_ssr_sections);
F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh);
F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ra_nid_pages, ra_nid_pages);
F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, dirty_nats_ratio, dirty_nats_ratio);
+F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, max_roll_forward_node_blocks, max_rf_node_blocks);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, migration_granularity, migration_granularity);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level);
@@ -778,7 +805,7 @@ F2FS_GENERAL_RO_ATTR(avg_vblocks);
#ifdef CONFIG_FS_ENCRYPTION
F2FS_FEATURE_RO_ATTR(encryption);
F2FS_FEATURE_RO_ATTR(test_dummy_encryption_v2);
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
F2FS_FEATURE_RO_ATTR(encrypted_casefold);
#endif
#endif /* CONFIG_FS_ENCRYPTION */
@@ -797,7 +824,7 @@ F2FS_FEATURE_RO_ATTR(lost_found);
F2FS_FEATURE_RO_ATTR(verity);
#endif
F2FS_FEATURE_RO_ATTR(sb_checksum);
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
F2FS_FEATURE_RO_ATTR(casefold);
#endif
F2FS_FEATURE_RO_ATTR(readonly);
@@ -832,6 +859,10 @@ static struct attribute *f2fs_attrs[] = {
ATTR_LIST(reclaim_segments),
ATTR_LIST(main_blkaddr),
ATTR_LIST(max_small_discards),
+ ATTR_LIST(max_discard_request),
+ ATTR_LIST(min_discard_issue_time),
+ ATTR_LIST(mid_discard_issue_time),
+ ATTR_LIST(max_discard_issue_time),
ATTR_LIST(discard_granularity),
ATTR_LIST(pending_discard),
ATTR_LIST(batched_trim_sections),
@@ -847,6 +878,7 @@ static struct attribute *f2fs_attrs[] = {
ATTR_LIST(ram_thresh),
ATTR_LIST(ra_nid_pages),
ATTR_LIST(dirty_nats_ratio),
+ ATTR_LIST(max_roll_forward_node_blocks),
ATTR_LIST(cp_interval),
ATTR_LIST(idle_interval),
ATTR_LIST(discard_idle_interval),
@@ -910,7 +942,7 @@ static struct attribute *f2fs_feat_attrs[] = {
#ifdef CONFIG_FS_ENCRYPTION
ATTR_LIST(encryption),
ATTR_LIST(test_dummy_encryption_v2),
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
ATTR_LIST(encrypted_casefold),
#endif
#endif /* CONFIG_FS_ENCRYPTION */
@@ -929,7 +961,7 @@ static struct attribute *f2fs_feat_attrs[] = {
ATTR_LIST(verity),
#endif
ATTR_LIST(sb_checksum),
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
ATTR_LIST(casefold),
#endif
ATTR_LIST(readonly),
diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c
index fe5acdccaae1..3d793202cc9f 100644
--- a/fs/f2fs/verity.c
+++ b/fs/f2fs/verity.c
@@ -208,7 +208,7 @@ cleanup:
* from re-instantiating cached pages we are truncating (since unlike
* normal file accesses, garbage collection isn't limited by i_size).
*/
- down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
truncate_inode_pages(inode->i_mapping, inode->i_size);
err2 = f2fs_truncate(inode);
if (err2) {
@@ -216,7 +216,7 @@ cleanup:
err2);
set_sbi_flag(sbi, SBI_NEED_FSCK);
}
- up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+ f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
clear_inode_flag(inode, FI_VERITY_IN_PROGRESS);
return err ?: err2;
}
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index 8e5cd9c916ff..c76c15086e5f 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -525,10 +525,10 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name,
if (len > F2FS_NAME_LEN)
return -ERANGE;
- down_read(&F2FS_I(inode)->i_xattr_sem);
+ f2fs_down_read(&F2FS_I(inode)->i_xattr_sem);
error = lookup_all_xattrs(inode, ipage, index, len, name,
&entry, &base_addr, &base_size, &is_inline);
- up_read(&F2FS_I(inode)->i_xattr_sem);
+ f2fs_up_read(&F2FS_I(inode)->i_xattr_sem);
if (error)
return error;
@@ -562,9 +562,9 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
int error;
size_t rest = buffer_size;
- down_read(&F2FS_I(inode)->i_xattr_sem);
+ f2fs_down_read(&F2FS_I(inode)->i_xattr_sem);
error = read_all_xattrs(inode, NULL, &base_addr);
- up_read(&F2FS_I(inode)->i_xattr_sem);
+ f2fs_up_read(&F2FS_I(inode)->i_xattr_sem);
if (error)
return error;
@@ -786,9 +786,9 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name,
f2fs_balance_fs(sbi, true);
f2fs_lock_op(sbi);
- down_write(&F2FS_I(inode)->i_xattr_sem);
+ f2fs_down_write(&F2FS_I(inode)->i_xattr_sem);
err = __f2fs_setxattr(inode, index, name, value, size, ipage, flags);
- up_write(&F2FS_I(inode)->i_xattr_sem);
+ f2fs_up_write(&F2FS_I(inode)->i_xattr_sem);
f2fs_unlock_op(sbi);
f2fs_update_time(sbi, REQ_TIME);
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index c4a274285858..249825017da7 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -722,7 +722,7 @@ static int func(struct dir_context *ctx, const char *name, int name_len, \
if (name_len >= sizeof(d1->d_name)) \
name_len = sizeof(d1->d_name) - 1; \
\
- if (put_user(0, d2->d_name) || \
+ if (put_user(0, &d2->d_name[0]) || \
put_user(0, &d2->d_reclen) || \
copy_to_user(d1->d_name, name, name_len) || \
put_user(0, d1->d_name + name_len) || \
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index a6f1c6d426d1..bf6051bdf1d1 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -342,7 +342,8 @@ int fat_block_truncate_page(struct inode *inode, loff_t from)
}
static const struct address_space_operations fat_aops = {
- .set_page_dirty = __set_page_dirty_buffers,
+ .dirty_folio = block_dirty_folio,
+ .invalidate_folio = block_invalidate_folio,
.readpage = fat_readpage,
.readahead = fat_readahead,
.writepage = fat_writepage,
@@ -745,7 +746,7 @@ static struct kmem_cache *fat_inode_cachep;
static struct inode *fat_alloc_inode(struct super_block *sb)
{
struct msdos_inode_info *ei;
- ei = kmem_cache_alloc(fat_inode_cachep, GFP_NOFS);
+ ei = alloc_inode_sb(sb, fat_inode_cachep, GFP_NOFS);
if (!ei)
return NULL;
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 9c6c6a3e2de5..f15d885b9796 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -291,22 +291,6 @@ static long fcntl_rw_hint(struct file *file, unsigned int cmd,
u64 h;
switch (cmd) {
- case F_GET_FILE_RW_HINT:
- h = file_write_hint(file);
- if (copy_to_user(argp, &h, sizeof(*argp)))
- return -EFAULT;
- return 0;
- case F_SET_FILE_RW_HINT:
- if (copy_from_user(&h, argp, sizeof(h)))
- return -EFAULT;
- hint = (enum rw_hint) h;
- if (!rw_hint_valid(hint))
- return -EINVAL;
-
- spin_lock(&file->f_lock);
- file->f_write_hint = hint;
- spin_unlock(&file->f_lock);
- return 0;
case F_GET_RW_HINT:
h = inode->i_write_hint;
if (copy_to_user(argp, &h, sizeof(*argp)))
@@ -431,8 +415,6 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
break;
case F_GET_RW_HINT:
case F_SET_RW_HINT:
- case F_GET_FILE_RW_HINT:
- case F_SET_FILE_RW_HINT:
err = fcntl_rw_hint(filp, cmd, arg);
break;
default:
diff --git a/fs/file_table.c b/fs/file_table.c
index 57edef16dce4..7d2e692b66a9 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -27,6 +27,7 @@
#include <linux/task_work.h>
#include <linux/ima.h>
#include <linux/swap.h>
+#include <linux/kmemleak.h>
#include <linux/atomic.h>
@@ -119,6 +120,11 @@ static struct ctl_table fs_stat_sysctls[] = {
static int __init init_fs_stat_sysctls(void)
{
register_sysctl_init("fs", fs_stat_sysctls);
+ if (IS_ENABLED(CONFIG_BINFMT_MISC)) {
+ struct ctl_table_header *hdr;
+ hdr = register_sysctl_mount_point("fs/binfmt_misc");
+ kmemleak_not_leak(hdr);
+ }
return 0;
}
fs_initcall(init_fs_stat_sysctls);
diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c
index 578a5062706e..22eed5a73ac2 100644
--- a/fs/freevxfs/vxfs_super.c
+++ b/fs/freevxfs/vxfs_super.c
@@ -124,7 +124,7 @@ static struct inode *vxfs_alloc_inode(struct super_block *sb)
{
struct vxfs_inode_info *vi;
- vi = kmem_cache_alloc(vxfs_inode_cachep, GFP_KERNEL);
+ vi = alloc_inode_sb(sb, vxfs_inode_cachep, GFP_KERNEL);
if (!vi)
return NULL;
inode_init_once(&vi->vfs_inode);
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 33d54c9fbefc..591fe9cf1659 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -894,43 +894,6 @@ void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page,
EXPORT_SYMBOL_GPL(wbc_account_cgroup_owner);
/**
- * inode_congested - test whether an inode is congested
- * @inode: inode to test for congestion (may be NULL)
- * @cong_bits: mask of WB_[a]sync_congested bits to test
- *
- * Tests whether @inode is congested. @cong_bits is the mask of congestion
- * bits to test and the return value is the mask of set bits.
- *
- * If cgroup writeback is enabled for @inode, the congestion state is
- * determined by whether the cgwb (cgroup bdi_writeback) for the blkcg
- * associated with @inode is congested; otherwise, the root wb's congestion
- * state is used.
- *
- * @inode is allowed to be NULL as this function is often called on
- * mapping->host which is NULL for the swapper space.
- */
-int inode_congested(struct inode *inode, int cong_bits)
-{
- /*
- * Once set, ->i_wb never becomes NULL while the inode is alive.
- * Start transaction iff ->i_wb is visible.
- */
- if (inode && inode_to_wb_is_valid(inode)) {
- struct bdi_writeback *wb;
- struct wb_lock_cookie lock_cookie = {};
- bool congested;
-
- wb = unlocked_inode_to_wb_begin(inode, &lock_cookie);
- congested = wb_congested(wb, cong_bits);
- unlocked_inode_to_wb_end(inode, &lock_cookie);
- return congested;
- }
-
- return wb_congested(&inode_to_bdi(inode)->wb, cong_bits);
-}
-EXPORT_SYMBOL_GPL(inode_congested);
-
-/**
* wb_split_bdi_pages - split nr_pages to write according to bandwidth
* @wb: target bdi_writeback to split @nr_pages to
* @nr_pages: number of pages to write for the whole bdi
@@ -2233,7 +2196,6 @@ void wb_workfn(struct work_struct *work)
long pages_written;
set_worker_desc("flush-%s", bdi_dev_name(wb->bdi));
- current->flags |= PF_SWAPWRITE;
if (likely(!current_is_workqueue_rescuer() ||
!test_bit(WB_registered, &wb->state))) {
@@ -2262,8 +2224,6 @@ void wb_workfn(struct work_struct *work)
wb_wakeup(wb);
else if (wb_has_dirty_io(wb) && dirty_writeback_interval)
wb_wakeup_delayed(wb);
-
- current->flags &= ~PF_SWAPWRITE;
}
/*
diff --git a/fs/fscache/io.c b/fs/fscache/io.c
index 7a769ea57720..c8c7fe9e9a6e 100644
--- a/fs/fscache/io.c
+++ b/fs/fscache/io.c
@@ -159,27 +159,29 @@ int __fscache_begin_write_operation(struct netfs_cache_resources *cres,
EXPORT_SYMBOL(__fscache_begin_write_operation);
/**
- * fscache_set_page_dirty - Mark page dirty and pin a cache object for writeback
- * @page: The page being dirtied
+ * fscache_dirty_folio - Mark folio dirty and pin a cache object for writeback
+ * @mapping: The mapping the folio belongs to.
+ * @folio: The folio being dirtied.
* @cookie: The cookie referring to the cache object
*
- * Set the dirty flag on a page and pin an in-use cache object in memory when
- * dirtying a page so that writeback can later write to it. This is intended
- * to be called from the filesystem's ->set_page_dirty() method.
+ * Set the dirty flag on a folio and pin an in-use cache object in memory
+ * so that writeback can later write to it. This is intended
+ * to be called from the filesystem's ->dirty_folio() method.
*
- * Returns 1 if PG_dirty was set on the page, 0 otherwise.
+ * Return: true if the dirty flag was set on the folio, false otherwise.
*/
-int fscache_set_page_dirty(struct page *page, struct fscache_cookie *cookie)
+bool fscache_dirty_folio(struct address_space *mapping, struct folio *folio,
+ struct fscache_cookie *cookie)
{
- struct inode *inode = page->mapping->host;
+ struct inode *inode = mapping->host;
bool need_use = false;
_enter("");
- if (!__set_page_dirty_nobuffers(page))
- return 0;
+ if (!filemap_dirty_folio(mapping, folio))
+ return false;
if (!fscache_cookie_valid(cookie))
- return 1;
+ return true;
if (!(inode->i_state & I_PINNING_FSCACHE_WB)) {
spin_lock(&inode->i_lock);
@@ -192,9 +194,9 @@ int fscache_set_page_dirty(struct page *page, struct fscache_cookie *cookie)
if (need_use)
fscache_use_cookie(cookie, true);
}
- return 1;
+ return true;
}
-EXPORT_SYMBOL(fscache_set_page_dirty);
+EXPORT_SYMBOL(fscache_dirty_folio);
struct fscache_write_request {
struct netfs_cache_resources cache_resources;
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index 000d2e5627e9..7cede9a3bc96 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -164,7 +164,6 @@ static ssize_t fuse_conn_congestion_threshold_write(struct file *file,
{
unsigned val;
struct fuse_conn *fc;
- struct fuse_mount *fm;
ssize_t ret;
ret = fuse_conn_limit_write(file, buf, count, ppos, &val,
@@ -178,22 +177,6 @@ static ssize_t fuse_conn_congestion_threshold_write(struct file *file,
down_read(&fc->killsb);
spin_lock(&fc->bg_lock);
fc->congestion_threshold = val;
-
- /*
- * Get any fuse_mount belonging to this fuse_conn; s_bdi is
- * shared between all of them
- */
-
- if (!list_empty(&fc->mounts)) {
- fm = list_first_entry(&fc->mounts, struct fuse_mount, fc_entry);
- if (fc->num_background < fc->congestion_threshold) {
- clear_bdi_congested(fm->sb->s_bdi, BLK_RW_SYNC);
- clear_bdi_congested(fm->sb->s_bdi, BLK_RW_ASYNC);
- } else {
- set_bdi_congested(fm->sb->s_bdi, BLK_RW_SYNC);
- set_bdi_congested(fm->sb->s_bdi, BLK_RW_ASYNC);
- }
- }
spin_unlock(&fc->bg_lock);
up_read(&fc->killsb);
fuse_conn_put(fc);
diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c
index 182b24a14804..d7d3a7f06862 100644
--- a/fs/fuse/dax.c
+++ b/fs/fuse/dax.c
@@ -1326,8 +1326,7 @@ bool fuse_dax_inode_alloc(struct super_block *sb, struct fuse_inode *fi)
static const struct address_space_operations fuse_dax_file_aops = {
.writepages = fuse_dax_writepages,
.direct_IO = noop_direct_IO,
- .set_page_dirty = __set_page_dirty_no_writeback,
- .invalidatepage = noop_invalidatepage,
+ .dirty_folio = noop_dirty_folio,
};
static bool fuse_should_enable_dax(struct inode *inode, unsigned int flags)
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index cd54a529460d..0e537e580dc1 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -315,10 +315,6 @@ void fuse_request_end(struct fuse_req *req)
wake_up(&fc->blocked_waitq);
}
- if (fc->num_background == fc->congestion_threshold && fm->sb) {
- clear_bdi_congested(fm->sb->s_bdi, BLK_RW_SYNC);
- clear_bdi_congested(fm->sb->s_bdi, BLK_RW_ASYNC);
- }
fc->num_background--;
fc->active_background--;
flush_bg_queue(fc);
@@ -540,10 +536,6 @@ static bool fuse_request_queue_background(struct fuse_req *req)
fc->num_background++;
if (fc->num_background == fc->max_background)
fc->blocked = 1;
- if (fc->num_background == fc->congestion_threshold && fm->sb) {
- set_bdi_congested(fm->sb->s_bdi, BLK_RW_SYNC);
- set_bdi_congested(fm->sb->s_bdi, BLK_RW_ASYNC);
- }
list_add_tail(&req->list, &fc->bg_queue);
flush_bg_queue(fc);
queued = true;
@@ -941,7 +933,17 @@ static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep,
while (count) {
if (cs->write && cs->pipebufs && page) {
- return fuse_ref_page(cs, page, offset, count);
+ /*
+ * Can't control lifetime of pipe buffers, so always
+ * copy user pages.
+ */
+ if (cs->req->args->user_pages) {
+ err = fuse_copy_fill(cs);
+ if (err)
+ return err;
+ } else {
+ return fuse_ref_page(cs, page, offset, count);
+ }
} else if (!cs->len) {
if (cs->move_pages && page &&
offset == 0 && count == PAGE_SIZE) {
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 656e921f3506..9ff27b8a9782 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1773,7 +1773,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
/*
* Only call invalidate_inode_pages2() after removing
- * FUSE_NOWRITE, otherwise fuse_launder_page() would deadlock.
+ * FUSE_NOWRITE, otherwise fuse_launder_folio() would deadlock.
*/
if ((is_truncate || !is_wb) &&
S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) {
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 829094451774..f18d14d5fea1 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -966,6 +966,14 @@ static void fuse_readahead(struct readahead_control *rac)
struct fuse_io_args *ia;
struct fuse_args_pages *ap;
+ if (fc->num_background >= fc->congestion_threshold &&
+ rac->ra->async_size >= readahead_count(rac))
+ /*
+ * Congested and only async pages left, so skip the
+ * rest.
+ */
+ break;
+
nr_pages = readahead_count(rac) - nr_pages;
if (nr_pages > max_pages)
nr_pages = max_pages;
@@ -1413,6 +1421,7 @@ static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii,
(PAGE_SIZE - ret) & (PAGE_SIZE - 1);
}
+ ap->args.user_pages = true;
if (write)
ap->args.in_pages = true;
else
@@ -1958,6 +1967,7 @@ err:
static int fuse_writepage(struct page *page, struct writeback_control *wbc)
{
+ struct fuse_conn *fc = get_fuse_conn(page->mapping->host);
int err;
if (fuse_page_is_writeback(page->mapping->host, page->index)) {
@@ -1973,6 +1983,10 @@ static int fuse_writepage(struct page *page, struct writeback_control *wbc)
return 0;
}
+ if (wbc->sync_mode == WB_SYNC_NONE &&
+ fc->num_background >= fc->congestion_threshold)
+ return AOP_WRITEPAGE_ACTIVATE;
+
err = fuse_writepage_locked(page);
unlock_page(page);
@@ -2226,6 +2240,10 @@ static int fuse_writepages(struct address_space *mapping,
if (fuse_is_bad(inode))
goto out;
+ if (wbc->sync_mode == WB_SYNC_NONE &&
+ fc->num_background >= fc->congestion_threshold)
+ return 0;
+
data.inode = inode;
data.wpa = NULL;
data.ff = NULL;
@@ -2330,17 +2348,17 @@ unlock:
return copied;
}
-static int fuse_launder_page(struct page *page)
+static int fuse_launder_folio(struct folio *folio)
{
int err = 0;
- if (clear_page_dirty_for_io(page)) {
- struct inode *inode = page->mapping->host;
+ if (folio_clear_dirty_for_io(folio)) {
+ struct inode *inode = folio->mapping->host;
/* Serialize with pending writeback for the same page */
- fuse_wait_on_page_writeback(inode, page->index);
- err = fuse_writepage_locked(page);
+ fuse_wait_on_page_writeback(inode, folio->index);
+ err = fuse_writepage_locked(&folio->page);
if (!err)
- fuse_wait_on_page_writeback(inode, page->index);
+ fuse_wait_on_page_writeback(inode, folio->index);
}
return err;
}
@@ -3161,8 +3179,8 @@ static const struct address_space_operations fuse_file_aops = {
.readahead = fuse_readahead,
.writepage = fuse_writepage,
.writepages = fuse_writepages,
- .launder_page = fuse_launder_page,
- .set_page_dirty = __set_page_dirty_nobuffers,
+ .launder_folio = fuse_launder_folio,
+ .dirty_folio = filemap_dirty_folio,
.bmap = fuse_bmap,
.direct_IO = fuse_direct_IO,
.write_begin = fuse_write_begin,
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index e8e59fbdefeb..eac4984cc753 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -256,6 +256,7 @@ struct fuse_args {
bool nocreds:1;
bool in_pages:1;
bool out_pages:1;
+ bool user_pages:1;
bool out_argvar:1;
bool page_zeroing:1;
bool page_replace:1;
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index ee846ce371d8..8c0665c5dff8 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -23,6 +23,7 @@
#include <linux/exportfs.h>
#include <linux/posix_acl.h>
#include <linux/pid_namespace.h>
+#include <uapi/linux/magic.h>
MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>");
MODULE_DESCRIPTION("Filesystem in Userspace");
@@ -50,8 +51,6 @@ MODULE_PARM_DESC(max_user_congthresh,
"Global limit for the maximum congestion threshold an "
"unprivileged user can set");
-#define FUSE_SUPER_MAGIC 0x65735546
-
#define FUSE_DEFAULT_BLKSIZE 512
/** Maximum number of outstanding background requests */
@@ -73,7 +72,7 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)
{
struct fuse_inode *fi;
- fi = kmem_cache_alloc(fuse_inode_cachep, GFP_KERNEL);
+ fi = alloc_inode_sb(sb, fuse_inode_cachep, GFP_KERNEL);
if (!fi)
return NULL;
diff --git a/fs/fuse/ioctl.c b/fs/fuse/ioctl.c
index fbc09dab1f85..df58966bc874 100644
--- a/fs/fuse/ioctl.c
+++ b/fs/fuse/ioctl.c
@@ -394,9 +394,12 @@ static int fuse_priv_ioctl(struct inode *inode, struct fuse_file *ff,
args.out_args[1].value = ptr;
err = fuse_simple_request(fm, &args);
- if (!err && outarg.flags & FUSE_IOCTL_RETRY)
- err = -EIO;
-
+ if (!err) {
+ if (outarg.result < 0)
+ err = outarg.result;
+ else if (outarg.flags & FUSE_IOCTL_RETRY)
+ err = -EIO;
+ }
return err;
}
diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
index 9d737904d07c..86b7dbb6a0d4 100644
--- a/fs/fuse/virtio_fs.c
+++ b/fs/fuse/virtio_fs.c
@@ -8,6 +8,7 @@
#include <linux/dax.h>
#include <linux/pci.h>
#include <linux/pfn_t.h>
+#include <linux/memremap.h>
#include <linux/module.h>
#include <linux/virtio.h>
#include <linux/virtio_fs.h>
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 005e920f5d4a..72c9f31ce724 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -606,18 +606,12 @@ out:
gfs2_trans_end(sdp);
}
-/**
- * jdata_set_page_dirty - Page dirtying function
- * @page: The page to dirty
- *
- * Returns: 1 if it dirtyed the page, or 0 otherwise
- */
-
-static int jdata_set_page_dirty(struct page *page)
+static bool jdata_dirty_folio(struct address_space *mapping,
+ struct folio *folio)
{
if (current->journal_info)
- SetPageChecked(page);
- return __set_page_dirty_buffers(page);
+ folio_set_checked(folio);
+ return block_dirty_folio(mapping, folio);
}
/**
@@ -672,22 +666,23 @@ static void gfs2_discard(struct gfs2_sbd *sdp, struct buffer_head *bh)
unlock_buffer(bh);
}
-static void gfs2_invalidatepage(struct page *page, unsigned int offset,
- unsigned int length)
+static void gfs2_invalidate_folio(struct folio *folio, size_t offset,
+ size_t length)
{
- struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host);
- unsigned int stop = offset + length;
- int partial_page = (offset || length < PAGE_SIZE);
+ struct gfs2_sbd *sdp = GFS2_SB(folio->mapping->host);
+ size_t stop = offset + length;
+ int partial_page = (offset || length < folio_size(folio));
struct buffer_head *bh, *head;
unsigned long pos = 0;
- BUG_ON(!PageLocked(page));
+ BUG_ON(!folio_test_locked(folio));
if (!partial_page)
- ClearPageChecked(page);
- if (!page_has_buffers(page))
+ folio_clear_checked(folio);
+ head = folio_buffers(folio);
+ if (!head)
goto out;
- bh = head = page_buffers(page);
+ bh = head;
do {
if (pos + bh->b_size > stop)
return;
@@ -699,7 +694,7 @@ static void gfs2_invalidatepage(struct page *page, unsigned int offset,
} while (bh != head);
out:
if (!partial_page)
- try_to_release_page(page, 0);
+ filemap_release_folio(folio, 0);
}
/**
@@ -779,9 +774,9 @@ static const struct address_space_operations gfs2_aops = {
.writepages = gfs2_writepages,
.readpage = gfs2_readpage,
.readahead = gfs2_readahead,
- .set_page_dirty = __set_page_dirty_nobuffers,
+ .dirty_folio = filemap_dirty_folio,
.releasepage = iomap_releasepage,
- .invalidatepage = iomap_invalidatepage,
+ .invalidate_folio = iomap_invalidate_folio,
.bmap = gfs2_bmap,
.direct_IO = noop_direct_IO,
.migratepage = iomap_migrate_page,
@@ -794,9 +789,9 @@ static const struct address_space_operations gfs2_jdata_aops = {
.writepages = gfs2_jdata_writepages,
.readpage = gfs2_readpage,
.readahead = gfs2_readahead,
- .set_page_dirty = jdata_set_page_dirty,
+ .dirty_folio = jdata_dirty_folio,
.bmap = gfs2_bmap,
- .invalidatepage = gfs2_invalidatepage,
+ .invalidate_folio = gfs2_invalidate_folio,
.releasepage = gfs2_releasepage,
.is_partially_uptodate = block_is_partially_uptodate,
.error_remove_page = generic_error_remove_page,
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 3e718cfc19a7..8c39a8571b1f 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -704,10 +704,11 @@ static int gfs2_release(struct inode *inode, struct file *file)
kfree(file->private_data);
file->private_data = NULL;
- if (gfs2_rs_active(&ip->i_res))
- gfs2_rs_delete(ip, &inode->i_writecount);
- if (file->f_mode & FMODE_WRITE)
+ if (file->f_mode & FMODE_WRITE) {
+ if (gfs2_rs_active(&ip->i_res))
+ gfs2_rs_delete(ip, &inode->i_writecount);
gfs2_qa_put(ip);
+ }
return 0;
}
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index b7ab8430333c..6b23399eaee0 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -301,9 +301,6 @@ void gfs2_glock_queue_put(struct gfs2_glock *gl)
void gfs2_glock_put(struct gfs2_glock *gl)
{
- /* last put could call sleepable dlm api */
- might_sleep();
-
if (lockref_put_or_lock(&gl->gl_lockref))
return;
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 4ae1eefae616..6ba51cbb94cf 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -491,7 +491,6 @@ static struct bio *gfs2_chain_bio(struct bio *prev, unsigned int nr_iovecs)
new = bio_alloc(prev->bi_bdev, nr_iovecs, prev->bi_opf, GFP_NOIO);
bio_clone_blkg_association(new, prev);
new->bi_iter.bi_sector = bio_end_sector(prev);
- new->bi_write_hint = prev->bi_write_hint;
bio_chain(new, prev);
submit_bio(prev);
return new;
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index a580b90b7522..d8bd1d48bd78 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -89,13 +89,15 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb
}
const struct address_space_operations gfs2_meta_aops = {
- .set_page_dirty = __set_page_dirty_buffers,
+ .dirty_folio = block_dirty_folio,
+ .invalidate_folio = block_invalidate_folio,
.writepage = gfs2_aspace_writepage,
.releasepage = gfs2_releasepage,
};
const struct address_space_operations gfs2_rgrp_aops = {
- .set_page_dirty = __set_page_dirty_buffers,
+ .dirty_folio = block_dirty_folio,
+ .invalidate_folio = block_invalidate_folio,
.writepage = gfs2_aspace_writepage,
.releasepage = gfs2_releasepage,
};
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 64c67090f503..cf9cf66522b3 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1425,7 +1425,7 @@ static struct inode *gfs2_alloc_inode(struct super_block *sb)
{
struct gfs2_inode *ip;
- ip = kmem_cache_alloc(gfs2_inode_cachep, GFP_KERNEL);
+ ip = alloc_inode_sb(sb, gfs2_inode_cachep, GFP_KERNEL);
if (!ip)
return NULL;
ip->i_flags = 0;
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 2a5143246282..55f45e9b4930 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -159,7 +159,8 @@ static int hfs_writepages(struct address_space *mapping,
}
const struct address_space_operations hfs_btree_aops = {
- .set_page_dirty = __set_page_dirty_buffers,
+ .dirty_folio = block_dirty_folio,
+ .invalidate_folio = block_invalidate_folio,
.readpage = hfs_readpage,
.writepage = hfs_writepage,
.write_begin = hfs_write_begin,
@@ -169,7 +170,8 @@ const struct address_space_operations hfs_btree_aops = {
};
const struct address_space_operations hfs_aops = {
- .set_page_dirty = __set_page_dirty_buffers,
+ .dirty_folio = block_dirty_folio,
+ .invalidate_folio = block_invalidate_folio,
.readpage = hfs_readpage,
.writepage = hfs_writepage,
.write_begin = hfs_write_begin,
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 12d9bae39363..6764afa98a6f 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -162,7 +162,7 @@ static struct inode *hfs_alloc_inode(struct super_block *sb)
{
struct hfs_inode_info *i;
- i = kmem_cache_alloc(hfs_inode_cachep, GFP_KERNEL);
+ i = alloc_inode_sb(sb, hfs_inode_cachep, GFP_KERNEL);
return i ? &i->vfs_inode : NULL;
}
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index d08a8d1d40a4..446a816aa8e1 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -156,7 +156,8 @@ static int hfsplus_writepages(struct address_space *mapping,
}
const struct address_space_operations hfsplus_btree_aops = {
- .set_page_dirty = __set_page_dirty_buffers,
+ .dirty_folio = block_dirty_folio,
+ .invalidate_folio = block_invalidate_folio,
.readpage = hfsplus_readpage,
.writepage = hfsplus_writepage,
.write_begin = hfsplus_write_begin,
@@ -166,7 +167,8 @@ const struct address_space_operations hfsplus_btree_aops = {
};
const struct address_space_operations hfsplus_aops = {
- .set_page_dirty = __set_page_dirty_buffers,
+ .dirty_folio = block_dirty_folio,
+ .invalidate_folio = block_invalidate_folio,
.readpage = hfsplus_readpage,
.writepage = hfsplus_writepage,
.write_begin = hfsplus_write_begin,
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index b9e3db3f855f..8479add998b5 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -624,7 +624,7 @@ static struct inode *hfsplus_alloc_inode(struct super_block *sb)
{
struct hfsplus_inode_info *i;
- i = kmem_cache_alloc(hfsplus_inode_cachep, GFP_KERNEL);
+ i = alloc_inode_sb(sb, hfsplus_inode_cachep, GFP_KERNEL);
return i ? &i->vfs_inode : NULL;
}
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index ef481c3d9019..14f9ac973a2e 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -14,6 +14,7 @@
#include <linux/statfs.h>
#include <linux/slab.h>
#include <linux/seq_file.h>
+#include <linux/writeback.h>
#include <linux/mount.h>
#include <linux/namei.h>
#include "hostfs.h"
@@ -222,7 +223,7 @@ static struct inode *hostfs_alloc_inode(struct super_block *sb)
{
struct hostfs_inode_info *hi;
- hi = kmem_cache_alloc(hostfs_inode_cache, GFP_KERNEL_ACCOUNT);
+ hi = alloc_inode_sb(sb, hostfs_inode_cache, GFP_KERNEL_ACCOUNT);
if (hi == NULL)
return NULL;
hi->fd = -1;
@@ -504,7 +505,7 @@ static int hostfs_write_end(struct file *file, struct address_space *mapping,
static const struct address_space_operations hostfs_aops = {
.writepage = hostfs_writepage,
.readpage = hostfs_readpage,
- .set_page_dirty = __set_page_dirty_nobuffers,
+ .dirty_folio = filemap_dirty_folio,
.write_begin = hostfs_write_begin,
.write_end = hostfs_write_end,
};
diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c
index fb37f57130aa..99493a23c5d0 100644
--- a/fs/hpfs/file.c
+++ b/fs/hpfs/file.c
@@ -245,7 +245,8 @@ static int hpfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
}
const struct address_space_operations hpfs_aops = {
- .set_page_dirty = __set_page_dirty_buffers,
+ .dirty_folio = block_dirty_folio,
+ .invalidate_folio = block_invalidate_folio,
.readpage = hpfs_readpage,
.writepage = hpfs_writepage,
.readahead = hpfs_readahead,
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index a7dbfc892022..1cb89595b875 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -232,7 +232,7 @@ static struct kmem_cache * hpfs_inode_cachep;
static struct inode *hpfs_alloc_inode(struct super_block *sb)
{
struct hpfs_inode_info *ei;
- ei = kmem_cache_alloc(hpfs_inode_cachep, GFP_NOFS);
+ ei = alloc_inode_sb(sb, hpfs_inode_cachep, GFP_NOFS);
if (!ei)
return NULL;
return &ei->vfs_inode;
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index a7c6c7498be0..99c7477cee5c 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -1110,7 +1110,7 @@ static struct inode *hugetlbfs_alloc_inode(struct super_block *sb)
if (unlikely(!hugetlbfs_dec_free_inodes(sbinfo)))
return NULL;
- p = kmem_cache_alloc(hugetlbfs_inode_cachep, GFP_KERNEL);
+ p = alloc_inode_sb(sb, hugetlbfs_inode_cachep, GFP_KERNEL);
if (unlikely(!p)) {
hugetlbfs_inc_free_inodes(sbinfo);
return NULL;
@@ -1144,7 +1144,7 @@ static void hugetlbfs_destroy_inode(struct inode *inode)
static const struct address_space_operations hugetlbfs_aops = {
.write_begin = hugetlbfs_write_begin,
.write_end = hugetlbfs_write_end,
- .set_page_dirty = __set_page_dirty_no_writeback,
+ .dirty_folio = noop_dirty_folio,
.migratepage = hugetlbfs_migrate_page,
.error_remove_page = hugetlbfs_error_remove_page,
};
diff --git a/fs/inode.c b/fs/inode.c
index 63324df6fa27..9d9b422504d1 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -259,7 +259,7 @@ static struct inode *alloc_inode(struct super_block *sb)
if (ops->alloc_inode)
inode = ops->alloc_inode(sb);
else
- inode = kmem_cache_alloc(inode_cachep, GFP_KERNEL);
+ inode = alloc_inode_sb(sb, inode_cachep, GFP_KERNEL);
if (!inode)
return NULL;
diff --git a/fs/internal.h b/fs/internal.h
index 8590c973c2f4..fb2c2ea807d7 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -158,11 +158,6 @@ extern void dput_to_list(struct dentry *, struct list_head *);
extern void shrink_dentry_list(struct list_head *);
/*
- * read_write.c
- */
-extern int rw_verify_area(int, struct file *, const loff_t *, size_t);
-
-/*
* pipe.c
*/
extern const struct file_operations pipefifo_fops;
@@ -184,7 +179,9 @@ int sb_init_dio_done_wq(struct super_block *sb);
/*
* fs/stat.c:
*/
-int do_statx(int dfd, const char __user *filename, unsigned flags,
+
+int getname_statx_lookup_flags(int flags);
+int do_statx(int dfd, struct filename *filename, unsigned int flags,
unsigned int mask, struct statx __user *buffer);
/*
diff --git a/fs/io-wq.c b/fs/io-wq.c
index bb7f161bb19c..5b93fa67d346 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -76,6 +76,7 @@ struct io_wqe_acct {
unsigned max_workers;
int index;
atomic_t nr_running;
+ raw_spinlock_t lock;
struct io_wq_work_list work_list;
unsigned long flags;
};
@@ -91,7 +92,7 @@ enum {
*/
struct io_wqe {
raw_spinlock_t lock;
- struct io_wqe_acct acct[2];
+ struct io_wqe_acct acct[IO_WQ_ACCT_NR];
int node;
@@ -224,12 +225,12 @@ static void io_worker_exit(struct io_worker *worker)
if (worker->flags & IO_WORKER_F_FREE)
hlist_nulls_del_rcu(&worker->nulls_node);
list_del_rcu(&worker->all_list);
- preempt_disable();
+ raw_spin_unlock(&wqe->lock);
io_wqe_dec_running(worker);
worker->flags = 0;
+ preempt_disable();
current->flags &= ~PF_IO_WORKER;
preempt_enable();
- raw_spin_unlock(&wqe->lock);
kfree_rcu(worker, rcu);
io_worker_ref_put(wqe->wq);
@@ -238,10 +239,15 @@ static void io_worker_exit(struct io_worker *worker)
static inline bool io_acct_run_queue(struct io_wqe_acct *acct)
{
+ bool ret = false;
+
+ raw_spin_lock(&acct->lock);
if (!wq_list_empty(&acct->work_list) &&
!test_bit(IO_ACCT_STALLED_BIT, &acct->flags))
- return true;
- return false;
+ ret = true;
+ raw_spin_unlock(&acct->lock);
+
+ return ret;
}
/*
@@ -385,7 +391,6 @@ fail:
}
static void io_wqe_dec_running(struct io_worker *worker)
- __must_hold(wqe->lock)
{
struct io_wqe_acct *acct = io_wqe_get_acct(worker);
struct io_wqe *wqe = worker->wqe;
@@ -393,13 +398,14 @@ static void io_wqe_dec_running(struct io_worker *worker)
if (!(worker->flags & IO_WORKER_F_UP))
return;
- if (atomic_dec_and_test(&acct->nr_running) && io_acct_run_queue(acct)) {
- atomic_inc(&acct->nr_running);
- atomic_inc(&wqe->wq->worker_refs);
- raw_spin_unlock(&wqe->lock);
- io_queue_worker_create(worker, acct, create_worker_cb);
- raw_spin_lock(&wqe->lock);
- }
+ if (!atomic_dec_and_test(&acct->nr_running))
+ return;
+ if (!io_acct_run_queue(acct))
+ return;
+
+ atomic_inc(&acct->nr_running);
+ atomic_inc(&wqe->wq->worker_refs);
+ io_queue_worker_create(worker, acct, create_worker_cb);
}
/*
@@ -407,11 +413,12 @@ static void io_wqe_dec_running(struct io_worker *worker)
* it's currently on the freelist
*/
static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker)
- __must_hold(wqe->lock)
{
if (worker->flags & IO_WORKER_F_FREE) {
worker->flags &= ~IO_WORKER_F_FREE;
+ raw_spin_lock(&wqe->lock);
hlist_nulls_del_init_rcu(&worker->nulls_node);
+ raw_spin_unlock(&wqe->lock);
}
}
@@ -456,7 +463,7 @@ static bool io_wait_on_hash(struct io_wqe *wqe, unsigned int hash)
static struct io_wq_work *io_get_next_work(struct io_wqe_acct *acct,
struct io_worker *worker)
- __must_hold(wqe->lock)
+ __must_hold(acct->lock)
{
struct io_wq_work_node *node, *prev;
struct io_wq_work *work, *tail;
@@ -498,9 +505,9 @@ static struct io_wq_work *io_get_next_work(struct io_wqe_acct *acct,
* work being added and clearing the stalled bit.
*/
set_bit(IO_ACCT_STALLED_BIT, &acct->flags);
- raw_spin_unlock(&wqe->lock);
+ raw_spin_unlock(&acct->lock);
unstalled = io_wait_on_hash(wqe, stall_hash);
- raw_spin_lock(&wqe->lock);
+ raw_spin_lock(&acct->lock);
if (unstalled) {
clear_bit(IO_ACCT_STALLED_BIT, &acct->flags);
if (wq_has_sleeper(&wqe->wq->hash->wait))
@@ -538,7 +545,6 @@ static void io_assign_current_work(struct io_worker *worker,
static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work);
static void io_worker_handle_work(struct io_worker *worker)
- __releases(wqe->lock)
{
struct io_wqe_acct *acct = io_wqe_get_acct(worker);
struct io_wqe *wqe = worker->wqe;
@@ -555,7 +561,9 @@ static void io_worker_handle_work(struct io_worker *worker)
* can't make progress, any work completion or insertion will
* clear the stalled flag.
*/
+ raw_spin_lock(&acct->lock);
work = io_get_next_work(acct, worker);
+ raw_spin_unlock(&acct->lock);
if (work) {
__io_worker_busy(wqe, worker);
@@ -569,10 +577,9 @@ static void io_worker_handle_work(struct io_worker *worker)
raw_spin_lock(&worker->lock);
worker->next_work = work;
raw_spin_unlock(&worker->lock);
- }
- raw_spin_unlock(&wqe->lock);
- if (!work)
+ } else {
break;
+ }
io_assign_current_work(worker, work);
__set_current_state(TASK_RUNNING);
@@ -608,8 +615,6 @@ static void io_worker_handle_work(struct io_worker *worker)
wake_up(&wq->hash->wait);
}
} while (work);
-
- raw_spin_lock(&wqe->lock);
} while (1);
}
@@ -633,12 +638,10 @@ static int io_wqe_worker(void *data)
long ret;
set_current_state(TASK_INTERRUPTIBLE);
-loop:
- raw_spin_lock(&wqe->lock);
- if (io_acct_run_queue(acct)) {
+ while (io_acct_run_queue(acct))
io_worker_handle_work(worker);
- goto loop;
- }
+
+ raw_spin_lock(&wqe->lock);
/* timed out, exit unless we're the last worker */
if (last_timeout && acct->nr_workers > 1) {
acct->nr_workers--;
@@ -662,10 +665,8 @@ loop:
last_timeout = !ret;
}
- if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) {
- raw_spin_lock(&wqe->lock);
+ if (test_bit(IO_WQ_BIT_EXIT, &wq->state))
io_worker_handle_work(worker);
- }
audit_free(current);
io_worker_exit(worker);
@@ -705,10 +706,7 @@ void io_wq_worker_sleeping(struct task_struct *tsk)
return;
worker->flags &= ~IO_WORKER_F_RUNNING;
-
- raw_spin_lock(&worker->wqe->lock);
io_wqe_dec_running(worker);
- raw_spin_unlock(&worker->wqe->lock);
}
static void io_init_new_worker(struct io_wqe *wqe, struct io_worker *worker,
@@ -778,10 +776,12 @@ static void create_worker_cont(struct callback_head *cb)
.cancel_all = true,
};
+ raw_spin_unlock(&wqe->lock);
while (io_acct_cancel_pending_work(wqe, acct, &match))
- raw_spin_lock(&wqe->lock);
+ ;
+ } else {
+ raw_spin_unlock(&wqe->lock);
}
- raw_spin_unlock(&wqe->lock);
io_worker_ref_put(wqe->wq);
kfree(worker);
return;
@@ -914,6 +914,7 @@ static bool io_wq_work_match_item(struct io_wq_work *work, void *data)
static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
{
struct io_wqe_acct *acct = io_work_get_acct(wqe, work);
+ struct io_cb_cancel_data match;
unsigned work_flags = work->flags;
bool do_create;
@@ -927,10 +928,12 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
return;
}
- raw_spin_lock(&wqe->lock);
+ raw_spin_lock(&acct->lock);
io_wqe_insert_work(wqe, work);
clear_bit(IO_ACCT_STALLED_BIT, &acct->flags);
+ raw_spin_unlock(&acct->lock);
+ raw_spin_lock(&wqe->lock);
rcu_read_lock();
do_create = !io_wqe_activate_free_worker(wqe, acct);
rcu_read_unlock();
@@ -946,18 +949,18 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
return;
raw_spin_lock(&wqe->lock);
- /* fatal condition, failed to create the first worker */
- if (!acct->nr_workers) {
- struct io_cb_cancel_data match = {
- .fn = io_wq_work_match_item,
- .data = work,
- .cancel_all = false,
- };
-
- if (io_acct_cancel_pending_work(wqe, acct, &match))
- raw_spin_lock(&wqe->lock);
+ if (acct->nr_workers) {
+ raw_spin_unlock(&wqe->lock);
+ return;
}
raw_spin_unlock(&wqe->lock);
+
+ /* fatal condition, failed to create the first worker */
+ match.fn = io_wq_work_match_item,
+ match.data = work,
+ match.cancel_all = false,
+
+ io_acct_cancel_pending_work(wqe, acct, &match);
}
}
@@ -1032,22 +1035,23 @@ static inline void io_wqe_remove_pending(struct io_wqe *wqe,
static bool io_acct_cancel_pending_work(struct io_wqe *wqe,
struct io_wqe_acct *acct,
struct io_cb_cancel_data *match)
- __releases(wqe->lock)
{
struct io_wq_work_node *node, *prev;
struct io_wq_work *work;
+ raw_spin_lock(&acct->lock);
wq_list_for_each(node, prev, &acct->work_list) {
work = container_of(node, struct io_wq_work, list);
if (!match->fn(work, match->data))
continue;
io_wqe_remove_pending(wqe, work, prev);
- raw_spin_unlock(&wqe->lock);
+ raw_spin_unlock(&acct->lock);
io_run_cancel(work, wqe);
match->nr_pending++;
/* not safe to continue after unlock */
return true;
}
+ raw_spin_unlock(&acct->lock);
return false;
}
@@ -1061,7 +1065,6 @@ retry:
struct io_wqe_acct *acct = io_get_acct(wqe, i == 0);
if (io_acct_cancel_pending_work(wqe, acct, match)) {
- raw_spin_lock(&wqe->lock);
if (match->cancel_all)
goto retry;
break;
@@ -1103,13 +1106,11 @@ enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
for_each_node(node) {
struct io_wqe *wqe = wq->wqes[node];
- raw_spin_lock(&wqe->lock);
io_wqe_cancel_pending_work(wqe, &match);
- if (match.nr_pending && !match.cancel_all) {
- raw_spin_unlock(&wqe->lock);
+ if (match.nr_pending && !match.cancel_all)
return IO_WQ_CANCEL_OK;
- }
+ raw_spin_lock(&wqe->lock);
io_wqe_cancel_running_work(wqe, &match);
raw_spin_unlock(&wqe->lock);
if (match.nr_running && !match.cancel_all)
@@ -1190,6 +1191,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
acct->index = i;
atomic_set(&acct->nr_running, 0);
INIT_WQ_LIST(&acct->work_list);
+ raw_spin_lock_init(&acct->lock);
}
wqe->wq = wq;
raw_spin_lock_init(&wqe->lock);
@@ -1282,9 +1284,7 @@ static void io_wq_destroy(struct io_wq *wq)
.fn = io_wq_work_match_all,
.cancel_all = true,
};
- raw_spin_lock(&wqe->lock);
io_wqe_cancel_pending_work(wqe, &match);
- raw_spin_unlock(&wqe->lock);
free_cpumask_var(wqe->cpu_mask);
kfree(wqe);
}
@@ -1376,7 +1376,7 @@ int io_wq_max_workers(struct io_wq *wq, int *new_count)
BUILD_BUG_ON((int) IO_WQ_ACCT_UNBOUND != (int) IO_WQ_UNBOUND);
BUILD_BUG_ON((int) IO_WQ_ACCT_NR != 2);
- for (i = 0; i < 2; i++) {
+ for (i = 0; i < IO_WQ_ACCT_NR; i++) {
if (new_count[i] > task_rlimit(current, RLIMIT_NPROC))
new_count[i] = task_rlimit(current, RLIMIT_NPROC);
}
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 2e04f718319d..59e54a6854b7 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -63,6 +63,7 @@
#include <net/sock.h>
#include <net/af_unix.h>
#include <net/scm.h>
+#include <net/busy_poll.h>
#include <linux/anon_inodes.h>
#include <linux/sched/mm.h>
#include <linux/uaccess.h>
@@ -263,11 +264,18 @@ struct io_rsrc_data {
bool quiesce;
};
+struct io_buffer_list {
+ struct list_head list;
+ struct list_head buf_list;
+ __u16 bgid;
+};
+
struct io_buffer {
struct list_head list;
__u64 addr;
__u32 len;
__u16 bid;
+ __u16 bgid;
};
struct io_restriction {
@@ -326,6 +334,14 @@ struct io_submit_state {
struct blk_plug plug;
};
+struct io_ev_fd {
+ struct eventfd_ctx *cq_ev_fd;
+ unsigned int eventfd_async: 1;
+ struct rcu_head rcu;
+};
+
+#define IO_BUFFERS_HASH_BITS 5
+
struct io_ring_ctx {
/* const or read-mostly hot data */
struct {
@@ -335,11 +351,11 @@ struct io_ring_ctx {
unsigned int flags;
unsigned int compat: 1;
unsigned int drain_next: 1;
- unsigned int eventfd_async: 1;
unsigned int restricted: 1;
unsigned int off_timeout_used: 1;
unsigned int drain_active: 1;
unsigned int drain_disabled: 1;
+ unsigned int has_evfd: 1;
} ____cacheline_aligned_in_smp;
/* submission data */
@@ -378,7 +394,9 @@ struct io_ring_ctx {
struct list_head timeout_list;
struct list_head ltimeout_list;
struct list_head cq_overflow_list;
- struct xarray io_buffers;
+ struct list_head *io_buffers;
+ struct list_head io_buffers_cache;
+ struct list_head apoll_cache;
struct xarray personalities;
u32 pers_next;
unsigned sq_thread_idle;
@@ -395,11 +413,16 @@ struct io_ring_ctx {
struct list_head sqd_list;
unsigned long check_cq_overflow;
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ /* used to track busy poll napi_id */
+ struct list_head napi_list;
+ spinlock_t napi_lock; /* napi_list lock */
+#endif
struct {
unsigned cached_cq_tail;
unsigned cq_entries;
- struct eventfd_ctx *cq_ev_fd;
+ struct io_ev_fd __rcu *io_ev_fd;
struct wait_queue_head cq_wait;
unsigned cq_extra;
atomic_t cq_timeouts;
@@ -421,6 +444,8 @@ struct io_ring_ctx {
struct hlist_head *cancel_hash;
unsigned cancel_hash_bits;
bool poll_multi_queue;
+
+ struct list_head io_buffers_comp;
} ____cacheline_aligned_in_smp;
struct io_restriction restrictions;
@@ -436,6 +461,8 @@ struct io_ring_ctx {
struct llist_head rsrc_put_llist;
struct list_head rsrc_ref_list;
spinlock_t rsrc_ref_lock;
+
+ struct list_head io_buffers_pages;
};
/* Keep this last, we don't need it for the fast path */
@@ -461,6 +488,11 @@ struct io_ring_ctx {
};
};
+/*
+ * Arbitrary limit, can be raised if need be
+ */
+#define IO_RINGFD_REG_MAX 16
+
struct io_uring_task {
/* submission side */
int cached_refs;
@@ -476,6 +508,7 @@ struct io_uring_task {
struct io_wq_work_list task_list;
struct io_wq_work_list prior_task_list;
struct callback_head task_work;
+ struct file **registered_rings;
bool task_running;
};
@@ -642,7 +675,7 @@ struct io_statx {
int dfd;
unsigned int mask;
unsigned int flags;
- const char __user *filename;
+ struct filename *filename;
struct statx __user *buffer;
};
@@ -690,6 +723,12 @@ struct io_hardlink {
int flags;
};
+struct io_msg {
+ struct file *file;
+ u64 user_data;
+ u32 len;
+};
+
struct io_async_connect {
struct sockaddr_storage address;
};
@@ -741,6 +780,8 @@ enum {
REQ_F_ARM_LTIMEOUT_BIT,
REQ_F_ASYNC_DATA_BIT,
REQ_F_SKIP_LINK_CQES_BIT,
+ REQ_F_SINGLE_POLL_BIT,
+ REQ_F_DOUBLE_POLL_BIT,
/* keep async read/write and isreg together and in order */
REQ_F_SUPPORT_NOWAIT_BIT,
REQ_F_ISREG_BIT,
@@ -799,6 +840,10 @@ enum {
REQ_F_ASYNC_DATA = BIT(REQ_F_ASYNC_DATA_BIT),
/* don't post CQEs while failing linked requests */
REQ_F_SKIP_LINK_CQES = BIT(REQ_F_SKIP_LINK_CQES_BIT),
+ /* single poll may be active */
+ REQ_F_SINGLE_POLL = BIT(REQ_F_SINGLE_POLL_BIT),
+ /* double poll may active */
+ REQ_F_DOUBLE_POLL = BIT(REQ_F_DOUBLE_POLL_BIT),
};
struct async_poll {
@@ -825,7 +870,7 @@ enum {
* NOTE! Each of the iocb union members has the file pointer
* as the first entry in their struct definition. So you can
* access the file pointer through any of the sub-structs,
- * or directly as just 'ki_filp' in this struct.
+ * or directly as just 'file' in this struct.
*/
struct io_kiocb {
union {
@@ -855,6 +900,7 @@ struct io_kiocb {
struct io_mkdir mkdir;
struct io_symlink symlink;
struct io_hardlink hardlink;
+ struct io_msg msg;
};
u8 opcode;
@@ -877,6 +923,7 @@ struct io_kiocb {
/* used by request caches, completion batching and iopoll */
struct io_wq_work_node comp_list;
atomic_t refs;
+ atomic_t poll_refs;
struct io_kiocb *link;
struct io_task_work io_task_work;
/* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
@@ -885,12 +932,11 @@ struct io_kiocb {
struct async_poll *apoll;
/* opcode allocated if it needs to store data for async defer */
void *async_data;
- struct io_wq_work work;
/* custom credentials, valid IFF REQ_F_CREDS is set */
- const struct cred *creds;
/* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */
struct io_buffer *kbuf;
- atomic_t poll_refs;
+ const struct cred *creds;
+ struct io_wq_work work;
};
struct io_tctx_node {
@@ -1105,6 +1151,9 @@ static const struct io_op_def io_op_defs[] = {
[IORING_OP_MKDIRAT] = {},
[IORING_OP_SYMLINKAT] = {},
[IORING_OP_LINKAT] = {},
+ [IORING_OP_MSG_RING] = {
+ .needs_file = 1,
+ },
};
/* requests with any of those set should undergo io_disarm_next() */
@@ -1141,6 +1190,7 @@ static int io_install_fixed_file(struct io_kiocb *req, struct file *file,
static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags);
static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer);
+static void io_eventfd_signal(struct io_ring_ctx *ctx);
static struct kmem_cache *req_cachep;
@@ -1267,36 +1317,88 @@ static inline void io_req_set_rsrc_node(struct io_kiocb *req,
}
}
-static unsigned int __io_put_kbuf(struct io_kiocb *req)
+static unsigned int __io_put_kbuf(struct io_kiocb *req, struct list_head *list)
{
struct io_buffer *kbuf = req->kbuf;
unsigned int cflags;
- cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT;
- cflags |= IORING_CQE_F_BUFFER;
+ cflags = IORING_CQE_F_BUFFER | (kbuf->bid << IORING_CQE_BUFFER_SHIFT);
req->flags &= ~REQ_F_BUFFER_SELECTED;
- kfree(kbuf);
+ list_add(&kbuf->list, list);
req->kbuf = NULL;
return cflags;
}
-static inline unsigned int io_put_kbuf(struct io_kiocb *req)
+static inline unsigned int io_put_kbuf_comp(struct io_kiocb *req)
{
if (likely(!(req->flags & REQ_F_BUFFER_SELECTED)))
return 0;
- return __io_put_kbuf(req);
+ return __io_put_kbuf(req, &req->ctx->io_buffers_comp);
}
-static void io_refs_resurrect(struct percpu_ref *ref, struct completion *compl)
+static inline unsigned int io_put_kbuf(struct io_kiocb *req,
+ unsigned issue_flags)
{
- bool got = percpu_ref_tryget(ref);
+ unsigned int cflags;
- /* already at zero, wait for ->release() */
- if (!got)
- wait_for_completion(compl);
- percpu_ref_resurrect(ref);
- if (got)
- percpu_ref_put(ref);
+ if (likely(!(req->flags & REQ_F_BUFFER_SELECTED)))
+ return 0;
+
+ /*
+ * We can add this buffer back to two lists:
+ *
+ * 1) The io_buffers_cache list. This one is protected by the
+ * ctx->uring_lock. If we already hold this lock, add back to this
+ * list as we can grab it from issue as well.
+ * 2) The io_buffers_comp list. This one is protected by the
+ * ctx->completion_lock.
+ *
+ * We migrate buffers from the comp_list to the issue cache list
+ * when we need one.
+ */
+ if (issue_flags & IO_URING_F_UNLOCKED) {
+ struct io_ring_ctx *ctx = req->ctx;
+
+ spin_lock(&ctx->completion_lock);
+ cflags = __io_put_kbuf(req, &ctx->io_buffers_comp);
+ spin_unlock(&ctx->completion_lock);
+ } else {
+ cflags = __io_put_kbuf(req, &req->ctx->io_buffers_cache);
+ }
+
+ return cflags;
+}
+
+static struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx,
+ unsigned int bgid)
+{
+ struct list_head *hash_list;
+ struct io_buffer_list *bl;
+
+ hash_list = &ctx->io_buffers[hash_32(bgid, IO_BUFFERS_HASH_BITS)];
+ list_for_each_entry(bl, hash_list, list)
+ if (bl->bgid == bgid || bgid == -1U)
+ return bl;
+
+ return NULL;
+}
+
+static void io_kbuf_recycle(struct io_kiocb *req)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+ struct io_buffer_list *bl;
+ struct io_buffer *buf;
+
+ if (likely(!(req->flags & REQ_F_BUFFER_SELECTED)))
+ return;
+
+ lockdep_assert_held(&ctx->uring_lock);
+
+ buf = req->kbuf;
+ bl = io_buffer_get_list(ctx, buf->bgid);
+ list_add(&buf->list, &bl->buf_list);
+ req->flags &= ~REQ_F_BUFFER_SELECTED;
+ req->kbuf = NULL;
}
static bool io_match_task(struct io_kiocb *head, struct task_struct *task,
@@ -1409,7 +1511,7 @@ static __cold void io_fallback_req_func(struct work_struct *work)
static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
{
struct io_ring_ctx *ctx;
- int hash_bits;
+ int i, hash_bits;
ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
if (!ctx)
@@ -1436,6 +1538,13 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
/* set invalid range, so io_import_fixed() fails meeting it */
ctx->dummy_ubuf->ubuf = -1UL;
+ ctx->io_buffers = kcalloc(1U << IO_BUFFERS_HASH_BITS,
+ sizeof(struct list_head), GFP_KERNEL);
+ if (!ctx->io_buffers)
+ goto err;
+ for (i = 0; i < (1U << IO_BUFFERS_HASH_BITS); i++)
+ INIT_LIST_HEAD(&ctx->io_buffers[i]);
+
if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
goto err;
@@ -1444,14 +1553,17 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
init_waitqueue_head(&ctx->sqo_sq_wait);
INIT_LIST_HEAD(&ctx->sqd_list);
INIT_LIST_HEAD(&ctx->cq_overflow_list);
+ INIT_LIST_HEAD(&ctx->io_buffers_cache);
+ INIT_LIST_HEAD(&ctx->apoll_cache);
init_completion(&ctx->ref_comp);
- xa_init_flags(&ctx->io_buffers, XA_FLAGS_ALLOC1);
xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1);
mutex_init(&ctx->uring_lock);
init_waitqueue_head(&ctx->cq_wait);
spin_lock_init(&ctx->completion_lock);
spin_lock_init(&ctx->timeout_lock);
INIT_WQ_LIST(&ctx->iopoll_list);
+ INIT_LIST_HEAD(&ctx->io_buffers_pages);
+ INIT_LIST_HEAD(&ctx->io_buffers_comp);
INIT_LIST_HEAD(&ctx->defer_list);
INIT_LIST_HEAD(&ctx->timeout_list);
INIT_LIST_HEAD(&ctx->ltimeout_list);
@@ -1464,10 +1576,15 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
INIT_WQ_LIST(&ctx->locked_free_list);
INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func);
INIT_WQ_LIST(&ctx->submit_state.compl_reqs);
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ INIT_LIST_HEAD(&ctx->napi_list);
+ spin_lock_init(&ctx->napi_lock);
+#endif
return ctx;
err:
kfree(ctx->dummy_ubuf);
kfree(ctx->cancel_hash);
+ kfree(ctx->io_buffers);
kfree(ctx);
return NULL;
}
@@ -1610,8 +1727,8 @@ static void io_queue_async_work(struct io_kiocb *req, bool *dont_use)
if (WARN_ON_ONCE(!same_thread_group(req->task, current)))
req->work.flags |= IO_WQ_WORK_CANCEL;
- trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req,
- &req->work, req->flags);
+ trace_io_uring_queue_async_work(ctx, req, req->user_data, req->opcode, req->flags,
+ &req->work, io_wq_is_hashed(&req->work));
io_wq_enqueue(tctx->io_wq, &req->work);
if (link)
io_queue_linked_timeout(link);
@@ -1681,22 +1798,27 @@ static __cold void io_flush_timeouts(struct io_ring_ctx *ctx)
spin_unlock_irq(&ctx->timeout_lock);
}
-static __cold void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
-{
- if (ctx->off_timeout_used)
- io_flush_timeouts(ctx);
- if (ctx->drain_active)
- io_queue_deferred(ctx);
-}
-
static inline void io_commit_cqring(struct io_ring_ctx *ctx)
{
- if (unlikely(ctx->off_timeout_used || ctx->drain_active))
- __io_commit_cqring_flush(ctx);
/* order cqe stores with ring update */
smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail);
}
+static void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
+{
+ if (ctx->off_timeout_used || ctx->drain_active) {
+ spin_lock(&ctx->completion_lock);
+ if (ctx->off_timeout_used)
+ io_flush_timeouts(ctx);
+ if (ctx->drain_active)
+ io_queue_deferred(ctx);
+ io_commit_cqring(ctx);
+ spin_unlock(&ctx->completion_lock);
+ }
+ if (ctx->has_evfd)
+ io_eventfd_signal(ctx);
+}
+
static inline bool io_sqring_full(struct io_ring_ctx *ctx)
{
struct io_rings *r = ctx->rings;
@@ -1726,23 +1848,34 @@ static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx)
return &rings->cqes[tail & mask];
}
-static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx)
+static void io_eventfd_signal(struct io_ring_ctx *ctx)
{
- if (likely(!ctx->cq_ev_fd))
- return false;
+ struct io_ev_fd *ev_fd;
+
+ rcu_read_lock();
+ /*
+ * rcu_dereference ctx->io_ev_fd once and use it for both for checking
+ * and eventfd_signal
+ */
+ ev_fd = rcu_dereference(ctx->io_ev_fd);
+
+ /*
+ * Check again if ev_fd exists incase an io_eventfd_unregister call
+ * completed between the NULL check of ctx->io_ev_fd at the start of
+ * the function and rcu_read_lock.
+ */
+ if (unlikely(!ev_fd))
+ goto out;
if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
- return false;
- return !ctx->eventfd_async || io_wq_current_is_worker();
+ goto out;
+
+ if (!ev_fd->eventfd_async || io_wq_current_is_worker())
+ eventfd_signal(ev_fd->cq_ev_fd, 1);
+out:
+ rcu_read_unlock();
}
-/*
- * This should only get called when at least one event has been posted.
- * Some applications rely on the eventfd notification count only changing
- * IFF a new CQE has been added to the CQ ring. There's no depedency on
- * 1:1 relationship between how many times this function is called (and
- * hence the eventfd count) and number of CQEs posted to the CQ ring.
- */
-static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
+static inline void io_cqring_wake(struct io_ring_ctx *ctx)
{
/*
* wake_up_all() may seem excessive, but io_wake_function() and
@@ -1751,21 +1884,32 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
*/
if (wq_has_sleeper(&ctx->cq_wait))
wake_up_all(&ctx->cq_wait);
- if (io_should_trigger_evfd(ctx))
- eventfd_signal(ctx->cq_ev_fd, 1);
+}
+
+/*
+ * This should only get called when at least one event has been posted.
+ * Some applications rely on the eventfd notification count only changing
+ * IFF a new CQE has been added to the CQ ring. There's no depedency on
+ * 1:1 relationship between how many times this function is called (and
+ * hence the eventfd count) and number of CQEs posted to the CQ ring.
+ */
+static inline void io_cqring_ev_posted(struct io_ring_ctx *ctx)
+{
+ if (unlikely(ctx->off_timeout_used || ctx->drain_active ||
+ ctx->has_evfd))
+ __io_commit_cqring_flush(ctx);
+
+ io_cqring_wake(ctx);
}
static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
{
- /* see waitqueue_active() comment */
- smp_mb();
+ if (unlikely(ctx->off_timeout_used || ctx->drain_active ||
+ ctx->has_evfd))
+ __io_commit_cqring_flush(ctx);
- if (ctx->flags & IORING_SETUP_SQPOLL) {
- if (waitqueue_active(&ctx->cq_wait))
- wake_up_all(&ctx->cq_wait);
- }
- if (io_should_trigger_evfd(ctx))
- eventfd_signal(ctx->cq_ev_fd, 1);
+ if (ctx->flags & IORING_SETUP_SQPOLL)
+ io_cqring_wake(ctx);
}
/* Returns true if there are no backlogged entries after the flush */
@@ -1905,8 +2049,6 @@ static inline bool __io_fill_cqe(struct io_ring_ctx *ctx, u64 user_data,
{
struct io_uring_cqe *cqe;
- trace_io_uring_complete(ctx, user_data, res, cflags);
-
/*
* If we can't get a cq entry, userspace overflowed the
* submission (by quite a lot). Increment the overflow count in
@@ -1922,16 +2064,23 @@ static inline bool __io_fill_cqe(struct io_ring_ctx *ctx, u64 user_data,
return io_cqring_event_overflow(ctx, user_data, res, cflags);
}
+static inline bool __io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags)
+{
+ trace_io_uring_complete(req->ctx, req, req->user_data, res, cflags);
+ return __io_fill_cqe(req->ctx, req->user_data, res, cflags);
+}
+
static noinline void io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags)
{
if (!(req->flags & REQ_F_CQE_SKIP))
- __io_fill_cqe(req->ctx, req->user_data, res, cflags);
+ __io_fill_cqe_req(req, res, cflags);
}
static noinline bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data,
s32 res, u32 cflags)
{
ctx->cq_extra++;
+ trace_io_uring_complete(ctx, NULL, user_data, res, cflags);
return __io_fill_cqe(ctx, user_data, res, cflags);
}
@@ -1941,7 +2090,7 @@ static void __io_req_complete_post(struct io_kiocb *req, s32 res,
struct io_ring_ctx *ctx = req->ctx;
if (!(req->flags & REQ_F_CQE_SKIP))
- __io_fill_cqe(ctx, req->user_data, res, cflags);
+ __io_fill_cqe_req(req, res, cflags);
/*
* If we're the last reference to this request, add to our locked
* free_list cache.
@@ -2000,7 +2149,7 @@ static inline void io_req_complete(struct io_kiocb *req, s32 res)
static void io_req_complete_failed(struct io_kiocb *req, s32 res)
{
req_set_fail(req);
- io_req_complete_post(req, res, 0);
+ io_req_complete_post(req, res, io_put_kbuf(req, 0));
}
static void io_req_complete_fail_submit(struct io_kiocb *req)
@@ -2183,7 +2332,9 @@ static void io_fail_links(struct io_kiocb *req)
nxt = link->link;
link->link = NULL;
- trace_io_uring_fail_link(req, link);
+ trace_io_uring_fail_link(req->ctx, req, req->user_data,
+ req->opcode, link);
+
if (!ignore_cqes) {
link->flags &= ~REQ_F_CQE_SKIP;
io_fill_cqe_req(link, res, 0);
@@ -2302,7 +2453,8 @@ static void handle_prev_tw_list(struct io_wq_work_node *node,
if (likely(*uring_locked))
req->io_task_work.func(req, uring_locked);
else
- __io_req_complete_post(req, req->result, io_put_kbuf(req));
+ __io_req_complete_post(req, req->result,
+ io_put_kbuf_comp(req));
node = next;
} while (node);
@@ -2530,8 +2682,16 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx)
comp_list);
if (!(req->flags & REQ_F_CQE_SKIP))
- __io_fill_cqe(ctx, req->user_data, req->result,
- req->cflags);
+ __io_fill_cqe_req(req, req->result, req->cflags);
+ if ((req->flags & REQ_F_POLLED) && req->apoll) {
+ struct async_poll *apoll = req->apoll;
+
+ if (apoll->double_poll)
+ kfree(apoll->double_poll);
+ list_add(&apoll->poll.wait.entry,
+ &ctx->apoll_cache);
+ req->flags &= ~REQ_F_POLLED;
+ }
}
io_commit_cqring(ctx);
@@ -2653,7 +2813,7 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
if (unlikely(req->flags & REQ_F_CQE_SKIP))
continue;
- __io_fill_cqe(ctx, req->user_data, req->result, io_put_kbuf(req));
+ __io_fill_cqe_req(req, req->result, io_put_kbuf(req, 0));
nr_events++;
}
@@ -2829,14 +2989,14 @@ static bool __io_complete_rw_common(struct io_kiocb *req, long res)
static inline void io_req_task_complete(struct io_kiocb *req, bool *locked)
{
- unsigned int cflags = io_put_kbuf(req);
int res = req->result;
if (*locked) {
- io_req_complete_state(req, res, cflags);
+ io_req_complete_state(req, res, io_put_kbuf(req, 0));
io_req_add_compl_list(req);
} else {
- io_req_complete_post(req, res, cflags);
+ io_req_complete_post(req, res,
+ io_put_kbuf(req, IO_URING_F_UNLOCKED));
}
}
@@ -2845,7 +3005,8 @@ static void __io_complete_rw(struct io_kiocb *req, long res,
{
if (__io_complete_rw_common(req, res))
return;
- __io_req_complete(req, issue_flags, req->result, io_put_kbuf(req));
+ __io_req_complete(req, issue_flags, req->result,
+ io_put_kbuf(req, issue_flags));
}
static void io_complete_rw(struct kiocb *kiocb, long res)
@@ -3000,14 +3161,6 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
req->flags |= io_file_get_flags(file) << REQ_F_SUPPORT_NOWAIT_BIT;
kiocb->ki_pos = READ_ONCE(sqe->off);
- if (kiocb->ki_pos == -1) {
- if (!(file->f_mode & FMODE_STREAM)) {
- req->flags |= REQ_F_CUR_POS;
- kiocb->ki_pos = file->f_pos;
- } else {
- kiocb->ki_pos = 0;
- }
- }
kiocb->ki_flags = iocb_flags(file);
ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags));
if (unlikely(ret))
@@ -3074,6 +3227,24 @@ static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
}
}
+static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req)
+{
+ struct kiocb *kiocb = &req->rw.kiocb;
+ bool is_stream = req->file->f_mode & FMODE_STREAM;
+
+ if (kiocb->ki_pos == -1) {
+ if (!is_stream) {
+ req->flags |= REQ_F_CUR_POS;
+ kiocb->ki_pos = req->file->f_pos;
+ return &kiocb->ki_pos;
+ } else {
+ kiocb->ki_pos = 0;
+ return NULL;
+ }
+ }
+ return is_stream ? NULL : &kiocb->ki_pos;
+}
+
static void kiocb_done(struct io_kiocb *req, ssize_t ret,
unsigned int issue_flags)
{
@@ -3096,14 +3267,10 @@ static void kiocb_done(struct io_kiocb *req, ssize_t ret,
if (req->flags & REQ_F_REISSUE) {
req->flags &= ~REQ_F_REISSUE;
- if (io_resubmit_prep(req)) {
+ if (io_resubmit_prep(req))
io_req_task_queue_reissue(req);
- } else {
- req_set_fail(req);
- req->result = ret;
- req->io_task_work.func = io_req_task_complete;
- io_req_task_work_add(req, false);
- }
+ else
+ io_req_task_queue_fail(req, ret);
}
}
@@ -3201,30 +3368,36 @@ static void io_ring_submit_lock(struct io_ring_ctx *ctx, bool needs_lock)
mutex_lock(&ctx->uring_lock);
}
+static void io_buffer_add_list(struct io_ring_ctx *ctx,
+ struct io_buffer_list *bl, unsigned int bgid)
+{
+ struct list_head *list;
+
+ list = &ctx->io_buffers[hash_32(bgid, IO_BUFFERS_HASH_BITS)];
+ INIT_LIST_HEAD(&bl->buf_list);
+ bl->bgid = bgid;
+ list_add(&bl->list, list);
+}
+
static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len,
int bgid, unsigned int issue_flags)
{
struct io_buffer *kbuf = req->kbuf;
- struct io_buffer *head;
bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
+ struct io_ring_ctx *ctx = req->ctx;
+ struct io_buffer_list *bl;
if (req->flags & REQ_F_BUFFER_SELECTED)
return kbuf;
- io_ring_submit_lock(req->ctx, needs_lock);
+ io_ring_submit_lock(ctx, needs_lock);
- lockdep_assert_held(&req->ctx->uring_lock);
+ lockdep_assert_held(&ctx->uring_lock);
- head = xa_load(&req->ctx->io_buffers, bgid);
- if (head) {
- if (!list_empty(&head->list)) {
- kbuf = list_last_entry(&head->list, struct io_buffer,
- list);
- list_del(&kbuf->list);
- } else {
- kbuf = head;
- xa_erase(&req->ctx->io_buffers, bgid);
- }
+ bl = io_buffer_get_list(ctx, bgid);
+ if (bl && !list_empty(&bl->buf_list)) {
+ kbuf = list_first_entry(&bl->buf_list, struct io_buffer, list);
+ list_del(&kbuf->list);
if (*len > kbuf->len)
*len = kbuf->len;
req->flags |= REQ_F_BUFFER_SELECTED;
@@ -3400,6 +3573,7 @@ static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter)
struct kiocb *kiocb = &req->rw.kiocb;
struct file *file = req->file;
ssize_t ret = 0;
+ loff_t *ppos;
/*
* Don't support polled IO through this interface, and we can't
@@ -3412,6 +3586,8 @@ static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter)
!(kiocb->ki_filp->f_flags & O_NONBLOCK))
return -EAGAIN;
+ ppos = io_kiocb_ppos(kiocb);
+
while (iov_iter_count(iter)) {
struct iovec iovec;
ssize_t nr;
@@ -3425,10 +3601,10 @@ static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter)
if (rw == READ) {
nr = file->f_op->read(file, iovec.iov_base,
- iovec.iov_len, io_kiocb_ppos(kiocb));
+ iovec.iov_len, ppos);
} else {
nr = file->f_op->write(file, iovec.iov_base,
- iovec.iov_len, io_kiocb_ppos(kiocb));
+ iovec.iov_len, ppos);
}
if (nr < 0) {
@@ -3436,13 +3612,15 @@ static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter)
ret = nr;
break;
}
+ ret += nr;
if (!iov_iter_is_bvec(iter)) {
iov_iter_advance(iter, nr);
} else {
- req->rw.len -= nr;
req->rw.addr += nr;
+ req->rw.len -= nr;
+ if (!req->rw.len)
+ break;
}
- ret += nr;
if (nr != iovec.iov_len)
break;
}
@@ -3629,12 +3807,23 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
struct io_async_rw *rw;
ssize_t ret, ret2;
+ loff_t *ppos;
if (!req_has_async_data(req)) {
ret = io_import_iovec(READ, req, &iovec, s, issue_flags);
if (unlikely(ret < 0))
return ret;
} else {
+ /*
+ * Safe and required to re-import if we're using provided
+ * buffers, as we dropped the selected one before retry.
+ */
+ if (req->flags & REQ_F_BUFFER_SELECT) {
+ ret = io_import_iovec(READ, req, &iovec, s, issue_flags);
+ if (unlikely(ret < 0))
+ return ret;
+ }
+
rw = req->async_data;
s = &rw->s;
/*
@@ -3659,7 +3848,9 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
kiocb->ki_flags &= ~IOCB_NOWAIT;
}
- ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), req->result);
+ ppos = io_kiocb_update_pos(req);
+
+ ret = rw_verify_area(READ, req->file, ppos, req->result);
if (unlikely(ret)) {
kfree(iovec);
return ret;
@@ -3669,6 +3860,9 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) {
req->flags &= ~REQ_F_REISSUE;
+ /* if we can poll, just do that */
+ if (req->opcode == IORING_OP_READ && file_can_poll(req->file))
+ return -EAGAIN;
/* IOPOLL retry should happen for io-wq threads */
if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
goto done;
@@ -3747,7 +3941,6 @@ static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
if (unlikely(!(req->file->f_mode & FMODE_WRITE)))
return -EBADF;
- req->rw.kiocb.ki_hint = ki_hint_validate(file_write_hint(req->file));
return io_prep_rw(req, sqe);
}
@@ -3758,6 +3951,7 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags)
struct kiocb *kiocb = &req->rw.kiocb;
bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
ssize_t ret, ret2;
+ loff_t *ppos;
if (!req_has_async_data(req)) {
ret = io_import_iovec(WRITE, req, &iovec, s, issue_flags);
@@ -3788,7 +3982,9 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags)
kiocb->ki_flags &= ~IOCB_NOWAIT;
}
- ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), req->result);
+ ppos = io_kiocb_update_pos(req);
+
+ ret = rw_verify_area(WRITE, req->file, ppos, req->result);
if (unlikely(ret))
goto out_free;
@@ -4235,6 +4431,45 @@ static int io_nop(struct io_kiocb *req, unsigned int issue_flags)
return 0;
}
+static int io_msg_ring_prep(struct io_kiocb *req,
+ const struct io_uring_sqe *sqe)
+{
+ if (unlikely(sqe->addr || sqe->ioprio || sqe->rw_flags ||
+ sqe->splice_fd_in || sqe->buf_index || sqe->personality))
+ return -EINVAL;
+
+ if (req->file->f_op != &io_uring_fops)
+ return -EBADFD;
+
+ req->msg.user_data = READ_ONCE(sqe->off);
+ req->msg.len = READ_ONCE(sqe->len);
+ return 0;
+}
+
+static int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags)
+{
+ struct io_ring_ctx *target_ctx;
+ struct io_msg *msg = &req->msg;
+ int ret = -EOVERFLOW;
+ bool filled;
+
+ target_ctx = req->file->private_data;
+
+ spin_lock(&target_ctx->completion_lock);
+ filled = io_fill_cqe_aux(target_ctx, msg->user_data, msg->len,
+ IORING_CQE_F_MSG);
+ io_commit_cqring(target_ctx);
+ spin_unlock(&target_ctx->completion_lock);
+
+ if (filled) {
+ io_cqring_ev_posted(target_ctx);
+ ret = 0;
+ }
+
+ __io_req_complete(req, issue_flags, ret, 0);
+ return 0;
+}
+
static int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_ring_ctx *ctx = req->ctx;
@@ -4458,8 +4693,8 @@ static int io_remove_buffers_prep(struct io_kiocb *req,
return 0;
}
-static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf,
- int bgid, unsigned nbufs)
+static int __io_remove_buffers(struct io_ring_ctx *ctx,
+ struct io_buffer_list *bl, unsigned nbufs)
{
unsigned i = 0;
@@ -4468,19 +4703,16 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf,
return 0;
/* the head kbuf is the list itself */
- while (!list_empty(&buf->list)) {
+ while (!list_empty(&bl->buf_list)) {
struct io_buffer *nxt;
- nxt = list_first_entry(&buf->list, struct io_buffer, list);
+ nxt = list_first_entry(&bl->buf_list, struct io_buffer, list);
list_del(&nxt->list);
- kfree(nxt);
if (++i == nbufs)
return i;
cond_resched();
}
i++;
- kfree(buf);
- xa_erase(&ctx->io_buffers, bgid);
return i;
}
@@ -4489,7 +4721,7 @@ static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_provide_buf *p = &req->pbuf;
struct io_ring_ctx *ctx = req->ctx;
- struct io_buffer *head;
+ struct io_buffer_list *bl;
int ret = 0;
bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
@@ -4498,9 +4730,9 @@ static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
lockdep_assert_held(&ctx->uring_lock);
ret = -ENOENT;
- head = xa_load(&ctx->io_buffers, p->bgid);
- if (head)
- ret = __io_remove_buffers(ctx, head, p->bgid, p->nbufs);
+ bl = io_buffer_get_list(ctx, p->bgid);
+ if (bl)
+ ret = __io_remove_buffers(ctx, bl, p->nbufs);
if (ret < 0)
req_set_fail(req);
@@ -4545,38 +4777,80 @@ static int io_provide_buffers_prep(struct io_kiocb *req,
return 0;
}
-static int io_add_buffers(struct io_provide_buf *pbuf, struct io_buffer **head)
+static int io_refill_buffer_cache(struct io_ring_ctx *ctx)
+{
+ struct io_buffer *buf;
+ struct page *page;
+ int bufs_in_page;
+
+ /*
+ * Completions that don't happen inline (eg not under uring_lock) will
+ * add to ->io_buffers_comp. If we don't have any free buffers, check
+ * the completion list and splice those entries first.
+ */
+ if (!list_empty_careful(&ctx->io_buffers_comp)) {
+ spin_lock(&ctx->completion_lock);
+ if (!list_empty(&ctx->io_buffers_comp)) {
+ list_splice_init(&ctx->io_buffers_comp,
+ &ctx->io_buffers_cache);
+ spin_unlock(&ctx->completion_lock);
+ return 0;
+ }
+ spin_unlock(&ctx->completion_lock);
+ }
+
+ /*
+ * No free buffers and no completion entries either. Allocate a new
+ * page worth of buffer entries and add those to our freelist.
+ */
+ page = alloc_page(GFP_KERNEL_ACCOUNT);
+ if (!page)
+ return -ENOMEM;
+
+ list_add(&page->lru, &ctx->io_buffers_pages);
+
+ buf = page_address(page);
+ bufs_in_page = PAGE_SIZE / sizeof(*buf);
+ while (bufs_in_page) {
+ list_add_tail(&buf->list, &ctx->io_buffers_cache);
+ buf++;
+ bufs_in_page--;
+ }
+
+ return 0;
+}
+
+static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf,
+ struct io_buffer_list *bl)
{
struct io_buffer *buf;
u64 addr = pbuf->addr;
int i, bid = pbuf->bid;
for (i = 0; i < pbuf->nbufs; i++) {
- buf = kmalloc(sizeof(*buf), GFP_KERNEL_ACCOUNT);
- if (!buf)
+ if (list_empty(&ctx->io_buffers_cache) &&
+ io_refill_buffer_cache(ctx))
break;
-
+ buf = list_first_entry(&ctx->io_buffers_cache, struct io_buffer,
+ list);
+ list_move_tail(&buf->list, &bl->buf_list);
buf->addr = addr;
buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT);
buf->bid = bid;
+ buf->bgid = pbuf->bgid;
addr += pbuf->len;
bid++;
- if (!*head) {
- INIT_LIST_HEAD(&buf->list);
- *head = buf;
- } else {
- list_add_tail(&buf->list, &(*head)->list);
- }
+ cond_resched();
}
- return i ? i : -ENOMEM;
+ return i ? 0 : -ENOMEM;
}
static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_provide_buf *p = &req->pbuf;
struct io_ring_ctx *ctx = req->ctx;
- struct io_buffer *head, *list;
+ struct io_buffer_list *bl;
int ret = 0;
bool needs_lock = issue_flags & IO_URING_F_UNLOCKED;
@@ -4584,14 +4858,18 @@ static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
lockdep_assert_held(&ctx->uring_lock);
- list = head = xa_load(&ctx->io_buffers, p->bgid);
-
- ret = io_add_buffers(p, &head);
- if (ret >= 0 && !list) {
- ret = xa_insert(&ctx->io_buffers, p->bgid, head, GFP_KERNEL);
- if (ret < 0)
- __io_remove_buffers(ctx, head, p->bgid, -1U);
+ bl = io_buffer_get_list(ctx, p->bgid);
+ if (unlikely(!bl)) {
+ bl = kmalloc(sizeof(*bl), GFP_KERNEL);
+ if (!bl) {
+ ret = -ENOMEM;
+ goto err;
+ }
+ io_buffer_add_list(ctx, bl, p->bgid);
}
+
+ ret = io_add_buffers(ctx, p, bl);
+err:
if (ret < 0)
req_set_fail(req);
/* complete before unlock, IOPOLL may need the lock */
@@ -4721,6 +4999,8 @@ static int io_fadvise(struct io_kiocb *req, unsigned int issue_flags)
static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
+ const char __user *path;
+
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in)
@@ -4730,10 +5010,22 @@ static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
req->statx.dfd = READ_ONCE(sqe->fd);
req->statx.mask = READ_ONCE(sqe->len);
- req->statx.filename = u64_to_user_ptr(READ_ONCE(sqe->addr));
+ path = u64_to_user_ptr(READ_ONCE(sqe->addr));
req->statx.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2));
req->statx.flags = READ_ONCE(sqe->statx_flags);
+ req->statx.filename = getname_flags(path,
+ getname_statx_lookup_flags(req->statx.flags),
+ NULL);
+
+ if (IS_ERR(req->statx.filename)) {
+ int ret = PTR_ERR(req->statx.filename);
+
+ req->statx.filename = NULL;
+ return ret;
+ }
+
+ req->flags |= REQ_F_NEED_CLEANUP;
return 0;
}
@@ -5183,7 +5475,7 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
if (kmsg->free_iov)
kfree(kmsg->free_iov);
req->flags &= ~REQ_F_NEED_CLEANUP;
- __io_req_complete(req, issue_flags, ret, io_put_kbuf(req));
+ __io_req_complete(req, issue_flags, ret, io_put_kbuf(req, issue_flags));
return 0;
}
@@ -5228,7 +5520,6 @@ static int io_recv(struct io_kiocb *req, unsigned int issue_flags)
min_ret = iov_iter_count(&msg.msg_iter);
ret = sock_recvmsg(sock, &msg, flags);
-out_free:
if (ret < min_ret) {
if (ret == -EAGAIN && force_nonblock)
return -EAGAIN;
@@ -5236,10 +5527,11 @@ out_free:
ret = -EINTR;
req_set_fail(req);
} else if ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
+out_free:
req_set_fail(req);
}
- __io_req_complete(req, issue_flags, ret, io_put_kbuf(req));
+ __io_req_complete(req, issue_flags, ret, io_put_kbuf(req, issue_flags));
return 0;
}
@@ -5258,8 +5550,7 @@ static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
accept->nofile = rlimit(RLIMIT_NOFILE);
accept->file_slot = READ_ONCE(sqe->file_index);
- if (accept->file_slot && ((req->open.how.flags & O_CLOEXEC) ||
- (accept->flags & SOCK_CLOEXEC)))
+ if (accept->file_slot && (accept->flags & SOCK_CLOEXEC))
return -EINVAL;
if (accept->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
return -EINVAL;
@@ -5399,6 +5690,108 @@ IO_NETOP_FN(send);
IO_NETOP_FN(recv);
#endif /* CONFIG_NET */
+#ifdef CONFIG_NET_RX_BUSY_POLL
+
+#define NAPI_TIMEOUT (60 * SEC_CONVERSION)
+
+struct napi_entry {
+ struct list_head list;
+ unsigned int napi_id;
+ unsigned long timeout;
+};
+
+/*
+ * Add busy poll NAPI ID from sk.
+ */
+static void io_add_napi(struct file *file, struct io_ring_ctx *ctx)
+{
+ unsigned int napi_id;
+ struct socket *sock;
+ struct sock *sk;
+ struct napi_entry *ne;
+
+ if (!net_busy_loop_on())
+ return;
+
+ sock = sock_from_file(file);
+ if (!sock)
+ return;
+
+ sk = sock->sk;
+ if (!sk)
+ return;
+
+ napi_id = READ_ONCE(sk->sk_napi_id);
+
+ /* Non-NAPI IDs can be rejected */
+ if (napi_id < MIN_NAPI_ID)
+ return;
+
+ spin_lock(&ctx->napi_lock);
+ list_for_each_entry(ne, &ctx->napi_list, list) {
+ if (ne->napi_id == napi_id) {
+ ne->timeout = jiffies + NAPI_TIMEOUT;
+ goto out;
+ }
+ }
+
+ ne = kmalloc(sizeof(*ne), GFP_NOWAIT);
+ if (!ne)
+ goto out;
+
+ ne->napi_id = napi_id;
+ ne->timeout = jiffies + NAPI_TIMEOUT;
+ list_add_tail(&ne->list, &ctx->napi_list);
+out:
+ spin_unlock(&ctx->napi_lock);
+}
+
+static inline void io_check_napi_entry_timeout(struct napi_entry *ne)
+{
+ if (time_after(jiffies, ne->timeout)) {
+ list_del(&ne->list);
+ kfree(ne);
+ }
+}
+
+/*
+ * Busy poll if globally on and supporting sockets found
+ */
+static bool io_napi_busy_loop(struct list_head *napi_list)
+{
+ struct napi_entry *ne, *n;
+
+ list_for_each_entry_safe(ne, n, napi_list, list) {
+ napi_busy_loop(ne->napi_id, NULL, NULL, true,
+ BUSY_POLL_BUDGET);
+ io_check_napi_entry_timeout(ne);
+ }
+ return !list_empty(napi_list);
+}
+
+static void io_free_napi_list(struct io_ring_ctx *ctx)
+{
+ spin_lock(&ctx->napi_lock);
+ while (!list_empty(&ctx->napi_list)) {
+ struct napi_entry *ne =
+ list_first_entry(&ctx->napi_list, struct napi_entry,
+ list);
+
+ list_del(&ne->list);
+ kfree(ne);
+ }
+ spin_unlock(&ctx->napi_lock);
+}
+#else
+static inline void io_add_napi(struct file *file, struct io_ring_ctx *ctx)
+{
+}
+
+static inline void io_free_napi_list(struct io_ring_ctx *ctx)
+{
+}
+#endif /* CONFIG_NET_RX_BUSY_POLL */
+
struct io_poll_table {
struct poll_table_struct pt;
struct io_kiocb *req;
@@ -5474,8 +5867,12 @@ static inline void io_poll_remove_entry(struct io_poll_iocb *poll)
static void io_poll_remove_entries(struct io_kiocb *req)
{
- struct io_poll_iocb *poll = io_poll_get_single(req);
- struct io_poll_iocb *poll_double = io_poll_get_double(req);
+ /*
+ * Nothing to do if neither of those flags are set. Avoid dipping
+ * into the poll/apoll/double cachelines if we can.
+ */
+ if (!(req->flags & (REQ_F_SINGLE_POLL | REQ_F_DOUBLE_POLL)))
+ return;
/*
* While we hold the waitqueue lock and the waitqueue is nonempty,
@@ -5493,9 +5890,10 @@ static void io_poll_remove_entries(struct io_kiocb *req)
* In that case, only RCU prevents the queue memory from being freed.
*/
rcu_read_lock();
- io_poll_remove_entry(poll);
- if (poll_double)
- io_poll_remove_entry(poll_double);
+ if (req->flags & REQ_F_SINGLE_POLL)
+ io_poll_remove_entry(io_poll_get_single(req));
+ if (req->flags & REQ_F_DOUBLE_POLL)
+ io_poll_remove_entry(io_poll_get_double(req));
rcu_read_unlock();
}
@@ -5527,13 +5925,13 @@ static int io_poll_check_events(struct io_kiocb *req)
return -ECANCELED;
if (!req->result) {
- struct poll_table_struct pt = { ._key = poll->events };
+ struct poll_table_struct pt = { ._key = req->cflags };
- req->result = vfs_poll(req->file, &pt) & poll->events;
+ req->result = vfs_poll(req->file, &pt) & req->cflags;
}
/* multishot, just fill an CQE and proceed */
- if (req->result && !(poll->events & EPOLLONESHOT)) {
+ if (req->result && !(req->cflags & EPOLLONESHOT)) {
__poll_t mask = mangle_poll(req->result & poll->events);
bool filled;
@@ -5545,6 +5943,7 @@ static int io_poll_check_events(struct io_kiocb *req)
if (unlikely(!filled))
return -ECANCELED;
io_cqring_ev_posted(ctx);
+ io_add_napi(req->file, ctx);
} else if (req->result) {
return 0;
}
@@ -5603,29 +6002,36 @@ static void io_apoll_task_func(struct io_kiocb *req, bool *locked)
io_req_complete_failed(req, ret);
}
-static void __io_poll_execute(struct io_kiocb *req, int mask)
+static void __io_poll_execute(struct io_kiocb *req, int mask, int events)
{
req->result = mask;
+ /*
+ * This is useful for poll that is armed on behalf of another
+ * request, and where the wakeup path could be on a different
+ * CPU. We want to avoid pulling in req->apoll->events for that
+ * case.
+ */
+ req->cflags = events;
if (req->opcode == IORING_OP_POLL_ADD)
req->io_task_work.func = io_poll_task_func;
else
req->io_task_work.func = io_apoll_task_func;
- trace_io_uring_task_add(req->ctx, req->opcode, req->user_data, mask);
+ trace_io_uring_task_add(req->ctx, req, req->user_data, req->opcode, mask);
io_req_task_work_add(req, false);
}
-static inline void io_poll_execute(struct io_kiocb *req, int res)
+static inline void io_poll_execute(struct io_kiocb *req, int res, int events)
{
if (io_poll_get_ownership(req))
- __io_poll_execute(req, res);
+ __io_poll_execute(req, res, events);
}
static void io_poll_cancel_req(struct io_kiocb *req)
{
io_poll_mark_cancelled(req);
/* kick tw, which should complete the request */
- io_poll_execute(req, 0);
+ io_poll_execute(req, 0, 0);
}
static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
@@ -5639,7 +6045,7 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
if (unlikely(mask & POLLFREE)) {
io_poll_mark_cancelled(req);
/* we have to kick tw in case it's not already */
- io_poll_execute(req, 0);
+ io_poll_execute(req, 0, poll->events);
/*
* If the waitqueue is being freed early but someone is already
@@ -5669,8 +6075,9 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
if (mask && poll->events & EPOLLONESHOT) {
list_del_init(&poll->wait.entry);
poll->head = NULL;
+ req->flags &= ~REQ_F_SINGLE_POLL;
}
- __io_poll_execute(req, mask);
+ __io_poll_execute(req, mask, poll->events);
}
return 1;
}
@@ -5705,12 +6112,14 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
pt->error = -ENOMEM;
return;
}
+ req->flags |= REQ_F_DOUBLE_POLL;
io_init_poll_iocb(poll, first->events, first->wait.func);
*poll_ptr = poll;
if (req->opcode == IORING_OP_POLL_ADD)
req->flags |= REQ_F_ASYNC_DATA;
}
+ req->flags |= REQ_F_SINGLE_POLL;
pt->nr_entries++;
poll->head = head;
poll->wait.private = req;
@@ -5774,9 +6183,10 @@ static int __io_arm_poll_handler(struct io_kiocb *req,
/* can't multishot if failed, just queue the event we've got */
if (unlikely(ipt->error || !ipt->nr_entries))
poll->events |= EPOLLONESHOT;
- __io_poll_execute(req, mask);
+ __io_poll_execute(req, mask, poll->events);
return 0;
}
+ io_add_napi(req->file, req->ctx);
/*
* Release ownership. If someone tried to queue a tw while it was
@@ -5784,7 +6194,7 @@ static int __io_arm_poll_handler(struct io_kiocb *req,
*/
v = atomic_dec_return(&req->poll_refs);
if (unlikely(v & IO_POLL_REF_MASK))
- __io_poll_execute(req, 0);
+ __io_poll_execute(req, 0, poll->events);
return 0;
}
@@ -5803,7 +6213,7 @@ enum {
IO_APOLL_READY
};
-static int io_arm_poll_handler(struct io_kiocb *req)
+static int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags)
{
const struct io_op_def *def = &io_op_defs[req->opcode];
struct io_ring_ctx *ctx = req->ctx;
@@ -5828,9 +6238,16 @@ static int io_arm_poll_handler(struct io_kiocb *req)
mask |= POLLOUT | POLLWRNORM;
}
- apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
- if (unlikely(!apoll))
- return IO_APOLL_ABORTED;
+ if (!(issue_flags & IO_URING_F_UNLOCKED) &&
+ !list_empty(&ctx->apoll_cache)) {
+ apoll = list_first_entry(&ctx->apoll_cache, struct async_poll,
+ poll.wait.entry);
+ list_del_init(&apoll->poll.wait.entry);
+ } else {
+ apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
+ if (unlikely(!apoll))
+ return IO_APOLL_ABORTED;
+ }
apoll->double_poll = NULL;
req->apoll = apoll;
req->flags |= REQ_F_POLLED;
@@ -5840,7 +6257,7 @@ static int io_arm_poll_handler(struct io_kiocb *req)
if (ret || ipt.error)
return ret ? IO_APOLL_READY : IO_APOLL_ABORTED;
- trace_io_uring_poll_arm(ctx, req, req->opcode, req->user_data,
+ trace_io_uring_poll_arm(ctx, req, req->user_data, req->opcode,
mask, apoll->poll.events);
return IO_APOLL_OK;
}
@@ -5975,7 +6392,7 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
return -EINVAL;
io_req_set_refcount(req);
- poll->events = io_poll_parse_events(sqe, flags);
+ req->cflags = poll->events = io_poll_parse_events(sqe, flags);
return 0;
}
@@ -6092,10 +6509,7 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
if (IS_ERR(req))
return PTR_ERR(req);
-
- req_set_fail(req);
- io_fill_cqe_req(req, -ECANCELED, 0);
- io_put_req_deferred(req);
+ io_req_task_queue_fail(req, -ECANCELED);
return 0;
}
@@ -6568,6 +6982,8 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return io_symlinkat_prep(req, sqe);
case IORING_OP_LINKAT:
return io_linkat_prep(req, sqe);
+ case IORING_OP_MSG_RING:
+ return io_msg_ring_prep(req, sqe);
}
printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
@@ -6649,7 +7065,7 @@ fail:
goto queue;
}
- trace_io_uring_defer(ctx, req, req->user_data);
+ trace_io_uring_defer(ctx, req, req->user_data, req->opcode);
de->req = req;
de->seq = seq;
list_add_tail(&de->list, &ctx->defer_list);
@@ -6659,7 +7075,7 @@ fail:
static void io_clean_op(struct io_kiocb *req)
{
if (req->flags & REQ_F_BUFFER_SELECTED)
- io_put_kbuf(req);
+ io_put_kbuf_comp(req);
if (req->flags & REQ_F_NEED_CLEANUP) {
switch (req->opcode) {
@@ -6709,6 +7125,10 @@ static void io_clean_op(struct io_kiocb *req)
putname(req->hardlink.oldpath);
putname(req->hardlink.newpath);
break;
+ case IORING_OP_STATX:
+ if (req->statx.filename)
+ putname(req->statx.filename);
+ break;
}
}
if ((req->flags & REQ_F_POLLED) && req->apoll) {
@@ -6851,6 +7271,9 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
case IORING_OP_LINKAT:
ret = io_linkat(req, issue_flags);
break;
+ case IORING_OP_MSG_RING:
+ ret = io_msg_ring(req, issue_flags);
+ break;
default:
ret = -EINVAL;
break;
@@ -6926,7 +7349,7 @@ static void io_wq_submit_work(struct io_wq_work *work)
continue;
}
- if (io_arm_poll_handler(req) == IO_APOLL_OK)
+ if (io_arm_poll_handler(req, issue_flags) == IO_APOLL_OK)
return;
/* aborted or ready, in either case retry blocking */
needs_poll = false;
@@ -6983,7 +7406,7 @@ static struct file *io_file_get_normal(struct io_ring_ctx *ctx,
{
struct file *file = fget(fd);
- trace_io_uring_file_get(ctx, fd);
+ trace_io_uring_file_get(ctx, req, req->user_data, fd);
/* we don't allow fixed io_uring files */
if (file && unlikely(file->f_op == &io_uring_fops))
@@ -7072,7 +7495,7 @@ static void io_queue_sqe_arm_apoll(struct io_kiocb *req)
{
struct io_kiocb *linked_timeout = io_prep_linked_timeout(req);
- switch (io_arm_poll_handler(req)) {
+ switch (io_arm_poll_handler(req, 0)) {
case IO_APOLL_READY:
io_req_task_queue(req);
break;
@@ -7081,8 +7504,12 @@ static void io_queue_sqe_arm_apoll(struct io_kiocb *req)
* Queued up for async execution, worker will release
* submit reference when the iocb is actually submitted.
*/
+ io_kbuf_recycle(req);
io_queue_async_work(req, NULL);
break;
+ case IO_APOLL_OK:
+ io_kbuf_recycle(req);
+ break;
}
if (linked_timeout)
@@ -7281,7 +7708,7 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
ret = io_init_req(ctx, req, sqe);
if (unlikely(ret)) {
- trace_io_uring_req_failed(sqe, ret);
+ trace_io_uring_req_failed(sqe, ctx, req, ret);
/* fail even hard links since we don't submit */
if (link->head) {
@@ -7308,7 +7735,7 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
}
/* don't need @sqe from now on */
- trace_io_uring_submit_sqe(ctx, req, req->opcode, req->user_data,
+ trace_io_uring_submit_sqe(ctx, req, req->user_data, req->opcode,
req->flags, true,
ctx->flags & IORING_SETUP_SQPOLL);
@@ -7451,8 +7878,14 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
}
/* will complete beyond this point, count as submitted */
submitted++;
- if (io_submit_sqe(ctx, req, sqe))
- break;
+ if (io_submit_sqe(ctx, req, sqe)) {
+ /*
+ * Continue submitting even for sqe failure if the
+ * ring was setup with IORING_SETUP_SUBMIT_ALL
+ */
+ if (!(ctx->flags & IORING_SETUP_SUBMIT_ALL))
+ break;
+ }
} while (submitted < nr);
if (unlikely(submitted != nr)) {
@@ -7519,7 +7952,13 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
!(ctx->flags & IORING_SETUP_R_DISABLED))
ret = io_submit_sqes(ctx, to_submit);
mutex_unlock(&ctx->uring_lock);
-
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ spin_lock(&ctx->napi_lock);
+ if (!list_empty(&ctx->napi_list) &&
+ io_napi_busy_loop(&ctx->napi_list))
+ ++ret;
+ spin_unlock(&ctx->napi_lock);
+#endif
if (to_submit && wq_has_sleeper(&ctx->sqo_sq_wait))
wake_up(&ctx->sqo_sq_wait);
if (creds)
@@ -7650,6 +8089,9 @@ struct io_wait_queue {
struct io_ring_ctx *ctx;
unsigned cq_tail;
unsigned nr_timeouts;
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ unsigned busy_poll_to;
+#endif
};
static inline bool io_should_wake(struct io_wait_queue *iowq)
@@ -7684,17 +8126,17 @@ static int io_run_task_work_sig(void)
{
if (io_run_task_work())
return 1;
- if (!signal_pending(current))
- return 0;
if (test_thread_flag(TIF_NOTIFY_SIGNAL))
return -ERESTARTSYS;
- return -EINTR;
+ if (task_sigpending(current))
+ return -EINTR;
+ return 0;
}
/* when returns >0, the caller should retry */
static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
struct io_wait_queue *iowq,
- signed long *timeout)
+ ktime_t timeout)
{
int ret;
@@ -7706,10 +8148,92 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
if (test_bit(0, &ctx->check_cq_overflow))
return 1;
- *timeout = schedule_timeout(*timeout);
- return !*timeout ? -ETIME : 1;
+ if (!schedule_hrtimeout(&timeout, HRTIMER_MODE_ABS))
+ return -ETIME;
+ return 1;
+}
+
+#ifdef CONFIG_NET_RX_BUSY_POLL
+static void io_adjust_busy_loop_timeout(struct timespec64 *ts,
+ struct io_wait_queue *iowq)
+{
+ unsigned busy_poll_to = READ_ONCE(sysctl_net_busy_poll);
+ struct timespec64 pollto = ns_to_timespec64(1000 * (s64)busy_poll_to);
+
+ if (timespec64_compare(ts, &pollto) > 0) {
+ *ts = timespec64_sub(*ts, pollto);
+ iowq->busy_poll_to = busy_poll_to;
+ } else {
+ u64 to = timespec64_to_ns(ts);
+
+ do_div(to, 1000);
+ iowq->busy_poll_to = to;
+ ts->tv_sec = 0;
+ ts->tv_nsec = 0;
+ }
}
+static inline bool io_busy_loop_timeout(unsigned long start_time,
+ unsigned long bp_usec)
+{
+ if (bp_usec) {
+ unsigned long end_time = start_time + bp_usec;
+ unsigned long now = busy_loop_current_time();
+
+ return time_after(now, end_time);
+ }
+ return true;
+}
+
+static bool io_busy_loop_end(void *p, unsigned long start_time)
+{
+ struct io_wait_queue *iowq = p;
+
+ return signal_pending(current) ||
+ io_should_wake(iowq) ||
+ io_busy_loop_timeout(start_time, iowq->busy_poll_to);
+}
+
+static void io_blocking_napi_busy_loop(struct list_head *napi_list,
+ struct io_wait_queue *iowq)
+{
+ unsigned long start_time =
+ list_is_singular(napi_list) ? 0 :
+ busy_loop_current_time();
+
+ do {
+ if (list_is_singular(napi_list)) {
+ struct napi_entry *ne =
+ list_first_entry(napi_list,
+ struct napi_entry, list);
+
+ napi_busy_loop(ne->napi_id, io_busy_loop_end, iowq,
+ true, BUSY_POLL_BUDGET);
+ io_check_napi_entry_timeout(ne);
+ break;
+ }
+ } while (io_napi_busy_loop(napi_list) &&
+ !io_busy_loop_end(iowq, start_time));
+}
+
+static void io_putback_napi_list(struct io_ring_ctx *ctx,
+ struct list_head *napi_list)
+{
+ struct napi_entry *cne, *lne;
+
+ spin_lock(&ctx->napi_lock);
+ list_for_each_entry(cne, &ctx->napi_list, list)
+ list_for_each_entry(lne, napi_list, list)
+ if (cne->napi_id == lne->napi_id) {
+ list_del(&lne->list);
+ kfree(lne);
+ break;
+ }
+ list_splice(napi_list, &ctx->napi_list);
+ spin_unlock(&ctx->napi_lock);
+}
+#endif /* CONFIG_NET_RX_BUSY_POLL */
+
/*
* Wait until events become available, if we don't already have some. The
* application must reap them itself, as they reside on the shared cq ring.
@@ -7720,8 +8244,11 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
{
struct io_wait_queue iowq;
struct io_rings *rings = ctx->rings;
- signed long timeout = MAX_SCHEDULE_TIMEOUT;
+ ktime_t timeout = KTIME_MAX;
int ret;
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ LIST_HEAD(local_napi_list);
+#endif
do {
io_cqring_overflow_flush(ctx);
@@ -7731,14 +8258,6 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
break;
} while (1);
- if (uts) {
- struct timespec64 ts;
-
- if (get_timespec64(&ts, uts))
- return -EFAULT;
- timeout = timespec64_to_jiffies(&ts);
- }
-
if (sig) {
#ifdef CONFIG_COMPAT
if (in_compat_syscall())
@@ -7752,6 +8271,30 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
return ret;
}
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ iowq.busy_poll_to = 0;
+ if (!(ctx->flags & IORING_SETUP_SQPOLL)) {
+ spin_lock(&ctx->napi_lock);
+ list_splice_init(&ctx->napi_list, &local_napi_list);
+ spin_unlock(&ctx->napi_lock);
+ }
+#endif
+ if (uts) {
+ struct timespec64 ts;
+
+ if (get_timespec64(&ts, uts))
+ return -EFAULT;
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ if (!list_empty(&local_napi_list))
+ io_adjust_busy_loop_timeout(&ts, &iowq);
+#endif
+ timeout = ktime_add_ns(timespec64_to_ktime(ts), ktime_get_ns());
+ }
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ else if (!list_empty(&local_napi_list))
+ iowq.busy_poll_to = READ_ONCE(sysctl_net_busy_poll);
+#endif
+
init_waitqueue_func_entry(&iowq.wq, io_wake_function);
iowq.wq.private = current;
INIT_LIST_HEAD(&iowq.wq.entry);
@@ -7760,6 +8303,12 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events;
trace_io_uring_cqring_wait(ctx, min_events);
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ if (iowq.busy_poll_to)
+ io_blocking_napi_busy_loop(&local_napi_list, &iowq);
+ if (!list_empty(&local_napi_list))
+ io_putback_napi_list(ctx, &local_napi_list);
+#endif
do {
/* if we can't even flush overflow, don't wait for more */
if (!io_cqring_overflow_flush(ctx)) {
@@ -7768,7 +8317,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
}
prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq,
TASK_INTERRUPTIBLE);
- ret = io_cqring_wait_schedule(ctx, &iowq, &timeout);
+ ret = io_cqring_wait_schedule(ctx, &iowq, timeout);
finish_wait(&ctx->cq_wait, &iowq.wq);
cond_resched();
} while (ret > 0);
@@ -7925,7 +8474,15 @@ static __cold int io_rsrc_ref_quiesce(struct io_rsrc_data *data,
ret = wait_for_completion_interruptible(&data->done);
if (!ret) {
mutex_lock(&ctx->uring_lock);
- break;
+ if (atomic_read(&data->refs) > 0) {
+ /*
+ * it has been revived by another thread while
+ * we were unlocked
+ */
+ mutex_unlock(&ctx->uring_lock);
+ } else {
+ break;
+ }
}
atomic_inc(&data->refs);
@@ -8740,8 +9297,16 @@ static __cold int io_uring_alloc_task_context(struct task_struct *task,
if (unlikely(!tctx))
return -ENOMEM;
+ tctx->registered_rings = kcalloc(IO_RINGFD_REG_MAX,
+ sizeof(struct file *), GFP_KERNEL);
+ if (unlikely(!tctx->registered_rings)) {
+ kfree(tctx);
+ return -ENOMEM;
+ }
+
ret = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL);
if (unlikely(ret)) {
+ kfree(tctx->registered_rings);
kfree(tctx);
return ret;
}
@@ -8750,6 +9315,7 @@ static __cold int io_uring_alloc_task_context(struct task_struct *task,
if (IS_ERR(tctx->io_wq)) {
ret = PTR_ERR(tctx->io_wq);
percpu_counter_destroy(&tctx->inflight);
+ kfree(tctx->registered_rings);
kfree(tctx);
return ret;
}
@@ -8774,6 +9340,7 @@ void __io_uring_free(struct task_struct *tsk)
WARN_ON_ONCE(tctx->io_wq);
WARN_ON_ONCE(tctx->cached_refs);
+ kfree(tctx->registered_rings);
percpu_counter_destroy(&tctx->inflight);
kfree(tctx);
tsk->io_uring = NULL;
@@ -8933,10 +9500,9 @@ static void io_mem_free(void *ptr)
static void *io_mem_alloc(size_t size)
{
- gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP |
- __GFP_NORETRY | __GFP_ACCOUNT;
+ gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP;
- return (void *) __get_free_pages(gfp_flags, get_order(size));
+ return (void *) __get_free_pages(gfp, get_order(size));
}
static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
@@ -9351,33 +9917,55 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
return done ? done : err;
}
-static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg)
+static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
+ unsigned int eventfd_async)
{
+ struct io_ev_fd *ev_fd;
__s32 __user *fds = arg;
int fd;
- if (ctx->cq_ev_fd)
+ ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
+ lockdep_is_held(&ctx->uring_lock));
+ if (ev_fd)
return -EBUSY;
if (copy_from_user(&fd, fds, sizeof(*fds)))
return -EFAULT;
- ctx->cq_ev_fd = eventfd_ctx_fdget(fd);
- if (IS_ERR(ctx->cq_ev_fd)) {
- int ret = PTR_ERR(ctx->cq_ev_fd);
+ ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL);
+ if (!ev_fd)
+ return -ENOMEM;
- ctx->cq_ev_fd = NULL;
+ ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd);
+ if (IS_ERR(ev_fd->cq_ev_fd)) {
+ int ret = PTR_ERR(ev_fd->cq_ev_fd);
+ kfree(ev_fd);
return ret;
}
-
+ ev_fd->eventfd_async = eventfd_async;
+ ctx->has_evfd = true;
+ rcu_assign_pointer(ctx->io_ev_fd, ev_fd);
return 0;
}
+static void io_eventfd_put(struct rcu_head *rcu)
+{
+ struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu);
+
+ eventfd_ctx_put(ev_fd->cq_ev_fd);
+ kfree(ev_fd);
+}
+
static int io_eventfd_unregister(struct io_ring_ctx *ctx)
{
- if (ctx->cq_ev_fd) {
- eventfd_ctx_put(ctx->cq_ev_fd);
- ctx->cq_ev_fd = NULL;
+ struct io_ev_fd *ev_fd;
+
+ ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
+ lockdep_is_held(&ctx->uring_lock));
+ if (ev_fd) {
+ ctx->has_evfd = false;
+ rcu_assign_pointer(ctx->io_ev_fd, NULL);
+ call_rcu(&ev_fd->rcu, io_eventfd_put);
return 0;
}
@@ -9386,11 +9974,28 @@ static int io_eventfd_unregister(struct io_ring_ctx *ctx)
static void io_destroy_buffers(struct io_ring_ctx *ctx)
{
- struct io_buffer *buf;
- unsigned long index;
+ int i;
+
+ for (i = 0; i < (1U << IO_BUFFERS_HASH_BITS); i++) {
+ struct list_head *list = &ctx->io_buffers[i];
+
+ while (!list_empty(list)) {
+ struct io_buffer_list *bl;
+
+ bl = list_first_entry(list, struct io_buffer_list, list);
+ __io_remove_buffers(ctx, bl, -1U);
+ list_del(&bl->list);
+ kfree(bl);
+ }
+ }
+
+ while (!list_empty(&ctx->io_buffers_pages)) {
+ struct page *page;
- xa_for_each(&ctx->io_buffers, index, buf)
- __io_remove_buffers(ctx, buf, index, -1U);
+ page = list_first_entry(&ctx->io_buffers_pages, struct page, lru);
+ list_del_init(&page->lru);
+ __free_page(page);
+ }
}
static void io_req_caches_free(struct io_ring_ctx *ctx)
@@ -9421,6 +10026,18 @@ static void io_wait_rsrc_data(struct io_rsrc_data *data)
wait_for_completion(&data->done);
}
+static void io_flush_apoll_cache(struct io_ring_ctx *ctx)
+{
+ struct async_poll *apoll;
+
+ while (!list_empty(&ctx->apoll_cache)) {
+ apoll = list_first_entry(&ctx->apoll_cache, struct async_poll,
+ poll.wait.entry);
+ list_del(&apoll->poll.wait.entry);
+ kfree(apoll);
+ }
+}
+
static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
{
io_sq_thread_finish(ctx);
@@ -9442,8 +10059,9 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
__io_sqe_files_unregister(ctx);
if (ctx->rings)
__io_cqring_overflow_flush(ctx, true);
- mutex_unlock(&ctx->uring_lock);
io_eventfd_unregister(ctx);
+ io_flush_apoll_cache(ctx);
+ mutex_unlock(&ctx->uring_lock);
io_destroy_buffers(ctx);
if (ctx->sq_creds)
put_cred(ctx->sq_creds);
@@ -9475,8 +10093,10 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
io_req_caches_free(ctx);
if (ctx->hash_map)
io_wq_put_hash(ctx->hash_map);
+ io_free_napi_list(ctx);
kfree(ctx->cancel_hash);
kfree(ctx->dummy_ubuf);
+ kfree(ctx->io_buffers);
kfree(ctx);
}
@@ -9975,6 +10595,139 @@ void __io_uring_cancel(bool cancel_all)
io_uring_cancel_generic(cancel_all, NULL);
}
+void io_uring_unreg_ringfd(void)
+{
+ struct io_uring_task *tctx = current->io_uring;
+ int i;
+
+ for (i = 0; i < IO_RINGFD_REG_MAX; i++) {
+ if (tctx->registered_rings[i]) {
+ fput(tctx->registered_rings[i]);
+ tctx->registered_rings[i] = NULL;
+ }
+ }
+}
+
+static int io_ring_add_registered_fd(struct io_uring_task *tctx, int fd,
+ int start, int end)
+{
+ struct file *file;
+ int offset;
+
+ for (offset = start; offset < end; offset++) {
+ offset = array_index_nospec(offset, IO_RINGFD_REG_MAX);
+ if (tctx->registered_rings[offset])
+ continue;
+
+ file = fget(fd);
+ if (!file) {
+ return -EBADF;
+ } else if (file->f_op != &io_uring_fops) {
+ fput(file);
+ return -EOPNOTSUPP;
+ }
+ tctx->registered_rings[offset] = file;
+ return offset;
+ }
+
+ return -EBUSY;
+}
+
+/*
+ * Register a ring fd to avoid fdget/fdput for each io_uring_enter()
+ * invocation. User passes in an array of struct io_uring_rsrc_update
+ * with ->data set to the ring_fd, and ->offset given for the desired
+ * index. If no index is desired, application may set ->offset == -1U
+ * and we'll find an available index. Returns number of entries
+ * successfully processed, or < 0 on error if none were processed.
+ */
+static int io_ringfd_register(struct io_ring_ctx *ctx, void __user *__arg,
+ unsigned nr_args)
+{
+ struct io_uring_rsrc_update __user *arg = __arg;
+ struct io_uring_rsrc_update reg;
+ struct io_uring_task *tctx;
+ int ret, i;
+
+ if (!nr_args || nr_args > IO_RINGFD_REG_MAX)
+ return -EINVAL;
+
+ mutex_unlock(&ctx->uring_lock);
+ ret = io_uring_add_tctx_node(ctx);
+ mutex_lock(&ctx->uring_lock);
+ if (ret)
+ return ret;
+
+ tctx = current->io_uring;
+ for (i = 0; i < nr_args; i++) {
+ int start, end;
+
+ if (copy_from_user(&reg, &arg[i], sizeof(reg))) {
+ ret = -EFAULT;
+ break;
+ }
+
+ if (reg.offset == -1U) {
+ start = 0;
+ end = IO_RINGFD_REG_MAX;
+ } else {
+ if (reg.offset >= IO_RINGFD_REG_MAX) {
+ ret = -EINVAL;
+ break;
+ }
+ start = reg.offset;
+ end = start + 1;
+ }
+
+ ret = io_ring_add_registered_fd(tctx, reg.data, start, end);
+ if (ret < 0)
+ break;
+
+ reg.offset = ret;
+ if (copy_to_user(&arg[i], &reg, sizeof(reg))) {
+ fput(tctx->registered_rings[reg.offset]);
+ tctx->registered_rings[reg.offset] = NULL;
+ ret = -EFAULT;
+ break;
+ }
+ }
+
+ return i ? i : ret;
+}
+
+static int io_ringfd_unregister(struct io_ring_ctx *ctx, void __user *__arg,
+ unsigned nr_args)
+{
+ struct io_uring_rsrc_update __user *arg = __arg;
+ struct io_uring_task *tctx = current->io_uring;
+ struct io_uring_rsrc_update reg;
+ int ret = 0, i;
+
+ if (!nr_args || nr_args > IO_RINGFD_REG_MAX)
+ return -EINVAL;
+ if (!tctx)
+ return 0;
+
+ for (i = 0; i < nr_args; i++) {
+ if (copy_from_user(&reg, &arg[i], sizeof(reg))) {
+ ret = -EFAULT;
+ break;
+ }
+ if (reg.offset >= IO_RINGFD_REG_MAX) {
+ ret = -EINVAL;
+ break;
+ }
+
+ reg.offset = array_index_nospec(reg.offset, IO_RINGFD_REG_MAX);
+ if (tctx->registered_rings[reg.offset]) {
+ fput(tctx->registered_rings[reg.offset]);
+ tctx->registered_rings[reg.offset] = NULL;
+ }
+ }
+
+ return i ? i : ret;
+}
+
static void *io_uring_validate_mmap_request(struct file *file,
loff_t pgoff, size_t sz)
{
@@ -10105,12 +10858,28 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
io_run_task_work();
if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
- IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG)))
+ IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG |
+ IORING_ENTER_REGISTERED_RING)))
return -EINVAL;
- f = fdget(fd);
- if (unlikely(!f.file))
- return -EBADF;
+ /*
+ * Ring fd has been registered via IORING_REGISTER_RING_FDS, we
+ * need only dereference our task private array to find it.
+ */
+ if (flags & IORING_ENTER_REGISTERED_RING) {
+ struct io_uring_task *tctx = current->io_uring;
+
+ if (!tctx || fd >= IO_RINGFD_REG_MAX)
+ return -EINVAL;
+ fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
+ f.file = tctx->registered_rings[fd];
+ if (unlikely(!f.file))
+ return -EBADF;
+ } else {
+ f = fdget(fd);
+ if (unlikely(!f.file))
+ return -EBADF;
+ }
ret = -EOPNOTSUPP;
if (unlikely(f.file->f_op != &io_uring_fops))
@@ -10184,7 +10953,8 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
out:
percpu_ref_put(&ctx->refs);
out_fput:
- fdput(f);
+ if (!(flags & IORING_ENTER_REGISTERED_RING))
+ fdput(f);
return submitted ? submitted : ret;
}
@@ -10602,7 +11372,7 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ |
- IORING_SETUP_R_DISABLED))
+ IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL))
return -EINVAL;
return io_uring_create(entries, &p, params);
@@ -10952,61 +11722,6 @@ err:
return ret;
}
-static bool io_register_op_must_quiesce(int op)
-{
- switch (op) {
- case IORING_REGISTER_BUFFERS:
- case IORING_UNREGISTER_BUFFERS:
- case IORING_REGISTER_FILES:
- case IORING_UNREGISTER_FILES:
- case IORING_REGISTER_FILES_UPDATE:
- case IORING_REGISTER_PROBE:
- case IORING_REGISTER_PERSONALITY:
- case IORING_UNREGISTER_PERSONALITY:
- case IORING_REGISTER_FILES2:
- case IORING_REGISTER_FILES_UPDATE2:
- case IORING_REGISTER_BUFFERS2:
- case IORING_REGISTER_BUFFERS_UPDATE:
- case IORING_REGISTER_IOWQ_AFF:
- case IORING_UNREGISTER_IOWQ_AFF:
- case IORING_REGISTER_IOWQ_MAX_WORKERS:
- return false;
- default:
- return true;
- }
-}
-
-static __cold int io_ctx_quiesce(struct io_ring_ctx *ctx)
-{
- long ret;
-
- percpu_ref_kill(&ctx->refs);
-
- /*
- * Drop uring mutex before waiting for references to exit. If another
- * thread is currently inside io_uring_enter() it might need to grab the
- * uring_lock to make progress. If we hold it here across the drain
- * wait, then we can deadlock. It's safe to drop the mutex here, since
- * no new references will come in after we've killed the percpu ref.
- */
- mutex_unlock(&ctx->uring_lock);
- do {
- ret = wait_for_completion_interruptible_timeout(&ctx->ref_comp, HZ);
- if (ret) {
- ret = min(0L, ret);
- break;
- }
-
- ret = io_run_task_work_sig();
- io_req_caches_free(ctx);
- } while (ret >= 0);
- mutex_lock(&ctx->uring_lock);
-
- if (ret)
- io_refs_resurrect(&ctx->refs, &ctx->ref_comp);
- return ret;
-}
-
static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
void __user *arg, unsigned nr_args)
__releases(ctx->uring_lock)
@@ -11030,12 +11745,6 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
return -EACCES;
}
- if (io_register_op_must_quiesce(opcode)) {
- ret = io_ctx_quiesce(ctx);
- if (ret)
- return ret;
- }
-
switch (opcode) {
case IORING_REGISTER_BUFFERS:
ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
@@ -11059,17 +11768,16 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
ret = io_register_files_update(ctx, arg, nr_args);
break;
case IORING_REGISTER_EVENTFD:
- case IORING_REGISTER_EVENTFD_ASYNC:
ret = -EINVAL;
if (nr_args != 1)
break;
- ret = io_eventfd_register(ctx, arg);
- if (ret)
+ ret = io_eventfd_register(ctx, arg, 0);
+ break;
+ case IORING_REGISTER_EVENTFD_ASYNC:
+ ret = -EINVAL;
+ if (nr_args != 1)
break;
- if (opcode == IORING_REGISTER_EVENTFD_ASYNC)
- ctx->eventfd_async = 1;
- else
- ctx->eventfd_async = 0;
+ ret = io_eventfd_register(ctx, arg, 1);
break;
case IORING_UNREGISTER_EVENTFD:
ret = -EINVAL;
@@ -11136,16 +11844,17 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
break;
ret = io_register_iowq_max_workers(ctx, arg);
break;
+ case IORING_REGISTER_RING_FDS:
+ ret = io_ringfd_register(ctx, arg, nr_args);
+ break;
+ case IORING_UNREGISTER_RING_FDS:
+ ret = io_ringfd_unregister(ctx, arg, nr_args);
+ break;
default:
ret = -EINVAL;
break;
}
- if (io_register_op_must_quiesce(opcode)) {
- /* bring the ctx back to life */
- percpu_ref_reinit(&ctx->refs);
- reinit_completion(&ctx->ref_comp);
- }
return ret;
}
@@ -11171,8 +11880,7 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
mutex_lock(&ctx->uring_lock);
ret = __io_uring_register(ctx, opcode, arg, nr_args);
mutex_unlock(&ctx->uring_lock);
- trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs,
- ctx->cq_ev_fd != NULL, ret);
+ trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, ret);
out_fput:
fdput(f);
return ret;
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 1ed097e94af2..090bf47606ab 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -236,9 +236,6 @@ static long ioctl_file_clone(struct file *dst_file, unsigned long srcfd,
if (!src_file.file)
return -EBADF;
- ret = -EXDEV;
- if (src_file.file->f_path.mnt != dst_file->f_path.mnt)
- goto fdput;
cloned = vfs_clone_file_range(src_file.file, off, dst_file, destoff,
olen, 0);
if (cloned < 0)
@@ -247,7 +244,6 @@ static long ioctl_file_clone(struct file *dst_file, unsigned long srcfd,
ret = -EINVAL;
else
ret = 0;
-fdput:
fdput(src_file);
return ret;
}
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 491534e90861..d5fbd860c8cd 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -21,6 +21,8 @@
#include "../internal.h"
+#define IOEND_BATCH_SIZE 4096
+
/*
* Structure allocated for each folio when block size < folio size
* to track sub-folio uptodate status and I/O completions.
@@ -423,37 +425,33 @@ void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops)
EXPORT_SYMBOL_GPL(iomap_readahead);
/*
- * iomap_is_partially_uptodate checks whether blocks within a page are
+ * iomap_is_partially_uptodate checks whether blocks within a folio are
* uptodate or not.
*
- * Returns true if all blocks which correspond to a file portion
- * we want to read within the page are uptodate.
+ * Returns true if all blocks which correspond to the specified part
+ * of the folio are uptodate.
*/
-int
-iomap_is_partially_uptodate(struct page *page, unsigned long from,
- unsigned long count)
+bool iomap_is_partially_uptodate(struct folio *folio, size_t from, size_t count)
{
- struct folio *folio = page_folio(page);
struct iomap_page *iop = to_iomap_page(folio);
- struct inode *inode = page->mapping->host;
- unsigned len, first, last;
- unsigned i;
+ struct inode *inode = folio->mapping->host;
+ size_t len;
+ unsigned first, last, i;
- /* Limit range to one page */
- len = min_t(unsigned, PAGE_SIZE - from, count);
+ if (!iop)
+ return false;
+
+ /* Limit range to this folio */
+ len = min(folio_size(folio) - from, count);
/* First and last blocks in range within page */
first = from >> inode->i_blkbits;
last = (from + len - 1) >> inode->i_blkbits;
- if (iop) {
- for (i = first; i <= last; i++)
- if (!test_bit(i, iop->uptodate))
- return 0;
- return 1;
- }
-
- return 0;
+ for (i = first; i <= last; i++)
+ if (!test_bit(i, iop->uptodate))
+ return false;
+ return true;
}
EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate);
@@ -479,7 +477,8 @@ EXPORT_SYMBOL_GPL(iomap_releasepage);
void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len)
{
- trace_iomap_invalidatepage(folio->mapping->host, offset, len);
+ trace_iomap_invalidate_folio(folio->mapping->host,
+ folio_pos(folio) + offset, len);
/*
* If we're invalidating the entire folio, clear the dirty state
@@ -498,13 +497,6 @@ void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len)
}
EXPORT_SYMBOL_GPL(iomap_invalidate_folio);
-void iomap_invalidatepage(struct page *page, unsigned int offset,
- unsigned int len)
-{
- iomap_invalidate_folio(page_folio(page), offset, len);
-}
-EXPORT_SYMBOL_GPL(iomap_invalidatepage);
-
#ifdef CONFIG_MIGRATION
int
iomap_migrate_page(struct address_space *mapping, struct page *newpage,
@@ -1038,7 +1030,7 @@ static void iomap_finish_folio_write(struct inode *inode, struct folio *folio,
* state, release holds on bios, and finally free up memory. Do not use the
* ioend after this.
*/
-static void
+static u32
iomap_finish_ioend(struct iomap_ioend *ioend, int error)
{
struct inode *inode = ioend->io_inode;
@@ -1047,6 +1039,7 @@ iomap_finish_ioend(struct iomap_ioend *ioend, int error)
u64 start = bio->bi_iter.bi_sector;
loff_t offset = ioend->io_offset;
bool quiet = bio_flagged(bio, BIO_QUIET);
+ u32 folio_count = 0;
for (bio = &ioend->io_inline_bio; bio; bio = next) {
struct folio_iter fi;
@@ -1061,9 +1054,11 @@ iomap_finish_ioend(struct iomap_ioend *ioend, int error)
next = bio->bi_private;
/* walk all folios in bio, ending page IO on them */
- bio_for_each_folio_all(fi, bio)
+ bio_for_each_folio_all(fi, bio) {
iomap_finish_folio_write(inode, fi.folio, fi.length,
error);
+ folio_count++;
+ }
bio_put(bio);
}
/* The ioend has been freed by bio_put() */
@@ -1073,20 +1068,36 @@ iomap_finish_ioend(struct iomap_ioend *ioend, int error)
"%s: writeback error on inode %lu, offset %lld, sector %llu",
inode->i_sb->s_id, inode->i_ino, offset, start);
}
+ return folio_count;
}
+/*
+ * Ioend completion routine for merged bios. This can only be called from task
+ * contexts as merged ioends can be of unbound length. Hence we have to break up
+ * the writeback completions into manageable chunks to avoid long scheduler
+ * holdoffs. We aim to keep scheduler holdoffs down below 10ms so that we get
+ * good batch processing throughput without creating adverse scheduler latency
+ * conditions.
+ */
void
iomap_finish_ioends(struct iomap_ioend *ioend, int error)
{
struct list_head tmp;
+ u32 completions;
+
+ might_sleep();
list_replace_init(&ioend->io_list, &tmp);
- iomap_finish_ioend(ioend, error);
+ completions = iomap_finish_ioend(ioend, error);
while (!list_empty(&tmp)) {
+ if (completions > IOEND_BATCH_SIZE * 8) {
+ cond_resched();
+ completions = 0;
+ }
ioend = list_first_entry(&tmp, struct iomap_ioend, io_list);
list_del_init(&ioend->io_list);
- iomap_finish_ioend(ioend, error);
+ completions += iomap_finish_ioend(ioend, error);
}
}
EXPORT_SYMBOL_GPL(iomap_finish_ioends);
@@ -1107,6 +1118,18 @@ iomap_ioend_can_merge(struct iomap_ioend *ioend, struct iomap_ioend *next)
return false;
if (ioend->io_offset + ioend->io_size != next->io_offset)
return false;
+ /*
+ * Do not merge physically discontiguous ioends. The filesystem
+ * completion functions will have to iterate the physical
+ * discontiguities even if we merge the ioends at a logical level, so
+ * we don't gain anything by merging physical discontiguities here.
+ *
+ * We cannot use bio->bi_iter.bi_sector here as it is modified during
+ * submission so does not point to the start sector of the bio at
+ * completion.
+ */
+ if (ioend->io_sector + (ioend->io_size >> 9) != next->io_sector)
+ return false;
return true;
}
@@ -1199,7 +1222,6 @@ iomap_alloc_ioend(struct inode *inode, struct iomap_writepage_ctx *wpc,
REQ_OP_WRITE | wbc_to_write_flags(wbc),
GFP_NOFS, &iomap_ioend_bioset);
bio->bi_iter.bi_sector = sector;
- bio->bi_write_hint = inode->i_write_hint;
wbc_init_bio(wbc, bio);
ioend = container_of(bio, struct iomap_ioend, io_inline_bio);
@@ -1208,8 +1230,10 @@ iomap_alloc_ioend(struct inode *inode, struct iomap_writepage_ctx *wpc,
ioend->io_flags = wpc->iomap.flags;
ioend->io_inode = inode;
ioend->io_size = 0;
+ ioend->io_folios = 0;
ioend->io_offset = offset;
ioend->io_bio = bio;
+ ioend->io_sector = sector;
return ioend;
}
@@ -1228,7 +1252,6 @@ iomap_chain_bio(struct bio *prev)
new = bio_alloc(prev->bi_bdev, BIO_MAX_VECS, prev->bi_opf, GFP_NOFS);
bio_clone_blkg_association(new, prev);
new->bi_iter.bi_sector = bio_end_sector(prev);
- new->bi_write_hint = prev->bi_write_hint;
bio_chain(prev, new);
bio_get(prev); /* for iomap_finish_ioend */
@@ -1249,6 +1272,13 @@ iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t offset,
return false;
if (sector != bio_end_sector(wpc->ioend->io_bio))
return false;
+ /*
+ * Limit ioend bio chain lengths to minimise IO completion latency. This
+ * also prevents long tight loops ending page writeback on all the
+ * folios in the ioend.
+ */
+ if (wpc->ioend->io_folios >= IOEND_BATCH_SIZE)
+ return false;
return true;
}
@@ -1333,6 +1363,8 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
&submit_list);
count++;
}
+ if (count)
+ wpc->ioend->io_folios++;
WARN_ON_ONCE(!wpc->ioend && !list_empty(&submit_list));
WARN_ON_ONCE(!folio_test_locked(folio));
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index e2ba13645ef2..b08f5dc31780 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -6,6 +6,7 @@
#include <linux/module.h>
#include <linux/compiler.h>
#include <linux/fs.h>
+#include <linux/fscrypt.h>
#include <linux/pagemap.h>
#include <linux/iomap.h>
#include <linux/backing-dev.h>
@@ -179,11 +180,14 @@ static void iomap_dio_bio_end_io(struct bio *bio)
static void iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
loff_t pos, unsigned len)
{
+ struct inode *inode = file_inode(dio->iocb->ki_filp);
struct page *page = ZERO_PAGE(0);
int flags = REQ_SYNC | REQ_IDLE;
struct bio *bio;
bio = bio_alloc(iter->iomap.bdev, 1, REQ_OP_WRITE | flags, GFP_KERNEL);
+ fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits,
+ GFP_KERNEL);
bio->bi_iter.bi_sector = iomap_sector(&iter->iomap, pos);
bio->bi_private = dio;
bio->bi_end_io = iomap_dio_bio_end_io;
@@ -308,8 +312,9 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
}
bio = bio_alloc(iomap->bdev, nr_pages, bio_opf, GFP_KERNEL);
+ fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits,
+ GFP_KERNEL);
bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
- bio->bi_write_hint = dio->iocb->ki_hint;
bio->bi_ioprio = dio->iocb->ki_ioprio;
bio->bi_private = dio;
bio->bi_end_io = iomap_dio_bio_end_io;
diff --git a/fs/iomap/fiemap.c b/fs/iomap/fiemap.c
index 66cf267c68ae..610ca6f1ec9b 100644
--- a/fs/iomap/fiemap.c
+++ b/fs/iomap/fiemap.c
@@ -7,6 +7,7 @@
#include <linux/fs.h>
#include <linux/iomap.h>
#include <linux/fiemap.h>
+#include <linux/pagemap.h>
static int iomap_to_fiemap(struct fiemap_extent_info *fi,
const struct iomap *iomap, u32 flags)
diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h
index 65e39785c284..a6689a563c6e 100644
--- a/fs/iomap/trace.h
+++ b/fs/iomap/trace.h
@@ -81,7 +81,7 @@ DEFINE_EVENT(iomap_range_class, name, \
TP_ARGS(inode, off, len))
DEFINE_RANGE_EVENT(iomap_writepage);
DEFINE_RANGE_EVENT(iomap_releasepage);
-DEFINE_RANGE_EVENT(iomap_invalidatepage);
+DEFINE_RANGE_EVENT(iomap_invalidate_folio);
DEFINE_RANGE_EVENT(iomap_dio_invalidate_fail);
#define IOMAP_TYPE_STRINGS \
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 0c6eacfcbeef..d7491692aea3 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -70,7 +70,7 @@ static struct kmem_cache *isofs_inode_cachep;
static struct inode *isofs_alloc_inode(struct super_block *sb)
{
struct iso_inode_info *ei;
- ei = kmem_cache_alloc(isofs_inode_cachep, GFP_KERNEL);
+ ei = alloc_inode_sb(sb, isofs_inode_cachep, GFP_KERNEL);
if (!ei)
return NULL;
return &ei->vfs_inode;
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 3cc4ab2ba7f4..5b9408e3b370 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -484,22 +484,9 @@ void jbd2_journal_commit_transaction(journal_t *journal)
stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
stats.run.rs_locked);
- spin_lock(&commit_transaction->t_handle_lock);
- while (atomic_read(&commit_transaction->t_updates)) {
- DEFINE_WAIT(wait);
+ // waits for any t_updates to finish
+ jbd2_journal_wait_updates(journal);
- prepare_to_wait(&journal->j_wait_updates, &wait,
- TASK_UNINTERRUPTIBLE);
- if (atomic_read(&commit_transaction->t_updates)) {
- spin_unlock(&commit_transaction->t_handle_lock);
- write_unlock(&journal->j_state_lock);
- schedule();
- write_lock(&journal->j_state_lock);
- spin_lock(&commit_transaction->t_handle_lock);
- }
- finish_wait(&journal->j_wait_updates, &wait);
- }
- spin_unlock(&commit_transaction->t_handle_lock);
commit_transaction->t_state = T_SWITCH;
write_unlock(&journal->j_state_lock);
@@ -817,7 +804,7 @@ start_journal_io:
commit_transaction->t_state = T_COMMIT_DFLUSH;
write_unlock(&journal->j_state_lock);
- /*
+ /*
* If the journal is not located on the file system device,
* then we must flush the file system device before we issue
* the commit record
@@ -1170,7 +1157,7 @@ restart_loop:
if (journal->j_commit_callback)
journal->j_commit_callback(journal, commit_transaction);
if (journal->j_fc_cleanup_callback)
- journal->j_fc_cleanup_callback(journal, 1);
+ journal->j_fc_cleanup_callback(journal, 1, commit_transaction->t_tid);
trace_jbd2_end_commit(journal, commit_transaction);
jbd_debug(1, "JBD2: commit %d complete, head %d\n",
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index bf108d4493d2..fcacafa4510d 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -86,7 +86,7 @@ EXPORT_SYMBOL(jbd2_journal_start_commit);
EXPORT_SYMBOL(jbd2_journal_force_commit_nested);
EXPORT_SYMBOL(jbd2_journal_wipe);
EXPORT_SYMBOL(jbd2_journal_blocks_per_page);
-EXPORT_SYMBOL(jbd2_journal_invalidatepage);
+EXPORT_SYMBOL(jbd2_journal_invalidate_folio);
EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers);
EXPORT_SYMBOL(jbd2_journal_force_commit);
EXPORT_SYMBOL(jbd2_journal_inode_ranged_write);
@@ -771,7 +771,7 @@ static int __jbd2_fc_end_commit(journal_t *journal, tid_t tid, bool fallback)
{
jbd2_journal_unlock_updates(journal);
if (journal->j_fc_cleanup_callback)
- journal->j_fc_cleanup_callback(journal, 0);
+ journal->j_fc_cleanup_callback(journal, 0, tid);
write_lock(&journal->j_state_lock);
journal->j_flags &= ~JBD2_FAST_COMMIT_ONGOING;
if (fallback)
@@ -1287,6 +1287,8 @@ static int jbd2_min_tag_size(void)
/**
* jbd2_journal_shrink_scan()
+ * @shrink: shrinker to work on
+ * @sc: reclaim request to process
*
* Scan the checkpointed buffer on the checkpoint list and release the
* journal_head.
@@ -1312,6 +1314,8 @@ static unsigned long jbd2_journal_shrink_scan(struct shrinker *shrink,
/**
* jbd2_journal_shrink_count()
+ * @shrink: shrinker to work on
+ * @sc: reclaim request to process
*
* Count the number of checkpoint buffers on the checkpoint list.
*/
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 6a3caedd2285..fcb9175016a5 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -107,7 +107,6 @@ static void jbd2_get_transaction(journal_t *journal,
transaction->t_start_time = ktime_get();
transaction->t_tid = journal->j_transaction_sequence++;
transaction->t_expires = jiffies + journal->j_commit_interval;
- spin_lock_init(&transaction->t_handle_lock);
atomic_set(&transaction->t_updates, 0);
atomic_set(&transaction->t_outstanding_credits,
jbd2_descriptor_blocks_per_trans(journal) +
@@ -139,26 +138,22 @@ static void jbd2_get_transaction(journal_t *journal,
/*
* Update transaction's maximum wait time, if debugging is enabled.
*
- * In order for t_max_wait to be reliable, it must be protected by a
- * lock. But doing so will mean that start_this_handle() can not be
- * run in parallel on SMP systems, which limits our scalability. So
- * unless debugging is enabled, we no longer update t_max_wait, which
- * means that maximum wait time reported by the jbd2_run_stats
- * tracepoint will always be zero.
+ * t_max_wait is carefully updated here with use of atomic compare exchange.
+ * Note that there could be multiplre threads trying to do this simultaneously
+ * hence using cmpxchg to avoid any use of locks in this case.
+ * With this t_max_wait can be updated w/o enabling jbd2_journal_enable_debug.
*/
static inline void update_t_max_wait(transaction_t *transaction,
unsigned long ts)
{
-#ifdef CONFIG_JBD2_DEBUG
- if (jbd2_journal_enable_debug &&
- time_after(transaction->t_start, ts)) {
- ts = jbd2_time_diff(ts, transaction->t_start);
- spin_lock(&transaction->t_handle_lock);
- if (ts > transaction->t_max_wait)
- transaction->t_max_wait = ts;
- spin_unlock(&transaction->t_handle_lock);
+ unsigned long oldts, newts;
+
+ if (time_after(transaction->t_start, ts)) {
+ newts = jbd2_time_diff(ts, transaction->t_start);
+ oldts = READ_ONCE(transaction->t_max_wait);
+ while (oldts < newts)
+ oldts = cmpxchg(&transaction->t_max_wait, oldts, newts);
}
-#endif
}
/*
@@ -449,7 +444,7 @@ repeat:
}
/* OK, account for the buffers that this operation expects to
- * use and add the handle to the running transaction.
+ * use and add the handle to the running transaction.
*/
update_t_max_wait(transaction, ts);
handle->h_transaction = transaction;
@@ -690,7 +685,6 @@ int jbd2_journal_extend(handle_t *handle, int nblocks, int revoke_records)
DIV_ROUND_UP(
handle->h_revoke_credits_requested,
journal->j_revoke_records_per_block);
- spin_lock(&transaction->t_handle_lock);
wanted = atomic_add_return(nblocks,
&transaction->t_outstanding_credits);
@@ -698,7 +692,7 @@ int jbd2_journal_extend(handle_t *handle, int nblocks, int revoke_records)
jbd_debug(3, "denied handle %p %d blocks: "
"transaction too large\n", handle, nblocks);
atomic_sub(nblocks, &transaction->t_outstanding_credits);
- goto unlock;
+ goto error_out;
}
trace_jbd2_handle_extend(journal->j_fs_dev->bd_dev,
@@ -714,8 +708,6 @@ int jbd2_journal_extend(handle_t *handle, int nblocks, int revoke_records)
result = 0;
jbd_debug(3, "extended handle %p by %d\n", handle, nblocks);
-unlock:
- spin_unlock(&transaction->t_handle_lock);
error_out:
read_unlock(&journal->j_state_lock);
return result;
@@ -836,6 +828,43 @@ int jbd2_journal_restart(handle_t *handle, int nblocks)
}
EXPORT_SYMBOL(jbd2_journal_restart);
+/*
+ * Waits for any outstanding t_updates to finish.
+ * This is called with write j_state_lock held.
+ */
+void jbd2_journal_wait_updates(journal_t *journal)
+{
+ DEFINE_WAIT(wait);
+
+ while (1) {
+ /*
+ * Note that the running transaction can get freed under us if
+ * this transaction is getting committed in
+ * jbd2_journal_commit_transaction() ->
+ * jbd2_journal_free_transaction(). This can only happen when we
+ * release j_state_lock -> schedule() -> acquire j_state_lock.
+ * Hence we should everytime retrieve new j_running_transaction
+ * value (after j_state_lock release acquire cycle), else it may
+ * lead to use-after-free of old freed transaction.
+ */
+ transaction_t *transaction = journal->j_running_transaction;
+
+ if (!transaction)
+ break;
+
+ prepare_to_wait(&journal->j_wait_updates, &wait,
+ TASK_UNINTERRUPTIBLE);
+ if (!atomic_read(&transaction->t_updates)) {
+ finish_wait(&journal->j_wait_updates, &wait);
+ break;
+ }
+ write_unlock(&journal->j_state_lock);
+ schedule();
+ finish_wait(&journal->j_wait_updates, &wait);
+ write_lock(&journal->j_state_lock);
+ }
+}
+
/**
* jbd2_journal_lock_updates () - establish a transaction barrier.
* @journal: Journal to establish a barrier on.
@@ -848,8 +877,6 @@ EXPORT_SYMBOL(jbd2_journal_restart);
*/
void jbd2_journal_lock_updates(journal_t *journal)
{
- DEFINE_WAIT(wait);
-
jbd2_might_wait_for_commit(journal);
write_lock(&journal->j_state_lock);
@@ -863,27 +890,9 @@ void jbd2_journal_lock_updates(journal_t *journal)
write_lock(&journal->j_state_lock);
}
- /* Wait until there are no running updates */
- while (1) {
- transaction_t *transaction = journal->j_running_transaction;
-
- if (!transaction)
- break;
+ /* Wait until there are no running t_updates */
+ jbd2_journal_wait_updates(journal);
- spin_lock(&transaction->t_handle_lock);
- prepare_to_wait(&journal->j_wait_updates, &wait,
- TASK_UNINTERRUPTIBLE);
- if (!atomic_read(&transaction->t_updates)) {
- spin_unlock(&transaction->t_handle_lock);
- finish_wait(&journal->j_wait_updates, &wait);
- break;
- }
- spin_unlock(&transaction->t_handle_lock);
- write_unlock(&journal->j_state_lock);
- schedule();
- finish_wait(&journal->j_wait_updates, &wait);
- write_lock(&journal->j_state_lock);
- }
write_unlock(&journal->j_state_lock);
/*
@@ -2208,14 +2217,14 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction)
}
/*
- * jbd2_journal_invalidatepage
+ * jbd2_journal_invalidate_folio
*
* This code is tricky. It has a number of cases to deal with.
*
* There are two invariants which this code relies on:
*
- * i_size must be updated on disk before we start calling invalidatepage on the
- * data.
+ * i_size must be updated on disk before we start calling invalidate_folio
+ * on the data.
*
* This is done in ext3 by defining an ext3_setattr method which
* updates i_size before truncate gets going. By maintaining this
@@ -2417,9 +2426,9 @@ zap_buffer_unlocked:
}
/**
- * jbd2_journal_invalidatepage()
+ * jbd2_journal_invalidate_folio()
* @journal: journal to use for flush...
- * @page: page to flush
+ * @folio: folio to flush
* @offset: start of the range to invalidate
* @length: length of the range to invalidate
*
@@ -2428,30 +2437,29 @@ zap_buffer_unlocked:
* the page is straddling i_size. Caller then has to wait for current commit
* and try again.
*/
-int jbd2_journal_invalidatepage(journal_t *journal,
- struct page *page,
- unsigned int offset,
- unsigned int length)
+int jbd2_journal_invalidate_folio(journal_t *journal, struct folio *folio,
+ size_t offset, size_t length)
{
struct buffer_head *head, *bh, *next;
unsigned int stop = offset + length;
unsigned int curr_off = 0;
- int partial_page = (offset || length < PAGE_SIZE);
+ int partial_page = (offset || length < folio_size(folio));
int may_free = 1;
int ret = 0;
- if (!PageLocked(page))
+ if (!folio_test_locked(folio))
BUG();
- if (!page_has_buffers(page))
+ head = folio_buffers(folio);
+ if (!head)
return 0;
- BUG_ON(stop > PAGE_SIZE || stop < length);
+ BUG_ON(stop > folio_size(folio) || stop < length);
/* We will potentially be playing with lists other than just the
* data lists (especially for journaled data mode), so be
* cautious in our locking. */
- head = bh = page_buffers(page);
+ bh = head;
do {
unsigned int next_off = curr_off + bh->b_size;
next = bh->b_this_page;
@@ -2474,8 +2482,8 @@ int jbd2_journal_invalidatepage(journal_t *journal,
} while (bh != head);
if (!partial_page) {
- if (may_free && try_to_free_buffers(page))
- J_ASSERT(!page_has_buffers(page));
+ if (may_free && try_to_free_buffers(&folio->page))
+ J_ASSERT(!folio_buffers(folio));
}
return 0;
}
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 81ca58c10b72..7ea37f49f1e1 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -39,7 +39,7 @@ static struct inode *jffs2_alloc_inode(struct super_block *sb)
{
struct jffs2_inode_info *f;
- f = kmem_cache_alloc(jffs2_inode_cachep, GFP_KERNEL);
+ f = alloc_inode_sb(sb, jffs2_inode_cachep, GFP_KERNEL);
if (!f)
return NULL;
return &f->vfs_inode;
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index 57ab424c05ff..27be2e8ba237 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -357,7 +357,8 @@ static ssize_t jfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
}
const struct address_space_operations jfs_aops = {
- .set_page_dirty = __set_page_dirty_buffers,
+ .dirty_folio = block_dirty_folio,
+ .invalidate_folio = block_invalidate_folio,
.readpage = jfs_readpage,
.readahead = jfs_readahead,
.writepage = jfs_writepage,
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index fde1a9cf902e..c4220ccdedef 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -552,22 +552,22 @@ static int metapage_releasepage(struct page *page, gfp_t gfp_mask)
return ret;
}
-static void metapage_invalidatepage(struct page *page, unsigned int offset,
- unsigned int length)
+static void metapage_invalidate_folio(struct folio *folio, size_t offset,
+ size_t length)
{
- BUG_ON(offset || length < PAGE_SIZE);
+ BUG_ON(offset || length < folio_size(folio));
- BUG_ON(PageWriteback(page));
+ BUG_ON(folio_test_writeback(folio));
- metapage_releasepage(page, 0);
+ metapage_releasepage(&folio->page, 0);
}
const struct address_space_operations jfs_metapage_aops = {
.readpage = metapage_readpage,
.writepage = metapage_writepage,
.releasepage = metapage_releasepage,
- .invalidatepage = metapage_invalidatepage,
- .set_page_dirty = __set_page_dirty_nobuffers,
+ .invalidate_folio = metapage_invalidate_folio,
+ .dirty_folio = filemap_dirty_folio,
};
struct metapage *__get_metapage(struct inode *inode, unsigned long lblock,
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 24cbc9946e01..f1a13a74cddf 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -102,7 +102,7 @@ static struct inode *jfs_alloc_inode(struct super_block *sb)
{
struct jfs_inode_info *jfs_inode;
- jfs_inode = kmem_cache_alloc(jfs_inode_cachep, GFP_NOFS);
+ jfs_inode = alloc_inode_sb(sb, jfs_inode_cachep, GFP_NOFS);
if (!jfs_inode)
return NULL;
#ifdef CONFIG_QUOTA
diff --git a/fs/ksmbd/auth.c b/fs/ksmbd/auth.c
index dc3d061edda9..911444d21267 100644
--- a/fs/ksmbd/auth.c
+++ b/fs/ksmbd/auth.c
@@ -29,6 +29,7 @@
#include "mgmt/user_config.h"
#include "crypto_ctx.h"
#include "transport_ipc.h"
+#include "../smbfs_common/arc4.h"
/*
* Fixed format data defining GSS header and fixed string
@@ -336,6 +337,29 @@ int ksmbd_decode_ntlmssp_auth_blob(struct authenticate_message *authblob,
nt_len - CIFS_ENCPWD_SIZE,
domain_name, conn->ntlmssp.cryptkey);
kfree(domain_name);
+
+ /* The recovered secondary session key */
+ if (conn->ntlmssp.client_flags & NTLMSSP_NEGOTIATE_KEY_XCH) {
+ struct arc4_ctx *ctx_arc4;
+ unsigned int sess_key_off, sess_key_len;
+
+ sess_key_off = le32_to_cpu(authblob->SessionKey.BufferOffset);
+ sess_key_len = le16_to_cpu(authblob->SessionKey.Length);
+
+ if (blob_len < (u64)sess_key_off + sess_key_len)
+ return -EINVAL;
+
+ ctx_arc4 = kmalloc(sizeof(*ctx_arc4), GFP_KERNEL);
+ if (!ctx_arc4)
+ return -ENOMEM;
+
+ cifs_arc4_setkey(ctx_arc4, sess->sess_key,
+ SMB2_NTLMV2_SESSKEY_SIZE);
+ cifs_arc4_crypt(ctx_arc4, sess->sess_key,
+ (char *)authblob + sess_key_off, sess_key_len);
+ kfree_sensitive(ctx_arc4);
+ }
+
return ret;
}
@@ -408,6 +432,9 @@ ksmbd_build_ntlmssp_challenge_blob(struct challenge_message *chgblob,
(cflags & NTLMSSP_NEGOTIATE_EXTENDED_SEC))
flags |= NTLMSSP_NEGOTIATE_EXTENDED_SEC;
+ if (cflags & NTLMSSP_NEGOTIATE_KEY_XCH)
+ flags |= NTLMSSP_NEGOTIATE_KEY_XCH;
+
chgblob->NegotiateFlags = cpu_to_le32(flags);
len = strlen(ksmbd_netbios_name());
name = kmalloc(2 + UNICODE_LEN(len), GFP_KERNEL);
diff --git a/fs/ksmbd/ksmbd_netlink.h b/fs/ksmbd/ksmbd_netlink.h
index 71bfb7de4472..ebe6ca08467a 100644
--- a/fs/ksmbd/ksmbd_netlink.h
+++ b/fs/ksmbd/ksmbd_netlink.h
@@ -241,7 +241,7 @@ struct ksmbd_rpc_command {
struct ksmbd_spnego_authen_request {
__u32 handle;
__u16 spnego_blob_len; /* the length of spnego_blob */
- __u8 spnego_blob[0]; /*
+ __u8 spnego_blob[]; /*
* the GSS token from SecurityBuffer of
* SMB2 SESSION SETUP request
*/
diff --git a/fs/ksmbd/ntlmssp.h b/fs/ksmbd/ntlmssp.h
index adaf4c0cbe8f..f13153c18b4e 100644
--- a/fs/ksmbd/ntlmssp.h
+++ b/fs/ksmbd/ntlmssp.h
@@ -95,7 +95,7 @@ struct security_buffer {
struct target_info {
__le16 Type;
__le16 Length;
- __u8 Content[0];
+ __u8 Content[];
} __packed;
struct negotiate_message {
@@ -108,7 +108,7 @@ struct negotiate_message {
* struct security_buffer for version info not present since we
* do not set the version is present flag
*/
- char DomainString[0];
+ char DomainString[];
/* followed by WorkstationString */
} __packed;
@@ -140,7 +140,7 @@ struct authenticate_message {
* struct security_buffer for version info not present since we
* do not set the version is present flag
*/
- char UserString[0];
+ char UserString[];
} __packed;
struct ntlmv2_resp {
diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c
index 1866c81c5c99..67e8e28e3fc3 100644
--- a/fs/ksmbd/smb2pdu.c
+++ b/fs/ksmbd/smb2pdu.c
@@ -2688,7 +2688,7 @@ int smb2_open(struct ksmbd_work *work)
(struct create_posix *)context;
if (le16_to_cpu(context->DataOffset) +
le32_to_cpu(context->DataLength) <
- sizeof(struct create_posix)) {
+ sizeof(struct create_posix) - 4) {
rc = -EINVAL;
goto err_out1;
}
@@ -3422,9 +3422,9 @@ static int smb2_populate_readdir_entry(struct ksmbd_conn *conn, int info_level,
goto free_conv_name;
}
- struct_sz = readdir_info_level_struct_sz(info_level);
- next_entry_offset = ALIGN(struct_sz - 1 + conv_len,
- KSMBD_DIR_INFO_ALIGNMENT);
+ struct_sz = readdir_info_level_struct_sz(info_level) - 1 + conv_len;
+ next_entry_offset = ALIGN(struct_sz, KSMBD_DIR_INFO_ALIGNMENT);
+ d_info->last_entry_off_align = next_entry_offset - struct_sz;
if (next_entry_offset > d_info->out_buf_len) {
d_info->out_buf_len = 0;
@@ -3976,6 +3976,7 @@ int smb2_query_dir(struct ksmbd_work *work)
((struct file_directory_info *)
((char *)rsp->Buffer + d_info.last_entry_offset))
->NextEntryOffset = 0;
+ d_info.data_count -= d_info.last_entry_off_align;
rsp->StructureSize = cpu_to_le16(9);
rsp->OutputBufferOffset = cpu_to_le16(72);
@@ -6126,13 +6127,26 @@ static int smb2_set_remote_key_for_rdma(struct ksmbd_work *work,
__le16 ChannelInfoOffset,
__le16 ChannelInfoLength)
{
+ unsigned int i, ch_count;
+
if (work->conn->dialect == SMB30_PROT_ID &&
Channel != SMB2_CHANNEL_RDMA_V1)
return -EINVAL;
- if (ChannelInfoOffset == 0 ||
- le16_to_cpu(ChannelInfoLength) < sizeof(*desc))
+ ch_count = le16_to_cpu(ChannelInfoLength) / sizeof(*desc);
+ if (ksmbd_debug_types & KSMBD_DEBUG_RDMA) {
+ for (i = 0; i < ch_count; i++) {
+ pr_info("RDMA r/w request %#x: token %#x, length %#x\n",
+ i,
+ le32_to_cpu(desc[i].token),
+ le32_to_cpu(desc[i].length));
+ }
+ }
+ if (ch_count != 1) {
+ ksmbd_debug(RDMA, "RDMA multiple buffer descriptors %d are not supported yet\n",
+ ch_count);
return -EINVAL;
+ }
work->need_invalidate_rkey =
(Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE);
@@ -6185,9 +6199,15 @@ int smb2_read(struct ksmbd_work *work)
if (req->Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE ||
req->Channel == SMB2_CHANNEL_RDMA_V1) {
+ unsigned int ch_offset = le16_to_cpu(req->ReadChannelInfoOffset);
+
+ if (ch_offset < offsetof(struct smb2_read_req, Buffer)) {
+ err = -EINVAL;
+ goto out;
+ }
err = smb2_set_remote_key_for_rdma(work,
(struct smb2_buffer_desc_v1 *)
- &req->Buffer[0],
+ ((char *)req + ch_offset),
req->Channel,
req->ReadChannelInfoOffset,
req->ReadChannelInfoLength);
@@ -6428,11 +6448,16 @@ int smb2_write(struct ksmbd_work *work)
if (req->Channel == SMB2_CHANNEL_RDMA_V1 ||
req->Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE) {
- if (req->Length != 0 || req->DataOffset != 0)
- return -EINVAL;
+ unsigned int ch_offset = le16_to_cpu(req->WriteChannelInfoOffset);
+
+ if (req->Length != 0 || req->DataOffset != 0 ||
+ ch_offset < offsetof(struct smb2_write_req, Buffer)) {
+ err = -EINVAL;
+ goto out;
+ }
err = smb2_set_remote_key_for_rdma(work,
(struct smb2_buffer_desc_v1 *)
- &req->Buffer[0],
+ ((char *)req + ch_offset),
req->Channel,
req->WriteChannelInfoOffset,
req->WriteChannelInfoLength);
diff --git a/fs/ksmbd/smb2pdu.h b/fs/ksmbd/smb2pdu.h
index 725b800c29c8..d49468426576 100644
--- a/fs/ksmbd/smb2pdu.h
+++ b/fs/ksmbd/smb2pdu.h
@@ -759,7 +759,7 @@ struct smb2_file_rename_info { /* encoding of request for level 10 */
__u8 Reserved[7];
__u64 RootDirectory; /* MBZ for network operations (why says spec?) */
__le32 FileNameLength;
- char FileName[0]; /* New name to be assigned */
+ char FileName[]; /* New name to be assigned */
} __packed; /* level 10 Set */
struct smb2_file_link_info { /* encoding of request for level 11 */
@@ -768,7 +768,7 @@ struct smb2_file_link_info { /* encoding of request for level 11 */
__u8 Reserved[7];
__u64 RootDirectory; /* MBZ for network operations (why says spec?) */
__le32 FileNameLength;
- char FileName[0]; /* Name to be assigned to new link */
+ char FileName[]; /* Name to be assigned to new link */
} __packed; /* level 11 Set */
/*
@@ -810,7 +810,7 @@ struct smb2_file_basic_info { /* data block encoding of response to level 18 */
struct smb2_file_alt_name_info {
__le32 FileNameLength;
- char FileName[0];
+ char FileName[];
} __packed;
struct smb2_file_stream_info {
@@ -818,7 +818,7 @@ struct smb2_file_stream_info {
__le32 StreamNameLength;
__le64 StreamSize;
__le64 StreamAllocationSize;
- char StreamName[0];
+ char StreamName[];
} __packed;
struct smb2_file_eof_info { /* encoding of request for level 10 */
diff --git a/fs/ksmbd/smb_common.c b/fs/ksmbd/smb_common.c
index ef7f42b0290a..9a7e211dbf4f 100644
--- a/fs/ksmbd/smb_common.c
+++ b/fs/ksmbd/smb_common.c
@@ -308,14 +308,17 @@ int ksmbd_populate_dot_dotdot_entries(struct ksmbd_work *work, int info_level,
for (i = 0; i < 2; i++) {
struct kstat kstat;
struct ksmbd_kstat ksmbd_kstat;
+ struct dentry *dentry;
if (!dir->dot_dotdot[i]) { /* fill dot entry info */
if (i == 0) {
d_info->name = ".";
d_info->name_len = 1;
+ dentry = dir->filp->f_path.dentry;
} else {
d_info->name = "..";
d_info->name_len = 2;
+ dentry = dir->filp->f_path.dentry->d_parent;
}
if (!match_pattern(d_info->name, d_info->name_len,
@@ -327,7 +330,7 @@ int ksmbd_populate_dot_dotdot_entries(struct ksmbd_work *work, int info_level,
ksmbd_kstat.kstat = &kstat;
ksmbd_vfs_fill_dentry_attrs(work,
user_ns,
- dir->filp->f_path.dentry->d_parent,
+ dentry,
&ksmbd_kstat);
rc = fn(conn, info_level, d_info, &ksmbd_kstat);
if (rc)
diff --git a/fs/ksmbd/transport_rdma.c b/fs/ksmbd/transport_rdma.c
index 3c1ec1ac0b27..e646d79554b8 100644
--- a/fs/ksmbd/transport_rdma.c
+++ b/fs/ksmbd/transport_rdma.c
@@ -80,7 +80,7 @@ static int smb_direct_max_fragmented_recv_size = 1024 * 1024;
/* The maximum single-message size which can be received */
static int smb_direct_max_receive_size = 8192;
-static int smb_direct_max_read_write_size = 1048512;
+static int smb_direct_max_read_write_size = 524224;
static int smb_direct_max_outstanding_rw_ops = 8;
@@ -211,7 +211,7 @@ struct smb_direct_rdma_rw_msg {
struct completion *completion;
struct rdma_rw_ctx rw_ctx;
struct sg_table sgt;
- struct scatterlist sg_list[0];
+ struct scatterlist sg_list[];
};
static inline int get_buf_page_count(void *buf, int size)
diff --git a/fs/ksmbd/vfs.h b/fs/ksmbd/vfs.h
index adf94a4f22fa..8c37aaf936ab 100644
--- a/fs/ksmbd/vfs.h
+++ b/fs/ksmbd/vfs.h
@@ -47,6 +47,7 @@ struct ksmbd_dir_info {
int last_entry_offset;
bool hide_dot_file;
int flags;
+ int last_entry_off_align;
};
struct ksmbd_readdir_data {
diff --git a/fs/ksmbd/xattr.h b/fs/ksmbd/xattr.h
index 8857c01093d9..16499ca5c82d 100644
--- a/fs/ksmbd/xattr.h
+++ b/fs/ksmbd/xattr.h
@@ -76,7 +76,7 @@ struct xattr_acl_entry {
struct xattr_smb_acl {
int count;
int next;
- struct xattr_acl_entry entries[0];
+ struct xattr_acl_entry entries[];
};
/* 64bytes hash in xattr_ntacl is computed with sha256 */
diff --git a/fs/libfs.c b/fs/libfs.c
index ba7438ab9371..e64bdedef168 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -631,7 +631,7 @@ const struct address_space_operations ram_aops = {
.readpage = simple_readpage,
.write_begin = simple_write_begin,
.write_end = simple_write_end,
- .set_page_dirty = __set_page_dirty_no_writeback,
+ .dirty_folio = noop_dirty_folio,
};
EXPORT_SYMBOL(ram_aops);
@@ -1198,17 +1198,6 @@ int noop_fsync(struct file *file, loff_t start, loff_t end, int datasync)
}
EXPORT_SYMBOL(noop_fsync);
-void noop_invalidatepage(struct page *page, unsigned int offset,
- unsigned int length)
-{
- /*
- * There is no page cache to invalidate in the dax case, however
- * we need this callback defined to prevent falling back to
- * block_invalidatepage() in do_invalidatepage().
- */
-}
-EXPORT_SYMBOL_GPL(noop_invalidatepage);
-
ssize_t noop_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
/*
@@ -1231,7 +1220,7 @@ EXPORT_SYMBOL(kfree_link);
struct inode *alloc_anon_inode(struct super_block *s)
{
static const struct address_space_operations anon_aops = {
- .set_page_dirty = __set_page_dirty_no_writeback,
+ .dirty_folio = noop_dirty_folio,
};
struct inode *inode = new_inode_pseudo(s);
@@ -1379,7 +1368,7 @@ bool is_empty_dir_inode(struct inode *inode)
(inode->i_op == &empty_dir_inode_operations);
}
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
/*
* Determine if the name of a dentry should be casefolded.
*
@@ -1473,7 +1462,7 @@ static const struct dentry_operations generic_encrypted_dentry_ops = {
};
#endif
-#if defined(CONFIG_FS_ENCRYPTION) && defined(CONFIG_UNICODE)
+#if defined(CONFIG_FS_ENCRYPTION) && IS_ENABLED(CONFIG_UNICODE)
static const struct dentry_operations generic_encrypted_ci_dentry_ops = {
.d_hash = generic_ci_d_hash,
.d_compare = generic_ci_d_compare,
@@ -1508,10 +1497,10 @@ void generic_set_encrypted_ci_d_ops(struct dentry *dentry)
#ifdef CONFIG_FS_ENCRYPTION
bool needs_encrypt_ops = dentry->d_flags & DCACHE_NOKEY_NAME;
#endif
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
bool needs_ci_ops = dentry->d_sb->s_encoding;
#endif
-#if defined(CONFIG_FS_ENCRYPTION) && defined(CONFIG_UNICODE)
+#if defined(CONFIG_FS_ENCRYPTION) && IS_ENABLED(CONFIG_UNICODE)
if (needs_encrypt_ops && needs_ci_ops) {
d_set_d_op(dentry, &generic_encrypted_ci_dentry_ops);
return;
@@ -1523,7 +1512,7 @@ void generic_set_encrypted_ci_d_ops(struct dentry *dentry)
return;
}
#endif
-#ifdef CONFIG_UNICODE
+#if IS_ENABLED(CONFIG_UNICODE)
if (needs_ci_ops) {
d_set_d_op(dentry, &generic_ci_dentry_ops);
return;
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 0475c5a5d061..59ef8a1f843f 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -184,8 +184,7 @@ lockd(void *vrqstp)
dprintk("lockd_down: service stopped\n");
svc_exit_thread(rqstp);
-
- module_put_and_kthread_exit(0);
+ return 0;
}
static int create_lockd_listener(struct svc_serv *serv, const char *name,
@@ -197,8 +196,8 @@ static int create_lockd_listener(struct svc_serv *serv, const char *name,
xprt = svc_find_xprt(serv, name, net, family, 0);
if (xprt == NULL)
- return svc_create_xprt(serv, name, net, family, port,
- SVC_SOCK_DEFAULTS, cred);
+ return svc_xprt_create(serv, name, net, family, port,
+ SVC_SOCK_DEFAULTS, cred);
svc_xprt_put(xprt);
return 0;
}
@@ -248,7 +247,8 @@ out_err:
if (warned++ == 0)
printk(KERN_WARNING
"lockd_up: makesock failed, error=%d\n", err);
- svc_shutdown_net(serv, net);
+ svc_xprt_destroy_all(serv, net);
+ svc_rpcb_cleanup(serv, net);
return err;
}
@@ -286,9 +286,8 @@ static void lockd_down_net(struct svc_serv *serv, struct net *net)
nlm_shutdown_hosts_net(net);
cancel_delayed_work_sync(&ln->grace_period_end);
locks_end_grace(&ln->lockd_manager);
- svc_shutdown_net(serv, net);
- dprintk("%s: per-net data destroyed; net=%x\n",
- __func__, net->ns.inum);
+ svc_xprt_destroy_all(serv, net);
+ svc_rpcb_cleanup(serv, net);
}
} else {
pr_err("%s: no users! net=%x\n",
@@ -350,13 +349,6 @@ static struct notifier_block lockd_inet6addr_notifier = {
};
#endif
-static const struct svc_serv_ops lockd_sv_ops = {
- .svo_shutdown = svc_rpcb_cleanup,
- .svo_function = lockd,
- .svo_enqueue_xprt = svc_xprt_do_enqueue,
- .svo_module = THIS_MODULE,
-};
-
static int lockd_get(void)
{
struct svc_serv *serv;
@@ -380,7 +372,7 @@ static int lockd_get(void)
nlm_timeout = LOCKD_DFLT_TIMEO;
nlmsvc_timeout = nlm_timeout * HZ;
- serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, &lockd_sv_ops);
+ serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, lockd);
if (!serv) {
printk(KERN_WARNING "lockd_up: create service failed\n");
return -ENOMEM;
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index cb3a7512c33e..0a22a2faf552 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -179,19 +179,21 @@ nlm_delete_file(struct nlm_file *file)
static int nlm_unlock_files(struct nlm_file *file)
{
struct file_lock lock;
- struct file *f;
+ locks_init_lock(&lock);
lock.fl_type = F_UNLCK;
lock.fl_start = 0;
lock.fl_end = OFFSET_MAX;
- for (f = file->f_file[0]; f <= file->f_file[1]; f++) {
- if (f && vfs_lock_file(f, F_SETLK, &lock, NULL) < 0) {
- pr_warn("lockd: unlock failure in %s:%d\n",
- __FILE__, __LINE__);
- return 1;
- }
- }
+ if (file->f_file[O_RDONLY] &&
+ vfs_lock_file(file->f_file[O_RDONLY], F_SETLK, &lock, NULL))
+ goto out_err;
+ if (file->f_file[O_WRONLY] &&
+ vfs_lock_file(file->f_file[O_WRONLY], F_SETLK, &lock, NULL))
+ goto out_err;
return 0;
+out_err:
+ pr_warn("lockd: unlock failure in %s:%d\n", __FILE__, __LINE__);
+ return 1;
}
/*
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index a71f1cf894b9..f1a6610e4ee6 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -63,7 +63,7 @@ static struct kmem_cache * minix_inode_cachep;
static struct inode *minix_alloc_inode(struct super_block *sb)
{
struct minix_inode_info *ei;
- ei = kmem_cache_alloc(minix_inode_cachep, GFP_KERNEL);
+ ei = alloc_inode_sb(sb, minix_inode_cachep, GFP_KERNEL);
if (!ei)
return NULL;
return &ei->vfs_inode;
@@ -442,12 +442,14 @@ static sector_t minix_bmap(struct address_space *mapping, sector_t block)
}
static const struct address_space_operations minix_aops = {
- .set_page_dirty = __set_page_dirty_buffers,
+ .dirty_folio = block_dirty_folio,
+ .invalidate_folio = block_invalidate_folio,
.readpage = minix_readpage,
.writepage = minix_writepage,
.write_begin = minix_write_begin,
.write_end = generic_write_end,
- .bmap = minix_bmap
+ .bmap = minix_bmap,
+ .direct_IO = noop_direct_IO
};
static const struct inode_operations minix_symlink_inode_operations = {
diff --git a/fs/mpage.c b/fs/mpage.c
index 2492b30192c0..1fe56f8c495f 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -477,7 +477,7 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
if (!buffer_mapped(bh)) {
/*
* unmapped dirty buffers are created by
- * __set_page_dirty_buffers -> mmapped data
+ * block_dirty_folio -> mmapped data
*/
if (buffer_dirty(bh))
goto confused;
@@ -586,7 +586,6 @@ alloc_new:
GFP_NOFS);
bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9);
wbc_init_bio(wbc, bio);
- bio->bi_write_hint = inode->i_write_hint;
}
/*
diff --git a/fs/namespace.c b/fs/namespace.c
index 40b994a29e90..6e9844b8c6fb 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -344,8 +344,24 @@ int __mnt_want_write(struct vfsmount *m)
* incremented count after it has set MNT_WRITE_HOLD.
*/
smp_mb();
- while (READ_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD)
- cpu_relax();
+ might_lock(&mount_lock.lock);
+ while (READ_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) {
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
+ cpu_relax();
+ } else {
+ /*
+ * This prevents priority inversion, if the task
+ * setting MNT_WRITE_HOLD got preempted on a remote
+ * CPU, and it prevents life lock if the task setting
+ * MNT_WRITE_HOLD has a lower priority and is bound to
+ * the same CPU as the task that is spinning here.
+ */
+ preempt_enable();
+ lock_mount_hash();
+ unlock_mount_hash();
+ preempt_disable();
+ }
+ }
/*
* After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
* be set to match its requirements. So we must not load that until
@@ -469,6 +485,24 @@ void mnt_drop_write_file(struct file *file)
}
EXPORT_SYMBOL(mnt_drop_write_file);
+/**
+ * mnt_hold_writers - prevent write access to the given mount
+ * @mnt: mnt to prevent write access to
+ *
+ * Prevents write access to @mnt if there are no active writers for @mnt.
+ * This function needs to be called and return successfully before changing
+ * properties of @mnt that need to remain stable for callers with write access
+ * to @mnt.
+ *
+ * After this functions has been called successfully callers must pair it with
+ * a call to mnt_unhold_writers() in order to stop preventing write access to
+ * @mnt.
+ *
+ * Context: This function expects lock_mount_hash() to be held serializing
+ * setting MNT_WRITE_HOLD.
+ * Return: On success 0 is returned.
+ * On error, -EBUSY is returned.
+ */
static inline int mnt_hold_writers(struct mount *mnt)
{
mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
@@ -500,6 +534,18 @@ static inline int mnt_hold_writers(struct mount *mnt)
return 0;
}
+/**
+ * mnt_unhold_writers - stop preventing write access to the given mount
+ * @mnt: mnt to stop preventing write access to
+ *
+ * Stop preventing write access to @mnt allowing callers to gain write access
+ * to @mnt again.
+ *
+ * This function can only be called after a successful call to
+ * mnt_hold_writers().
+ *
+ * Context: This function expects lock_mount_hash() to be held.
+ */
static inline void mnt_unhold_writers(struct mount *mnt)
{
/*
@@ -533,12 +579,9 @@ int sb_prepare_remount_readonly(struct super_block *sb)
lock_mount_hash();
list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
if (!(mnt->mnt.mnt_flags & MNT_READONLY)) {
- mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
- smp_mb();
- if (mnt_get_writers(mnt) > 0) {
- err = -EBUSY;
+ err = mnt_hold_writers(mnt);
+ if (err)
break;
- }
}
}
if (!err && atomic_long_read(&sb->s_remove_count))
@@ -2567,6 +2610,7 @@ static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount *
struct super_block *sb = mnt->mnt_sb;
if (!__mnt_is_readonly(mnt) &&
+ (!(sb->s_iflags & SB_I_TS_EXPIRY_WARNED)) &&
(ktime_get_real_seconds() + TIME_UPTIME_SEC_MAX > sb->s_time_max)) {
char *buf = (char *)__get_free_page(GFP_KERNEL);
char *mntpath = buf ? d_path(mountpoint, buf, PAGE_SIZE) : ERR_PTR(-ENOMEM);
@@ -2581,6 +2625,7 @@ static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount *
tm.tm_year+1900, (unsigned long long)sb->s_time_max);
free_page((unsigned long)buf);
+ sb->s_iflags |= SB_I_TS_EXPIRY_WARNED;
}
}
@@ -3968,46 +4013,57 @@ static int can_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt)
return 0;
}
-static struct mount *mount_setattr_prepare(struct mount_kattr *kattr,
- struct mount *mnt, int *err)
+/**
+ * mnt_allow_writers() - check whether the attribute change allows writers
+ * @kattr: the new mount attributes
+ * @mnt: the mount to which @kattr will be applied
+ *
+ * Check whether thew new mount attributes in @kattr allow concurrent writers.
+ *
+ * Return: true if writers need to be held, false if not
+ */
+static inline bool mnt_allow_writers(const struct mount_kattr *kattr,
+ const struct mount *mnt)
{
- struct mount *m = mnt, *last = NULL;
+ return !(kattr->attr_set & MNT_READONLY) ||
+ (mnt->mnt.mnt_flags & MNT_READONLY);
+}
- if (!is_mounted(&m->mnt)) {
- *err = -EINVAL;
- goto out;
- }
+static int mount_setattr_prepare(struct mount_kattr *kattr, struct mount *mnt)
+{
+ struct mount *m;
+ int err;
- if (!(mnt_has_parent(m) ? check_mnt(m) : is_anon_ns(m->mnt_ns))) {
- *err = -EINVAL;
- goto out;
- }
+ for (m = mnt; m; m = next_mnt(m, mnt)) {
+ if (!can_change_locked_flags(m, recalc_flags(kattr, m))) {
+ err = -EPERM;
+ break;
+ }
- do {
- unsigned int flags;
+ err = can_idmap_mount(kattr, m);
+ if (err)
+ break;
- flags = recalc_flags(kattr, m);
- if (!can_change_locked_flags(m, flags)) {
- *err = -EPERM;
- goto out;
+ if (!mnt_allow_writers(kattr, m)) {
+ err = mnt_hold_writers(m);
+ if (err)
+ break;
}
- *err = can_idmap_mount(kattr, m);
- if (*err)
- goto out;
+ if (!kattr->recurse)
+ return 0;
+ }
- last = m;
+ if (err) {
+ struct mount *p;
- if ((kattr->attr_set & MNT_READONLY) &&
- !(m->mnt.mnt_flags & MNT_READONLY)) {
- *err = mnt_hold_writers(m);
- if (*err)
- goto out;
+ for (p = mnt; p != m; p = next_mnt(p, mnt)) {
+ /* If we had to hold writers unblock them. */
+ if (p->mnt.mnt_flags & MNT_WRITE_HOLD)
+ mnt_unhold_writers(p);
}
- } while (kattr->recurse && (m = next_mnt(m, mnt)));
-
-out:
- return last;
+ }
+ return err;
}
static void do_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt)
@@ -4035,48 +4091,32 @@ static void do_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt)
put_user_ns(old_mnt_userns);
}
-static void mount_setattr_commit(struct mount_kattr *kattr,
- struct mount *mnt, struct mount *last,
- int err)
+static void mount_setattr_commit(struct mount_kattr *kattr, struct mount *mnt)
{
- struct mount *m = mnt;
+ struct mount *m;
- do {
- if (!err) {
- unsigned int flags;
+ for (m = mnt; m; m = next_mnt(m, mnt)) {
+ unsigned int flags;
- do_idmap_mount(kattr, m);
- flags = recalc_flags(kattr, m);
- WRITE_ONCE(m->mnt.mnt_flags, flags);
- }
+ do_idmap_mount(kattr, m);
+ flags = recalc_flags(kattr, m);
+ WRITE_ONCE(m->mnt.mnt_flags, flags);
- /*
- * We either set MNT_READONLY above so make it visible
- * before ~MNT_WRITE_HOLD or we failed to recursively
- * apply mount options.
- */
- if ((kattr->attr_set & MNT_READONLY) &&
- (m->mnt.mnt_flags & MNT_WRITE_HOLD))
+ /* If we had to hold writers unblock them. */
+ if (m->mnt.mnt_flags & MNT_WRITE_HOLD)
mnt_unhold_writers(m);
- if (!err && kattr->propagation)
+ if (kattr->propagation)
change_mnt_propagation(m, kattr->propagation);
-
- /*
- * On failure, only cleanup until we found the first mount
- * we failed to handle.
- */
- if (err && m == last)
+ if (!kattr->recurse)
break;
- } while (kattr->recurse && (m = next_mnt(m, mnt)));
-
- if (!err)
- touch_mnt_namespace(mnt->mnt_ns);
+ }
+ touch_mnt_namespace(mnt->mnt_ns);
}
static int do_mount_setattr(struct path *path, struct mount_kattr *kattr)
{
- struct mount *mnt = real_mount(path->mnt), *last = NULL;
+ struct mount *mnt = real_mount(path->mnt);
int err = 0;
if (path->dentry != mnt->mnt.mnt_root)
@@ -4097,16 +4137,32 @@ static int do_mount_setattr(struct path *path, struct mount_kattr *kattr)
}
}
+ err = -EINVAL;
lock_mount_hash();
+ /* Ensure that this isn't anything purely vfs internal. */
+ if (!is_mounted(&mnt->mnt))
+ goto out;
+
+ /*
+ * If this is an attached mount make sure it's located in the callers
+ * mount namespace. If it's not don't let the caller interact with it.
+ * If this is a detached mount make sure it has an anonymous mount
+ * namespace attached to it, i.e. we've created it via OPEN_TREE_CLONE.
+ */
+ if (!(mnt_has_parent(mnt) ? check_mnt(mnt) : is_anon_ns(mnt->mnt_ns)))
+ goto out;
+
/*
- * Get the mount tree in a shape where we can change mount
- * properties without failure.
+ * First, we get the mount tree in a shape where we can change mount
+ * properties without failure. If we succeeded to do so we commit all
+ * changes and if we failed we clean up.
*/
- last = mount_setattr_prepare(kattr, mnt, &err);
- if (last) /* Commit all changes or revert to the old state. */
- mount_setattr_commit(kattr, mnt, last, err);
+ err = mount_setattr_prepare(kattr, mnt);
+ if (!err)
+ mount_setattr_commit(kattr, mnt);
+out:
unlock_mount_hash();
if (kattr->propagation) {
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 054cc1255fac..456af7d230cf 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -17,7 +17,6 @@
#include <linux/errno.h>
#include <linux/mutex.h>
#include <linux/freezer.h>
-#include <linux/kthread.h>
#include <linux/sunrpc/svcauth_gss.h>
#include <linux/sunrpc/bc_xprt.h>
@@ -45,18 +44,18 @@ static int nfs4_callback_up_net(struct svc_serv *serv, struct net *net)
int ret;
struct nfs_net *nn = net_generic(net, nfs_net_id);
- ret = svc_create_xprt(serv, "tcp", net, PF_INET,
- nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS,
- cred);
+ ret = svc_xprt_create(serv, "tcp", net, PF_INET,
+ nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS,
+ cred);
if (ret <= 0)
goto out_err;
nn->nfs_callback_tcpport = ret;
dprintk("NFS: Callback listener port = %u (af %u, net %x)\n",
nn->nfs_callback_tcpport, PF_INET, net->ns.inum);
- ret = svc_create_xprt(serv, "tcp", net, PF_INET6,
- nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS,
- cred);
+ ret = svc_xprt_create(serv, "tcp", net, PF_INET6,
+ nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS,
+ cred);
if (ret > 0) {
nn->nfs_callback_tcpport6 = ret;
dprintk("NFS: Callback listener port = %u (af %u, net %x)\n",
@@ -92,8 +91,8 @@ nfs4_callback_svc(void *vrqstp)
continue;
svc_process(rqstp);
}
+
svc_exit_thread(rqstp);
- module_put_and_kthread_exit(0);
return 0;
}
@@ -136,8 +135,8 @@ nfs41_callback_svc(void *vrqstp)
finish_wait(&serv->sv_cb_waitq, &wq);
}
}
+
svc_exit_thread(rqstp);
- module_put_and_kthread_exit(0);
return 0;
}
@@ -189,7 +188,7 @@ static void nfs_callback_down_net(u32 minorversion, struct svc_serv *serv, struc
return;
dprintk("NFS: destroy per-net callback data; net=%x\n", net->ns.inum);
- svc_shutdown_net(serv, net);
+ svc_xprt_destroy_all(serv, net);
}
static int nfs_callback_up_net(int minorversion, struct svc_serv *serv,
@@ -232,33 +231,10 @@ err_bind:
return ret;
}
-static const struct svc_serv_ops nfs40_cb_sv_ops = {
- .svo_function = nfs4_callback_svc,
- .svo_enqueue_xprt = svc_xprt_do_enqueue,
- .svo_module = THIS_MODULE,
-};
-#if defined(CONFIG_NFS_V4_1)
-static const struct svc_serv_ops nfs41_cb_sv_ops = {
- .svo_function = nfs41_callback_svc,
- .svo_enqueue_xprt = svc_xprt_do_enqueue,
- .svo_module = THIS_MODULE,
-};
-
-static const struct svc_serv_ops *nfs4_cb_sv_ops[] = {
- [0] = &nfs40_cb_sv_ops,
- [1] = &nfs41_cb_sv_ops,
-};
-#else
-static const struct svc_serv_ops *nfs4_cb_sv_ops[] = {
- [0] = &nfs40_cb_sv_ops,
- [1] = NULL,
-};
-#endif
-
static struct svc_serv *nfs_callback_create_svc(int minorversion)
{
struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion];
- const struct svc_serv_ops *sv_ops;
+ int (*threadfn)(void *data);
struct svc_serv *serv;
/*
@@ -267,17 +243,6 @@ static struct svc_serv *nfs_callback_create_svc(int minorversion)
if (cb_info->serv)
return svc_get(cb_info->serv);
- switch (minorversion) {
- case 0:
- sv_ops = nfs4_cb_sv_ops[0];
- break;
- default:
- sv_ops = nfs4_cb_sv_ops[1];
- }
-
- if (sv_ops == NULL)
- return ERR_PTR(-ENOTSUPP);
-
/*
* Sanity check: if there's no task,
* we should be the first user ...
@@ -286,7 +251,16 @@ static struct svc_serv *nfs_callback_create_svc(int minorversion)
printk(KERN_WARNING "nfs_callback_create_svc: no kthread, %d users??\n",
cb_info->users);
- serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, sv_ops);
+ threadfn = nfs4_callback_svc;
+#if defined(CONFIG_NFS_V4_1)
+ if (minorversion)
+ threadfn = nfs41_callback_svc;
+#else
+ if (minorversion)
+ return ERR_PTR(-ENOTSUPP);
+#endif
+ serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE,
+ threadfn);
if (!serv) {
printk(KERN_ERR "nfs_callback_create_svc: create service failed\n");
return ERR_PTR(-ENOMEM);
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index f18e80fda9bf..d1f34229e11a 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -177,6 +177,7 @@ struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init)
INIT_LIST_HEAD(&clp->cl_superblocks);
clp->cl_rpcclient = ERR_PTR(-EINVAL);
+ clp->cl_flags = cl_init->init_flags;
clp->cl_proto = cl_init->proto;
clp->cl_nconnect = cl_init->nconnect;
clp->cl_max_connect = cl_init->max_connect ? cl_init->max_connect : 1;
@@ -423,7 +424,6 @@ struct nfs_client *nfs_get_client(const struct nfs_client_initdata *cl_init)
list_add_tail(&new->cl_share_link,
&nn->nfs_client_list);
spin_unlock(&nn->nfs_client_lock);
- new->cl_flags = cl_init->init_flags;
return rpc_ops->init_client(new, cl_init);
}
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 848f3b8fb821..75cb1cbe4cde 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -80,6 +80,7 @@ static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir
ctx->dir_cookie = 0;
ctx->dup_cookie = 0;
ctx->page_index = 0;
+ ctx->eof = false;
spin_lock(&dir->i_lock);
if (list_empty(&nfsi->open_files) &&
(nfsi->cache_validity & NFS_INO_DATA_INVAL_DEFER))
@@ -168,6 +169,7 @@ struct nfs_readdir_descriptor {
unsigned int cache_entry_index;
signed char duped;
bool plus;
+ bool eob;
bool eof;
};
@@ -867,7 +869,8 @@ static int nfs_readdir_xdr_to_array(struct nfs_readdir_descriptor *desc,
status = nfs_readdir_page_filler(desc, entry, pages, pglen,
arrays, narrays);
- } while (!status && nfs_readdir_page_needs_filling(page));
+ } while (!status && nfs_readdir_page_needs_filling(page) &&
+ page_mapping(page));
nfs_readdir_free_pages(pages, array_size);
out:
@@ -988,7 +991,7 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc,
ent = &array->array[i];
if (!dir_emit(desc->ctx, ent->name, ent->name_len,
nfs_compat_user_ino64(ent->ino), ent->d_type)) {
- desc->eof = true;
+ desc->eob = true;
break;
}
memcpy(desc->verf, verf, sizeof(desc->verf));
@@ -1004,7 +1007,7 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc,
desc->duped = 1;
}
if (array->page_is_eof)
- desc->eof = true;
+ desc->eof = !desc->eob;
kunmap(desc->page);
dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling ended @ cookie %llu\n",
@@ -1041,12 +1044,13 @@ static int uncached_readdir(struct nfs_readdir_descriptor *desc)
goto out;
desc->page_index = 0;
+ desc->cache_entry_index = 0;
desc->last_cookie = desc->dir_cookie;
desc->duped = 0;
status = nfs_readdir_xdr_to_array(desc, desc->verf, verf, arrays, sz);
- for (i = 0; !desc->eof && i < sz && arrays[i]; i++) {
+ for (i = 0; !desc->eob && i < sz && arrays[i]; i++) {
desc->page = arrays[i];
nfs_do_filldir(desc, verf);
}
@@ -1105,9 +1109,15 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
desc->duped = dir_ctx->duped;
page_index = dir_ctx->page_index;
desc->attr_gencount = dir_ctx->attr_gencount;
+ desc->eof = dir_ctx->eof;
memcpy(desc->verf, dir_ctx->verf, sizeof(desc->verf));
spin_unlock(&file->f_lock);
+ if (desc->eof) {
+ res = 0;
+ goto out_free;
+ }
+
if (test_and_clear_bit(NFS_INO_FORCE_READDIR, &nfsi->flags) &&
list_is_singular(&nfsi->open_files))
invalidate_mapping_pages(inode->i_mapping, page_index + 1, -1);
@@ -1141,7 +1151,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
nfs_do_filldir(desc, nfsi->cookieverf);
nfs_readdir_page_unlock_and_put_cached(desc);
- } while (!desc->eof);
+ } while (!desc->eob && !desc->eof);
spin_lock(&file->f_lock);
dir_ctx->dir_cookie = desc->dir_cookie;
@@ -1149,9 +1159,10 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
dir_ctx->duped = desc->duped;
dir_ctx->attr_gencount = desc->attr_gencount;
dir_ctx->page_index = desc->page_index;
+ dir_ctx->eof = desc->eof;
memcpy(dir_ctx->verf, desc->verf, sizeof(dir_ctx->verf));
spin_unlock(&file->f_lock);
-
+out_free:
kfree(desc);
out:
@@ -1193,6 +1204,7 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence)
if (offset == 0)
memset(dir_ctx->verf, 0, sizeof(dir_ctx->verf));
dir_ctx->duped = 0;
+ dir_ctx->eof = false;
}
spin_unlock(&filp->f_lock);
return offset;
@@ -1998,14 +2010,14 @@ no_open:
if (!res) {
inode = d_inode(dentry);
if ((lookup_flags & LOOKUP_DIRECTORY) && inode &&
- !S_ISDIR(inode->i_mode))
+ !(S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)))
res = ERR_PTR(-ENOTDIR);
else if (inode && S_ISREG(inode->i_mode))
res = ERR_PTR(-EOPENSTALE);
} else if (!IS_ERR(res)) {
inode = d_inode(res);
if ((lookup_flags & LOOKUP_DIRECTORY) && inode &&
- !S_ISDIR(inode->i_mode)) {
+ !(S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))) {
dput(res);
res = ERR_PTR(-ENOTDIR);
} else if (inode && S_ISREG(inode->i_mode)) {
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 76d76acbc594..2df2a5392737 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -406,17 +406,17 @@ static int nfs_write_end(struct file *file, struct address_space *mapping,
* - Called if either PG_private or PG_fscache is set on the page
* - Caller holds page lock
*/
-static void nfs_invalidate_page(struct page *page, unsigned int offset,
- unsigned int length)
+static void nfs_invalidate_folio(struct folio *folio, size_t offset,
+ size_t length)
{
- dfprintk(PAGECACHE, "NFS: invalidate_page(%p, %u, %u)\n",
- page, offset, length);
+ dfprintk(PAGECACHE, "NFS: invalidate_folio(%lu, %zu, %zu)\n",
+ folio->index, offset, length);
- if (offset != 0 || length < PAGE_SIZE)
+ if (offset != 0 || length < folio_size(folio))
return;
/* Cancel any unstarted writes on this page */
- nfs_wb_page_cancel(page_file_mapping(page)->host, page);
- wait_on_page_fscache(page);
+ nfs_wb_folio_cancel(folio->mapping->host, folio);
+ folio_wait_fscache(folio);
}
/*
@@ -472,15 +472,15 @@ static void nfs_check_dirty_writeback(struct page *page,
* - Caller holds page lock
* - Return 0 if successful, -error otherwise
*/
-static int nfs_launder_page(struct page *page)
+static int nfs_launder_folio(struct folio *folio)
{
- struct inode *inode = page_file_mapping(page)->host;
+ struct inode *inode = folio->mapping->host;
- dfprintk(PAGECACHE, "NFS: launder_page(%ld, %llu)\n",
- inode->i_ino, (long long)page_offset(page));
+ dfprintk(PAGECACHE, "NFS: launder_folio(%ld, %llu)\n",
+ inode->i_ino, folio_pos(folio));
- wait_on_page_fscache(page);
- return nfs_wb_page(inode, page);
+ folio_wait_fscache(folio);
+ return nfs_wb_page(inode, &folio->page);
}
static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file,
@@ -515,18 +515,18 @@ static void nfs_swap_deactivate(struct file *file)
const struct address_space_operations nfs_file_aops = {
.readpage = nfs_readpage,
.readpages = nfs_readpages,
- .set_page_dirty = __set_page_dirty_nobuffers,
+ .dirty_folio = filemap_dirty_folio,
.writepage = nfs_writepage,
.writepages = nfs_writepages,
.write_begin = nfs_write_begin,
.write_end = nfs_write_end,
- .invalidatepage = nfs_invalidate_page,
+ .invalidate_folio = nfs_invalidate_folio,
.releasepage = nfs_release_page,
.direct_IO = nfs_direct_IO,
#ifdef CONFIG_MIGRATION
.migratepage = nfs_migrate_page,
#endif
- .launder_page = nfs_launder_page,
+ .launder_folio = nfs_launder_folio,
.is_dirty_writeback = nfs_check_dirty_writeback,
.error_remove_page = generic_error_remove_page,
.swap_activate = nfs_swap_activate,
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index a918c3a834b6..3351c2de3e08 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -853,12 +853,9 @@ int nfs_getattr(struct user_namespace *mnt_userns, const struct path *path,
}
/* Flush out writes to the server in order to update c/mtime. */
- if ((request_mask & (STATX_CTIME|STATX_MTIME)) &&
- S_ISREG(inode->i_mode)) {
- err = filemap_write_and_wait(inode->i_mapping);
- if (err)
- goto out;
- }
+ if ((request_mask & (STATX_CTIME | STATX_MTIME)) &&
+ S_ISREG(inode->i_mode))
+ filemap_write_and_wait(inode->i_mapping);
/*
* We may force a getattr if the user cares about atime.
@@ -2241,7 +2238,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
struct inode *nfs_alloc_inode(struct super_block *sb)
{
struct nfs_inode *nfsi;
- nfsi = kmem_cache_alloc(nfs_inode_cachep, GFP_KERNEL);
+ nfsi = alloc_inode_sb(sb, nfs_inode_cachep, GFP_KERNEL);
if (!nfsi)
return NULL;
nfsi->flags = 0UL;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index b18f31b2c9e7..0e0db6c27619 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1229,8 +1229,7 @@ nfs4_update_changeattr_locked(struct inode *inode,
NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL |
NFS_INO_INVALID_SIZE | NFS_INO_INVALID_OTHER |
NFS_INO_INVALID_BLOCKS | NFS_INO_INVALID_NLINK |
- NFS_INO_INVALID_MODE | NFS_INO_INVALID_XATTR |
- NFS_INO_REVAL_PAGECACHE;
+ NFS_INO_INVALID_MODE | NFS_INO_INVALID_XATTR;
nfsi->attrtimeo = NFS_MINATTRTIMEO(inode);
}
nfsi->attrtimeo_timestamp = jiffies;
@@ -8032,7 +8031,8 @@ static int _nfs41_proc_get_locations(struct nfs_server *server,
/**
* nfs4_proc_get_locations - discover locations for a migrated FSID
- * @inode: inode on FSID that is migrating
+ * @server: pointer to nfs_server to process
+ * @fhandle: pointer to the kernel NFS client file handle
* @locations: result of query
* @page: buffer
* @cred: credential to use for this operation
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index f5a62c0d999b..02a899e4390f 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -2697,6 +2697,5 @@ static int nfs4_run_state_manager(void *ptr)
allow_signal(SIGKILL);
nfs4_state_manager(clp);
nfs_put_client(clp);
- module_put_and_kthread_exit(0);
return 0;
}
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 987a187bd39a..614e2809032e 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -417,7 +417,7 @@ static void nfs_set_page_writeback(struct page *page)
if (atomic_long_inc_return(&nfss->writeback) >
NFS_CONGESTION_ON_THRESH)
- set_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC);
+ nfss->write_congested = 1;
}
static void nfs_end_page_writeback(struct nfs_page *req)
@@ -433,7 +433,7 @@ static void nfs_end_page_writeback(struct nfs_page *req)
end_page_writeback(req->wb_page);
if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH)
- clear_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC);
+ nfss->write_congested = 0;
}
/*
@@ -672,6 +672,10 @@ static int nfs_writepage_locked(struct page *page,
struct inode *inode = page_file_mapping(page)->host;
int err;
+ if (wbc->sync_mode == WB_SYNC_NONE &&
+ NFS_SERVER(inode)->write_congested)
+ return AOP_WRITEPAGE_ACTIVATE;
+
nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE);
nfs_pageio_init_write(&pgio, inode, 0,
false, &nfs_async_write_completion_ops);
@@ -719,6 +723,10 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
int priority = 0;
int err;
+ if (wbc->sync_mode == WB_SYNC_NONE &&
+ NFS_SERVER(inode)->write_congested)
+ return 0;
+
nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES);
if (!(mntflags & NFS_MOUNT_WRITE_EAGER) || wbc->for_kupdate ||
@@ -1893,7 +1901,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
}
nfss = NFS_SERVER(data->inode);
if (atomic_long_read(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH)
- clear_bdi_congested(inode_to_bdi(data->inode), BLK_RW_ASYNC);
+ nfss->write_congested = 0;
nfs_init_cinfo(&cinfo, data->inode, data->dreq);
nfs_commit_end(cinfo.mds);
@@ -2049,21 +2057,21 @@ out:
}
EXPORT_SYMBOL_GPL(nfs_wb_all);
-int nfs_wb_page_cancel(struct inode *inode, struct page *page)
+int nfs_wb_folio_cancel(struct inode *inode, struct folio *folio)
{
struct nfs_page *req;
int ret = 0;
- wait_on_page_writeback(page);
+ folio_wait_writeback(folio);
/* blocking call to cancel all requests and join to a single (head)
* request */
- req = nfs_lock_and_join_requests(page);
+ req = nfs_lock_and_join_requests(&folio->page);
if (IS_ERR(req)) {
ret = PTR_ERR(req);
} else if (req) {
- /* all requests from this page have been cancelled by
+ /* all requests from this folio have been cancelled by
* nfs_lock_and_join_requests, so just remove the head
* request from the inode / page_private pointer and
* release it */
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index 3d1d17256a91..f6a2fd3015e7 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -35,18 +35,9 @@ config NFSD_V2_ACL
bool
depends on NFSD
-config NFSD_V3
- bool "NFS server support for NFS version 3"
- depends on NFSD
- help
- This option enables support in your system's NFS server for
- version 3 of the NFS protocol (RFC 1813).
-
- If unsure, say Y.
-
config NFSD_V3_ACL
bool "NFS server support for the NFSv3 ACL protocol extension"
- depends on NFSD_V3
+ depends on NFSD
select NFSD_V2_ACL
help
Solaris NFS servers support an auxiliary NFSv3 ACL protocol that
@@ -70,7 +61,6 @@ config NFSD_V3_ACL
config NFSD_V4
bool "NFS server support for NFS version 4"
depends on NFSD && PROC_FS
- select NFSD_V3
select FS_POSIX_ACL
select SUNRPC_GSS
select CRYPTO
diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile
index 3f0983e93a99..805c06d5f1b4 100644
--- a/fs/nfsd/Makefile
+++ b/fs/nfsd/Makefile
@@ -12,9 +12,8 @@ nfsd-y += trace.o
nfsd-y += nfssvc.o nfsctl.o nfsproc.o nfsfh.o vfs.o \
export.o auth.o lockd.o nfscache.o nfsxdr.o \
- stats.o filecache.o
+ stats.o filecache.o nfs3proc.o nfs3xdr.o
nfsd-$(CONFIG_NFSD_V2_ACL) += nfs2acl.o
-nfsd-$(CONFIG_NFSD_V3) += nfs3proc.o nfs3xdr.o
nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o
nfsd-$(CONFIG_NFSD_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \
nfs4acl.o nfs4callback.o nfs4recover.o
diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index 8bc807c5fea4..c08882f5867b 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -7,6 +7,7 @@
#include <linux/hash.h>
#include <linux/slab.h>
#include <linux/file.h>
+#include <linux/pagemap.h>
#include <linux/sched.h>
#include <linux/list_lru.h>
#include <linux/fsnotify_backend.h>
@@ -632,7 +633,7 @@ nfsd_file_cache_init(void)
if (!nfsd_filecache_wq)
goto out;
- nfsd_file_hashtbl = kcalloc(NFSD_FILE_HASH_SIZE,
+ nfsd_file_hashtbl = kvcalloc(NFSD_FILE_HASH_SIZE,
sizeof(*nfsd_file_hashtbl), GFP_KERNEL);
if (!nfsd_file_hashtbl) {
pr_err("nfsd: unable to allocate nfsd_file_hashtbl\n");
@@ -700,7 +701,7 @@ out_err:
nfsd_file_slab = NULL;
kmem_cache_destroy(nfsd_file_mark_slab);
nfsd_file_mark_slab = NULL;
- kfree(nfsd_file_hashtbl);
+ kvfree(nfsd_file_hashtbl);
nfsd_file_hashtbl = NULL;
destroy_workqueue(nfsd_filecache_wq);
nfsd_filecache_wq = NULL;
@@ -811,7 +812,7 @@ nfsd_file_cache_shutdown(void)
fsnotify_wait_marks_destroyed();
kmem_cache_destroy(nfsd_file_mark_slab);
nfsd_file_mark_slab = NULL;
- kfree(nfsd_file_hashtbl);
+ kvfree(nfsd_file_hashtbl);
nfsd_file_hashtbl = NULL;
destroy_workqueue(nfsd_filecache_wq);
nfsd_filecache_wq = NULL;
diff --git a/fs/nfsd/flexfilelayout.c b/fs/nfsd/flexfilelayout.c
index 2e2f1d5e9f62..070f90ed09b6 100644
--- a/fs/nfsd/flexfilelayout.c
+++ b/fs/nfsd/flexfilelayout.c
@@ -117,7 +117,7 @@ nfsd4_ff_proc_getdeviceinfo(struct super_block *sb, struct svc_rqst *rqstp,
da->netaddr.addr_len =
snprintf(da->netaddr.addr, FF_ADDR_LEN + 1,
- "%s.%hhu.%hhu", addr, port >> 8, port & 0xff);
+ "%s.%d.%d", addr, port >> 8, port & 0xff);
da->tightly_coupled = false;
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 8ef53f6726ec..936eebd4c56d 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -150,13 +150,17 @@ nfsd3_proc_read(struct svc_rqst *rqstp)
unsigned int len;
int v;
- argp->count = min_t(u32, argp->count, max_blocksize);
-
dprintk("nfsd: READ(3) %s %lu bytes at %Lu\n",
SVCFH_fmt(&argp->fh),
(unsigned long) argp->count,
(unsigned long long) argp->offset);
+ argp->count = min_t(u32, argp->count, max_blocksize);
+ if (argp->offset > (u64)OFFSET_MAX)
+ argp->offset = (u64)OFFSET_MAX;
+ if (argp->offset + argp->count > (u64)OFFSET_MAX)
+ argp->count = (u64)OFFSET_MAX - argp->offset;
+
v = 0;
len = argp->count;
resp->pages = rqstp->rq_next_page;
@@ -199,6 +203,11 @@ nfsd3_proc_write(struct svc_rqst *rqstp)
(unsigned long long) argp->offset,
argp->stable? " stable" : "");
+ resp->status = nfserr_fbig;
+ if (argp->offset > (u64)OFFSET_MAX ||
+ argp->offset + argp->len > (u64)OFFSET_MAX)
+ return rpc_success;
+
fh_copy(&resp->fh, &argp->fh);
resp->committed = argp->stable;
nvecs = svc_fill_write_vector(rqstp, &argp->payload);
@@ -651,15 +660,9 @@ nfsd3_proc_commit(struct svc_rqst *rqstp)
argp->count,
(unsigned long long) argp->offset);
- if (argp->offset > NFS_OFFSET_MAX) {
- resp->status = nfserr_inval;
- goto out;
- }
-
fh_copy(&resp->fh, &argp->fh);
resp->status = nfsd_commit(rqstp, &resp->fh, argp->offset,
argp->count, resp->verf);
-out:
return rpc_success;
}
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 7c45ba4db61b..0293b8d65f10 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -254,7 +254,7 @@ svcxdr_decode_sattr3(struct svc_rqst *rqstp, struct xdr_stream *xdr,
if (xdr_stream_decode_u64(xdr, &newsize) < 0)
return false;
iap->ia_valid |= ATTR_SIZE;
- iap->ia_size = min_t(u64, newsize, NFS_OFFSET_MAX);
+ iap->ia_size = newsize;
}
if (xdr_stream_decode_u32(xdr, &set_it) < 0)
return false;
@@ -1060,7 +1060,7 @@ svcxdr_encode_entry3_common(struct nfsd3_readdirres *resp, const char *name,
return false;
/* cookie */
resp->cookie_offset = dirlist->len;
- if (xdr_stream_encode_u64(xdr, NFS_OFFSET_MAX) < 0)
+ if (xdr_stream_encode_u64(xdr, OFFSET_MAX) < 0)
return false;
return true;
diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
index 6d1b5bb051c5..2c05692a9abf 100644
--- a/fs/nfsd/nfs4layouts.c
+++ b/fs/nfsd/nfs4layouts.c
@@ -422,7 +422,7 @@ nfsd4_insert_layout(struct nfsd4_layoutget *lgp, struct nfs4_layout_stateid *ls)
new = kmem_cache_alloc(nfs4_layout_cache, GFP_KERNEL);
if (!new)
return nfserr_jukebox;
- memcpy(&new->lo_seg, seg, sizeof(lp->lo_seg));
+ memcpy(&new->lo_seg, seg, sizeof(new->lo_seg));
new->lo_state = ls;
spin_lock(&fp->fi_lock);
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index ed1ee25647be..b207c76a873f 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -782,12 +782,16 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
__be32 status;
read->rd_nf = NULL;
- if (read->rd_offset >= OFFSET_MAX)
- return nfserr_inval;
trace_nfsd_read_start(rqstp, &cstate->current_fh,
read->rd_offset, read->rd_length);
+ read->rd_length = min_t(u32, read->rd_length, svc_max_payload(rqstp));
+ if (read->rd_offset > (u64)OFFSET_MAX)
+ read->rd_offset = (u64)OFFSET_MAX;
+ if (read->rd_offset + read->rd_length > (u64)OFFSET_MAX)
+ read->rd_length = (u64)OFFSET_MAX - read->rd_offset;
+
/*
* If we do a zero copy read, then a client will see read data
* that reflects the state of the file *after* performing the
@@ -1018,8 +1022,9 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
unsigned long cnt;
int nvecs;
- if (write->wr_offset >= OFFSET_MAX)
- return nfserr_inval;
+ if (write->wr_offset > (u64)OFFSET_MAX ||
+ write->wr_offset + write->wr_buflen > (u64)OFFSET_MAX)
+ return nfserr_fbig;
cnt = write->wr_buflen;
trace_nfsd_write_start(rqstp, &cstate->current_fh,
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 72900b89cf84..234e852fcdfa 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -4130,8 +4130,10 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
status = nfserr_clid_inuse;
if (client_has_state(old)
&& !same_creds(&unconf->cl_cred,
- &old->cl_cred))
+ &old->cl_cred)) {
+ old = NULL;
goto out;
+ }
status = mark_client_expired_locked(old);
if (status) {
old = NULL;
@@ -4709,6 +4711,14 @@ nfsd_break_deleg_cb(struct file_lock *fl)
return ret;
}
+/**
+ * nfsd_breaker_owns_lease - Check if lease conflict was resolved
+ * @fl: Lock state to check
+ *
+ * Return values:
+ * %true: Lease conflict was resolved
+ * %false: Lease conflict was not resolved.
+ */
static bool nfsd_breaker_owns_lease(struct file_lock *fl)
{
struct nfs4_delegation *dl = fl->fl_owner;
@@ -4716,11 +4726,11 @@ static bool nfsd_breaker_owns_lease(struct file_lock *fl)
struct nfs4_client *clp;
if (!i_am_nfsd())
- return NULL;
+ return false;
rqst = kthread_data(current);
/* Note rq_prog == NFS_ACL_PROGRAM is also possible: */
if (rqst->rq_prog != NFS_PROGRAM || rqst->rq_vers < 4)
- return NULL;
+ return false;
clp = *(rqst->rq_lease_breaker);
return dl->dl_stid.sc_client == clp;
}
@@ -6524,7 +6534,7 @@ nfs4_transform_lock_offset(struct file_lock *lock)
}
static fl_owner_t
-nfsd4_fl_get_owner(fl_owner_t owner)
+nfsd4_lm_get_owner(fl_owner_t owner)
{
struct nfs4_lockowner *lo = (struct nfs4_lockowner *)owner;
@@ -6533,7 +6543,7 @@ nfsd4_fl_get_owner(fl_owner_t owner)
}
static void
-nfsd4_fl_put_owner(fl_owner_t owner)
+nfsd4_lm_put_owner(fl_owner_t owner)
{
struct nfs4_lockowner *lo = (struct nfs4_lockowner *)owner;
@@ -6568,8 +6578,8 @@ nfsd4_lm_notify(struct file_lock *fl)
static const struct lock_manager_operations nfsd_posix_mng_ops = {
.lm_notify = nfsd4_lm_notify,
- .lm_get_owner = nfsd4_fl_get_owner,
- .lm_put_owner = nfsd4_fl_put_owner,
+ .lm_get_owner = nfsd4_lm_get_owner,
+ .lm_put_owner = nfsd4_lm_put_owner,
};
static inline void
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 899de438e529..da92e7d2ab6a 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -2854,6 +2854,9 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp,
err = vfs_getattr(&path, &stat, STATX_BASIC_STATS, AT_STATX_SYNC_AS_STAT);
if (err)
goto out_nfserr;
+ if (!(stat.result_mask & STATX_BTIME))
+ /* underlying FS does not offer btime so we can't share it */
+ bmval1 &= ~FATTR4_WORD1_TIME_CREATE;
if ((bmval0 & (FATTR4_WORD0_FILES_AVAIL | FATTR4_WORD0_FILES_FREE |
FATTR4_WORD0_FILES_TOTAL | FATTR4_WORD0_MAXNAME)) ||
(bmval1 & (FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE |
@@ -3254,6 +3257,13 @@ out_acl:
p = xdr_encode_hyper(p, (s64)stat.mtime.tv_sec);
*p++ = cpu_to_be32(stat.mtime.tv_nsec);
}
+ if (bmval1 & FATTR4_WORD1_TIME_CREATE) {
+ p = xdr_reserve_space(xdr, 12);
+ if (!p)
+ goto out_resource;
+ p = xdr_encode_hyper(p, (s64)stat.btime.tv_sec);
+ *p++ = cpu_to_be32(stat.btime.tv_nsec);
+ }
if (bmval1 & FATTR4_WORD1_MOUNTED_ON_FILEID) {
struct kstat parent_stat;
u64 ino = stat.ino;
@@ -3495,7 +3505,7 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen,
p = xdr_reserve_space(xdr, 3*4 + namlen);
if (!p)
goto fail;
- p = xdr_encode_hyper(p, NFS_OFFSET_MAX); /* offset of next entry */
+ p = xdr_encode_hyper(p, OFFSET_MAX); /* offset of next entry */
p = xdr_encode_array(p, name, namlen); /* name length & name */
nfserr = nfsd4_encode_dirent_fattr(xdr, cd, name, namlen);
@@ -3986,10 +3996,8 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
}
xdr_commit_encode(xdr);
- maxcount = svc_max_payload(resp->rqstp);
- maxcount = min_t(unsigned long, maxcount,
+ maxcount = min_t(unsigned long, read->rd_length,
(xdr->buf->buflen - xdr->buf->len));
- maxcount = min_t(unsigned long, maxcount, read->rd_length);
if (file->f_op->splice_read &&
test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags))
@@ -4826,10 +4834,8 @@ nfsd4_encode_read_plus(struct nfsd4_compoundres *resp, __be32 nfserr,
return nfserr_resource;
xdr_commit_encode(xdr);
- maxcount = svc_max_payload(resp->rqstp);
- maxcount = min_t(unsigned long, maxcount,
+ maxcount = min_t(unsigned long, read->rd_length,
(xdr->buf->buflen - xdr->buf->len));
- maxcount = min_t(unsigned long, maxcount, read->rd_length);
count = maxcount;
eof = read->rd_offset >= i_size_read(file_inode(file));
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index a4a69ab6ab28..0b3f12aa37ff 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -84,12 +84,6 @@ nfsd_hashsize(unsigned int limit)
return roundup_pow_of_two(limit / TARGET_BUCKET_SIZE);
}
-static u32
-nfsd_cache_hash(__be32 xid, struct nfsd_net *nn)
-{
- return hash_32((__force u32)xid, nn->maskbits);
-}
-
static struct svc_cacherep *
nfsd_reply_cache_alloc(struct svc_rqst *rqstp, __wsum csum,
struct nfsd_net *nn)
@@ -241,6 +235,14 @@ lru_put_end(struct nfsd_drc_bucket *b, struct svc_cacherep *rp)
list_move_tail(&rp->c_lru, &b->lru_head);
}
+static noinline struct nfsd_drc_bucket *
+nfsd_cache_bucket_find(__be32 xid, struct nfsd_net *nn)
+{
+ unsigned int hash = hash_32((__force u32)xid, nn->maskbits);
+
+ return &nn->drc_hashtbl[hash];
+}
+
static long prune_bucket(struct nfsd_drc_bucket *b, struct nfsd_net *nn,
unsigned int max)
{
@@ -419,12 +421,10 @@ out:
*/
int nfsd_cache_lookup(struct svc_rqst *rqstp)
{
- struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+ struct nfsd_net *nn;
struct svc_cacherep *rp, *found;
- __be32 xid = rqstp->rq_xid;
__wsum csum;
- u32 hash = nfsd_cache_hash(xid, nn);
- struct nfsd_drc_bucket *b = &nn->drc_hashtbl[hash];
+ struct nfsd_drc_bucket *b;
int type = rqstp->rq_cachetype;
int rtn = RC_DOIT;
@@ -440,17 +440,16 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp)
* Since the common case is a cache miss followed by an insert,
* preallocate an entry.
*/
+ nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
rp = nfsd_reply_cache_alloc(rqstp, csum, nn);
if (!rp)
goto out;
+ b = nfsd_cache_bucket_find(rqstp->rq_xid, nn);
spin_lock(&b->cache_lock);
found = nfsd_cache_insert(b, rp, nn);
- if (found != rp) {
- nfsd_reply_cache_free_locked(NULL, rp, nn);
- rp = found;
+ if (found != rp)
goto found_entry;
- }
nfsd_stats_rc_misses_inc();
rqstp->rq_cacherep = rp;
@@ -468,8 +467,10 @@ out:
found_entry:
/* We found a matching entry which is either in progress or done. */
+ nfsd_reply_cache_free_locked(NULL, rp, nn);
nfsd_stats_rc_hits_inc();
rtn = RC_DROPIT;
+ rp = found;
/* Request being processed */
if (rp->c_state == RC_INPROG)
@@ -528,7 +529,6 @@ void nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp)
struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
struct svc_cacherep *rp = rqstp->rq_cacherep;
struct kvec *resv = &rqstp->rq_res.head[0], *cachv;
- u32 hash;
struct nfsd_drc_bucket *b;
int len;
size_t bufsize = 0;
@@ -536,8 +536,7 @@ void nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp)
if (!rp)
return;
- hash = nfsd_cache_hash(rp->c_key.k_xid, nn);
- b = &nn->drc_hashtbl[hash];
+ b = nfsd_cache_bucket_find(rp->c_key.k_xid, nn);
len = resv->iov_len - ((char*)statp - (char*)resv->iov_base);
len >>= 2;
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 68b020f2002b..16920e4512bd 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -772,13 +772,13 @@ static ssize_t __write_ports_addxprt(char *buf, struct net *net, const struct cr
if (err != 0)
return err;
- err = svc_create_xprt(nn->nfsd_serv, transport, net,
- PF_INET, port, SVC_SOCK_ANONYMOUS, cred);
+ err = svc_xprt_create(nn->nfsd_serv, transport, net,
+ PF_INET, port, SVC_SOCK_ANONYMOUS, cred);
if (err < 0)
goto out_err;
- err = svc_create_xprt(nn->nfsd_serv, transport, net,
- PF_INET6, port, SVC_SOCK_ANONYMOUS, cred);
+ err = svc_xprt_create(nn->nfsd_serv, transport, net,
+ PF_INET6, port, SVC_SOCK_ANONYMOUS, cred);
if (err < 0 && err != -EAFNOSUPPORT)
goto out_close;
@@ -790,7 +790,7 @@ static ssize_t __write_ports_addxprt(char *buf, struct net *net, const struct cr
out_close:
xprt = svc_find_xprt(nn->nfsd_serv, transport, net, PF_INET, port);
if (xprt != NULL) {
- svc_close_xprt(xprt);
+ svc_xprt_close(xprt);
svc_xprt_put(xprt);
}
out_err:
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 3e5008b475ff..4fc1fd639527 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -364,7 +364,7 @@ void nfsd_lockd_shutdown(void);
| FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP | FATTR4_WORD1_RAWDEV \
| FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE | FATTR4_WORD1_SPACE_TOTAL \
| FATTR4_WORD1_SPACE_USED | FATTR4_WORD1_TIME_ACCESS | FATTR4_WORD1_TIME_ACCESS_SET \
- | FATTR4_WORD1_TIME_DELTA | FATTR4_WORD1_TIME_METADATA \
+ | FATTR4_WORD1_TIME_DELTA | FATTR4_WORD1_TIME_METADATA | FATTR4_WORD1_TIME_CREATE \
| FATTR4_WORD1_TIME_MODIFY | FATTR4_WORD1_TIME_MODIFY_SET | FATTR4_WORD1_MOUNTED_ON_FILEID)
#define NFSD4_SUPPORTED_ATTRS_WORD2 0
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 145208bcb9bd..c29baa03dfaf 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -611,8 +611,6 @@ out_negative:
return nfserr_serverfault;
}
-#ifdef CONFIG_NFSD_V3
-
/**
* fh_fill_pre_attrs - Fill in pre-op attributes
* @fhp: file handle to be updated
@@ -673,8 +671,6 @@ void fh_fill_post_attrs(struct svc_fh *fhp)
nfsd4_change_attribute(&fhp->fh_post_attr, inode);
}
-#endif /* CONFIG_NFSD_V3 */
-
/*
* Release a file handle.
*/
diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
index 434930d8a946..fb9d358a267e 100644
--- a/fs/nfsd/nfsfh.h
+++ b/fs/nfsd/nfsfh.h
@@ -90,7 +90,6 @@ typedef struct svc_fh {
* operation
*/
int fh_flags; /* FH flags */
-#ifdef CONFIG_NFSD_V3
bool fh_post_saved; /* post-op attrs saved */
bool fh_pre_saved; /* pre-op attrs saved */
@@ -107,7 +106,6 @@ typedef struct svc_fh {
/* Post-op attributes saved in fh_unlock */
struct kstat fh_post_attr; /* full attrs after operation */
u64 fh_post_change; /* nfsv4 change; see above */
-#endif /* CONFIG_NFSD_V3 */
} svc_fh;
#define NFSD4_FH_FOREIGN (1<<0)
#define SET_FH_FLAG(c, f) ((c)->fh_flags |= (f))
@@ -283,8 +281,6 @@ static inline u32 knfsd_fh_hash(const struct knfsd_fh *fh)
}
#endif
-#ifdef CONFIG_NFSD_V3
-
/**
* fh_clear_pre_post_attrs - Reset pre/post attributes
* @fhp: file handle to be updated
@@ -327,22 +323,6 @@ static inline u64 nfsd4_change_attribute(struct kstat *stat,
extern void fh_fill_pre_attrs(struct svc_fh *fhp);
extern void fh_fill_post_attrs(struct svc_fh *fhp);
-#else /* !CONFIG_NFSD_V3 */
-
-static inline void fh_clear_pre_post_attrs(struct svc_fh *fhp)
-{
-}
-
-static inline void fh_fill_pre_attrs(struct svc_fh *fhp)
-{
-}
-
-static inline void fh_fill_post_attrs(struct svc_fh *fhp)
-{
-}
-
-#endif /* !CONFIG_NFSD_V3 */
-
/*
* Lock a file handle/inode
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 18b8eb43a19b..fcdab8a8a41f 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -230,7 +230,7 @@ nfsd_proc_write(struct svc_rqst *rqstp)
unsigned long cnt = argp->len;
unsigned int nvecs;
- dprintk("nfsd: WRITE %s %d bytes at %d\n",
+ dprintk("nfsd: WRITE %s %u bytes at %d\n",
SVCFH_fmt(&argp->fh),
argp->len, argp->offset);
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index b8c682b62d29..4bb5baa17040 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -117,9 +117,7 @@ static struct svc_stat nfsd_acl_svcstats = {
static const struct svc_version *nfsd_version[] = {
[2] = &nfsd_version2,
-#if defined(CONFIG_NFSD_V3)
[3] = &nfsd_version3,
-#endif
#if defined(CONFIG_NFSD_V4)
[4] = &nfsd_version4,
#endif
@@ -293,13 +291,13 @@ static int nfsd_init_socks(struct net *net, const struct cred *cred)
if (!list_empty(&nn->nfsd_serv->sv_permsocks))
return 0;
- error = svc_create_xprt(nn->nfsd_serv, "udp", net, PF_INET, NFS_PORT,
- SVC_SOCK_DEFAULTS, cred);
+ error = svc_xprt_create(nn->nfsd_serv, "udp", net, PF_INET, NFS_PORT,
+ SVC_SOCK_DEFAULTS, cred);
if (error < 0)
return error;
- error = svc_create_xprt(nn->nfsd_serv, "tcp", net, PF_INET, NFS_PORT,
- SVC_SOCK_DEFAULTS, cred);
+ error = svc_xprt_create(nn->nfsd_serv, "tcp", net, PF_INET, NFS_PORT,
+ SVC_SOCK_DEFAULTS, cred);
if (error < 0)
return error;
@@ -612,13 +610,6 @@ static int nfsd_get_default_max_blksize(void)
return ret;
}
-static const struct svc_serv_ops nfsd_thread_sv_ops = {
- .svo_shutdown = nfsd_last_thread,
- .svo_function = nfsd,
- .svo_enqueue_xprt = svc_xprt_do_enqueue,
- .svo_module = THIS_MODULE,
-};
-
void nfsd_shutdown_threads(struct net *net)
{
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
@@ -657,8 +648,7 @@ int nfsd_create_serv(struct net *net)
if (nfsd_max_blksize == 0)
nfsd_max_blksize = nfsd_get_default_max_blksize();
nfsd_reset_versions(nn);
- serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize,
- &nfsd_thread_sv_ops);
+ serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize, nfsd);
if (serv == NULL)
return -ENOMEM;
@@ -724,7 +714,8 @@ void nfsd_put(struct net *net)
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
if (kref_put(&nn->nfsd_serv->sv_refcnt, nfsd_noop)) {
- svc_shutdown_net(nn->nfsd_serv, net);
+ svc_xprt_destroy_all(nn->nfsd_serv, net);
+ nfsd_last_thread(nn->nfsd_serv, net);
svc_destroy(&nn->nfsd_serv->sv_refcnt);
spin_lock(&nfsd_notifier_lock);
nn->nfsd_serv = NULL;
@@ -1019,8 +1010,6 @@ out:
msleep(20);
}
- /* Release module */
- module_put_and_kthread_exit(0);
return 0;
}
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index c4cf56327843..242fa123e0e9 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -13,22 +13,6 @@
#include "export.h"
#include "nfsfh.h"
-#define NFSD_TRACE_PROC_ARG_FIELDS \
- __field(unsigned int, netns_ino) \
- __field(u32, xid) \
- __array(unsigned char, server, sizeof(struct sockaddr_in6)) \
- __array(unsigned char, client, sizeof(struct sockaddr_in6))
-
-#define NFSD_TRACE_PROC_ARG_ASSIGNMENTS \
- do { \
- __entry->netns_ino = SVC_NET(rqstp)->ns.inum; \
- __entry->xid = be32_to_cpu(rqstp->rq_xid); \
- memcpy(__entry->server, &rqstp->rq_xprt->xpt_local, \
- rqstp->rq_xprt->xpt_locallen); \
- memcpy(__entry->client, &rqstp->rq_xprt->xpt_remote, \
- rqstp->rq_xprt->xpt_remotelen); \
- } while (0);
-
#define NFSD_TRACE_PROC_RES_FIELDS \
__field(unsigned int, netns_ino) \
__field(u32, xid) \
@@ -53,16 +37,22 @@ DECLARE_EVENT_CLASS(nfsd_xdr_err_class,
),
TP_ARGS(rqstp),
TP_STRUCT__entry(
- NFSD_TRACE_PROC_ARG_FIELDS
-
+ __field(unsigned int, netns_ino)
+ __field(u32, xid)
__field(u32, vers)
__field(u32, proc)
+ __sockaddr(server, rqstp->rq_xprt->xpt_locallen)
+ __sockaddr(client, rqstp->rq_xprt->xpt_remotelen)
),
TP_fast_assign(
- NFSD_TRACE_PROC_ARG_ASSIGNMENTS
+ const struct svc_xprt *xprt = rqstp->rq_xprt;
+ __entry->netns_ino = xprt->xpt_net->ns.inum;
+ __entry->xid = be32_to_cpu(rqstp->rq_xid);
__entry->vers = rqstp->rq_vers;
__entry->proc = rqstp->rq_proc;
+ __assign_sockaddr(server, &xprt->xpt_local, xprt->xpt_locallen);
+ __assign_sockaddr(client, &xprt->xpt_remote, xprt->xpt_remotelen);
),
TP_printk("xid=0x%08x vers=%u proc=%u",
__entry->xid, __entry->vers, __entry->proc
@@ -306,14 +296,14 @@ TRACE_EVENT(nfsd_export_update,
DECLARE_EVENT_CLASS(nfsd_io_class,
TP_PROTO(struct svc_rqst *rqstp,
struct svc_fh *fhp,
- loff_t offset,
- unsigned long len),
+ u64 offset,
+ u32 len),
TP_ARGS(rqstp, fhp, offset, len),
TP_STRUCT__entry(
__field(u32, xid)
__field(u32, fh_hash)
- __field(loff_t, offset)
- __field(unsigned long, len)
+ __field(u64, offset)
+ __field(u32, len)
),
TP_fast_assign(
__entry->xid = be32_to_cpu(rqstp->rq_xid);
@@ -321,7 +311,7 @@ DECLARE_EVENT_CLASS(nfsd_io_class,
__entry->offset = offset;
__entry->len = len;
),
- TP_printk("xid=0x%08x fh_hash=0x%08x offset=%lld len=%lu",
+ TP_printk("xid=0x%08x fh_hash=0x%08x offset=%llu len=%u",
__entry->xid, __entry->fh_hash,
__entry->offset, __entry->len)
)
@@ -330,8 +320,8 @@ DECLARE_EVENT_CLASS(nfsd_io_class,
DEFINE_EVENT(nfsd_io_class, nfsd_##name, \
TP_PROTO(struct svc_rqst *rqstp, \
struct svc_fh *fhp, \
- loff_t offset, \
- unsigned long len), \
+ u64 offset, \
+ u32 len), \
TP_ARGS(rqstp, fhp, offset, len))
DEFINE_NFSD_IO_EVENT(read_start);
@@ -613,20 +603,21 @@ TRACE_EVENT(nfsd_clid_cred_mismatch,
__field(u32, cl_id)
__field(unsigned long, cl_flavor)
__field(unsigned long, new_flavor)
- __array(unsigned char, addr, sizeof(struct sockaddr_in6))
+ __sockaddr(addr, rqstp->rq_xprt->xpt_remotelen)
),
TP_fast_assign(
__entry->cl_boot = clp->cl_clientid.cl_boot;
__entry->cl_id = clp->cl_clientid.cl_id;
__entry->cl_flavor = clp->cl_cred.cr_flavor;
__entry->new_flavor = rqstp->rq_cred.cr_flavor;
- memcpy(__entry->addr, &rqstp->rq_xprt->xpt_remote,
- sizeof(struct sockaddr_in6));
+ __assign_sockaddr(addr, &rqstp->rq_xprt->xpt_remote,
+ rqstp->rq_xprt->xpt_remotelen);
),
TP_printk("client %08x:%08x flavor=%s, conflict=%s from addr=%pISpc",
__entry->cl_boot, __entry->cl_id,
show_nfsd_authflavor(__entry->cl_flavor),
- show_nfsd_authflavor(__entry->new_flavor), __entry->addr
+ show_nfsd_authflavor(__entry->new_flavor),
+ __get_sockaddr(addr)
)
)
@@ -642,7 +633,7 @@ TRACE_EVENT(nfsd_clid_verf_mismatch,
__field(u32, cl_id)
__array(unsigned char, cl_verifier, NFS4_VERIFIER_SIZE)
__array(unsigned char, new_verifier, NFS4_VERIFIER_SIZE)
- __array(unsigned char, addr, sizeof(struct sockaddr_in6))
+ __sockaddr(addr, rqstp->rq_xprt->xpt_remotelen)
),
TP_fast_assign(
__entry->cl_boot = clp->cl_clientid.cl_boot;
@@ -651,14 +642,14 @@ TRACE_EVENT(nfsd_clid_verf_mismatch,
NFS4_VERIFIER_SIZE);
memcpy(__entry->new_verifier, (void *)verf,
NFS4_VERIFIER_SIZE);
- memcpy(__entry->addr, &rqstp->rq_xprt->xpt_remote,
- sizeof(struct sockaddr_in6));
+ __assign_sockaddr(addr, &rqstp->rq_xprt->xpt_remote,
+ rqstp->rq_xprt->xpt_remotelen);
),
TP_printk("client %08x:%08x verf=0x%s, updated=0x%s from addr=%pISpc",
__entry->cl_boot, __entry->cl_id,
__print_hex_str(__entry->cl_verifier, NFS4_VERIFIER_SIZE),
__print_hex_str(__entry->new_verifier, NFS4_VERIFIER_SIZE),
- __entry->addr
+ __get_sockaddr(addr)
)
);
@@ -908,18 +899,17 @@ TRACE_EVENT(nfsd_cb_args,
__field(u32, cl_id)
__field(u32, prog)
__field(u32, ident)
- __array(unsigned char, addr, sizeof(struct sockaddr_in6))
+ __sockaddr(addr, conn->cb_addrlen)
),
TP_fast_assign(
__entry->cl_boot = clp->cl_clientid.cl_boot;
__entry->cl_id = clp->cl_clientid.cl_id;
__entry->prog = conn->cb_prog;
__entry->ident = conn->cb_ident;
- memcpy(__entry->addr, &conn->cb_addr,
- sizeof(struct sockaddr_in6));
+ __assign_sockaddr(addr, &conn->cb_addr, conn->cb_addrlen);
),
TP_printk("addr=%pISpc client %08x:%08x prog=%u ident=%u",
- __entry->addr, __entry->cl_boot, __entry->cl_id,
+ __get_sockaddr(addr), __entry->cl_boot, __entry->cl_id,
__entry->prog, __entry->ident)
);
@@ -951,17 +941,17 @@ DECLARE_EVENT_CLASS(nfsd_cb_class,
__field(unsigned long, state)
__field(u32, cl_boot)
__field(u32, cl_id)
- __array(unsigned char, addr, sizeof(struct sockaddr_in6))
+ __sockaddr(addr, clp->cl_cb_conn.cb_addrlen)
),
TP_fast_assign(
__entry->state = clp->cl_cb_state;
__entry->cl_boot = clp->cl_clientid.cl_boot;
__entry->cl_id = clp->cl_clientid.cl_id;
- memcpy(__entry->addr, &clp->cl_cb_conn.cb_addr,
- sizeof(struct sockaddr_in6));
+ __assign_sockaddr(addr, &clp->cl_cb_conn.cb_addr,
+ clp->cl_cb_conn.cb_addrlen)
),
TP_printk("addr=%pISpc client %08x:%08x state=%s",
- __entry->addr, __entry->cl_boot, __entry->cl_id,
+ __get_sockaddr(addr), __entry->cl_boot, __entry->cl_id,
show_cb_state(__entry->state))
);
@@ -1001,7 +991,7 @@ TRACE_EVENT(nfsd_cb_setup,
__field(u32, cl_boot)
__field(u32, cl_id)
__field(unsigned long, authflavor)
- __array(unsigned char, addr, sizeof(struct sockaddr_in6))
+ __sockaddr(addr, clp->cl_cb_conn.cb_addrlen)
__array(unsigned char, netid, 8)
),
TP_fast_assign(
@@ -1009,11 +999,11 @@ TRACE_EVENT(nfsd_cb_setup,
__entry->cl_id = clp->cl_clientid.cl_id;
strlcpy(__entry->netid, netid, sizeof(__entry->netid));
__entry->authflavor = authflavor;
- memcpy(__entry->addr, &clp->cl_cb_conn.cb_addr,
- sizeof(struct sockaddr_in6));
+ __assign_sockaddr(addr, &clp->cl_cb_conn.cb_addr,
+ clp->cl_cb_conn.cb_addrlen)
),
TP_printk("addr=%pISpc client %08x:%08x proto=%s flavor=%s",
- __entry->addr, __entry->cl_boot, __entry->cl_id,
+ __get_sockaddr(addr), __entry->cl_boot, __entry->cl_id,
__entry->netid, show_nfsd_authflavor(__entry->authflavor))
);
@@ -1027,30 +1017,32 @@ TRACE_EVENT(nfsd_cb_setup_err,
__field(long, error)
__field(u32, cl_boot)
__field(u32, cl_id)
- __array(unsigned char, addr, sizeof(struct sockaddr_in6))
+ __sockaddr(addr, clp->cl_cb_conn.cb_addrlen)
),
TP_fast_assign(
__entry->error = error;
__entry->cl_boot = clp->cl_clientid.cl_boot;
__entry->cl_id = clp->cl_clientid.cl_id;
- memcpy(__entry->addr, &clp->cl_cb_conn.cb_addr,
- sizeof(struct sockaddr_in6));
+ __assign_sockaddr(addr, &clp->cl_cb_conn.cb_addr,
+ clp->cl_cb_conn.cb_addrlen)
),
TP_printk("addr=%pISpc client %08x:%08x error=%ld",
- __entry->addr, __entry->cl_boot, __entry->cl_id, __entry->error)
+ __get_sockaddr(addr), __entry->cl_boot, __entry->cl_id,
+ __entry->error)
);
-TRACE_EVENT(nfsd_cb_recall,
+TRACE_EVENT_CONDITION(nfsd_cb_recall,
TP_PROTO(
const struct nfs4_stid *stid
),
TP_ARGS(stid),
+ TP_CONDITION(stid->sc_client),
TP_STRUCT__entry(
__field(u32, cl_boot)
__field(u32, cl_id)
__field(u32, si_id)
__field(u32, si_generation)
- __array(unsigned char, addr, sizeof(struct sockaddr_in6))
+ __sockaddr(addr, stid->sc_client->cl_cb_conn.cb_addrlen)
),
TP_fast_assign(
const stateid_t *stp = &stid->sc_stateid;
@@ -1060,14 +1052,11 @@ TRACE_EVENT(nfsd_cb_recall,
__entry->cl_id = stp->si_opaque.so_clid.cl_id;
__entry->si_id = stp->si_opaque.so_id;
__entry->si_generation = stp->si_generation;
- if (clp)
- memcpy(__entry->addr, &clp->cl_cb_conn.cb_addr,
- sizeof(struct sockaddr_in6));
- else
- memset(__entry->addr, 0, sizeof(struct sockaddr_in6));
+ __assign_sockaddr(addr, &clp->cl_cb_conn.cb_addr,
+ clp->cl_cb_conn.cb_addrlen)
),
TP_printk("addr=%pISpc client %08x:%08x stateid %08x:%08x",
- __entry->addr, __entry->cl_boot, __entry->cl_id,
+ __get_sockaddr(addr), __entry->cl_boot, __entry->cl_id,
__entry->si_id, __entry->si_generation)
);
@@ -1081,7 +1070,7 @@ TRACE_EVENT(nfsd_cb_notify_lock,
__field(u32, cl_boot)
__field(u32, cl_id)
__field(u32, fh_hash)
- __array(unsigned char, addr, sizeof(struct sockaddr_in6))
+ __sockaddr(addr, lo->lo_owner.so_client->cl_cb_conn.cb_addrlen)
),
TP_fast_assign(
const struct nfs4_client *clp = lo->lo_owner.so_client;
@@ -1089,11 +1078,11 @@ TRACE_EVENT(nfsd_cb_notify_lock,
__entry->cl_boot = clp->cl_clientid.cl_boot;
__entry->cl_id = clp->cl_clientid.cl_id;
__entry->fh_hash = knfsd_fh_hash(&nbl->nbl_fh);
- memcpy(__entry->addr, &clp->cl_cb_conn.cb_addr,
- sizeof(struct sockaddr_in6));
+ __assign_sockaddr(addr, &clp->cl_cb_conn.cb_addr,
+ clp->cl_cb_conn.cb_addrlen)
),
TP_printk("addr=%pISpc client %08x:%08x fh_hash=0x%08x",
- __entry->addr, __entry->cl_boot, __entry->cl_id,
+ __get_sockaddr(addr), __entry->cl_boot, __entry->cl_id,
__entry->fh_hash)
);
@@ -1114,7 +1103,7 @@ TRACE_EVENT(nfsd_cb_offload,
__field(u32, fh_hash)
__field(int, status)
__field(u64, count)
- __array(unsigned char, addr, sizeof(struct sockaddr_in6))
+ __sockaddr(addr, clp->cl_cb_conn.cb_addrlen)
),
TP_fast_assign(
__entry->cl_boot = stp->si_opaque.so_clid.cl_boot;
@@ -1124,11 +1113,11 @@ TRACE_EVENT(nfsd_cb_offload,
__entry->fh_hash = knfsd_fh_hash(fh);
__entry->status = be32_to_cpu(status);
__entry->count = count;
- memcpy(__entry->addr, &clp->cl_cb_conn.cb_addr,
- sizeof(struct sockaddr_in6));
+ __assign_sockaddr(addr, &clp->cl_cb_conn.cb_addr,
+ clp->cl_cb_conn.cb_addrlen)
),
TP_printk("addr=%pISpc client %08x:%08x stateid %08x:%08x fh_hash=0x%08x count=%llu status=%d",
- __entry->addr, __entry->cl_boot, __entry->cl_id,
+ __get_sockaddr(addr), __entry->cl_boot, __entry->cl_id,
__entry->si_id, __entry->si_generation,
__entry->fh_hash, __entry->count, __entry->status)
);
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 99c2b9dfbb10..c22ad0532e8e 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -26,15 +26,14 @@
#include <linux/xattr.h>
#include <linux/jhash.h>
#include <linux/ima.h>
+#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/exportfs.h>
#include <linux/writeback.h>
#include <linux/security.h>
-#ifdef CONFIG_NFSD_V3
#include "xdr3.h"
-#endif /* CONFIG_NFSD_V3 */
#ifdef CONFIG_NFSD_V4
#include "../internal.h"
@@ -435,6 +434,10 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
.ia_size = iap->ia_size,
};
+ host_err = -EFBIG;
+ if (iap->ia_size < 0)
+ goto out_unlock;
+
host_err = notify_change(&init_user_ns, dentry, &size_attr, NULL);
if (host_err)
goto out_unlock;
@@ -604,7 +607,6 @@ __be32 nfsd4_vfs_fallocate(struct svc_rqst *rqstp, struct svc_fh *fhp,
}
#endif /* defined(CONFIG_NFSD_V4) */
-#ifdef CONFIG_NFSD_V3
/*
* Check server access rights to a file system object
*/
@@ -716,7 +718,6 @@ nfsd_access(struct svc_rqst *rqstp, struct svc_fh *fhp, u32 *access, u32 *suppor
out:
return error;
}
-#endif /* CONFIG_NFSD_V3 */
int nfsd_open_break_lease(struct inode *inode, int access)
{
@@ -1109,43 +1110,61 @@ out:
return err;
}
-#ifdef CONFIG_NFSD_V3
-/*
- * Commit all pending writes to stable storage.
+/**
+ * nfsd_commit - Commit pending writes to stable storage
+ * @rqstp: RPC request being processed
+ * @fhp: NFS filehandle
+ * @offset: raw offset from beginning of file
+ * @count: raw count of bytes to sync
+ * @verf: filled in with the server's current write verifier
*
- * Note: we only guarantee that data that lies within the range specified
- * by the 'offset' and 'count' parameters will be synced.
+ * Note: we guarantee that data that lies within the range specified
+ * by the 'offset' and 'count' parameters will be synced. The server
+ * is permitted to sync data that lies outside this range at the
+ * same time.
*
* Unfortunately we cannot lock the file to make sure we return full WCC
* data to the client, as locking happens lower down in the filesystem.
+ *
+ * Return values:
+ * An nfsstat value in network byte order.
*/
__be32
-nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp,
- loff_t offset, unsigned long count, __be32 *verf)
+nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, u64 offset,
+ u32 count, __be32 *verf)
{
+ u64 maxbytes;
+ loff_t start, end;
struct nfsd_net *nn;
struct nfsd_file *nf;
- loff_t end = LLONG_MAX;
- __be32 err = nfserr_inval;
-
- if (offset < 0)
- goto out;
- if (count != 0) {
- end = offset + (loff_t)count - 1;
- if (end < offset)
- goto out;
- }
+ __be32 err;
err = nfsd_file_acquire(rqstp, fhp,
NFSD_MAY_WRITE|NFSD_MAY_NOT_BREAK_LEASE, &nf);
if (err)
goto out;
+
+ /*
+ * Convert the client-provided (offset, count) range to a
+ * (start, end) range. If the client-provided range falls
+ * outside the maximum file size of the underlying FS,
+ * clamp the sync range appropriately.
+ */
+ start = 0;
+ end = LLONG_MAX;
+ maxbytes = (u64)fhp->fh_dentry->d_sb->s_maxbytes;
+ if (offset < maxbytes) {
+ start = offset;
+ if (count && (offset + count - 1 < maxbytes))
+ end = offset + count - 1;
+ }
+
nn = net_generic(nf->nf_net, nfsd_net_id);
if (EX_ISSYNC(fhp->fh_export)) {
errseq_t since = READ_ONCE(nf->nf_file->f_wb_err);
int err2;
- err2 = vfs_fsync_range(nf->nf_file, offset, end, 0);
+ err2 = vfs_fsync_range(nf->nf_file, start, end, 0);
switch (err2) {
case 0:
nfsd_copy_write_verifier(verf, nn);
@@ -1167,7 +1186,6 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp,
out:
return err;
}
-#endif /* CONFIG_NFSD_V3 */
static __be32
nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *resfhp,
@@ -1357,8 +1375,6 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
rdev, resfhp);
}
-#ifdef CONFIG_NFSD_V3
-
/*
* NFSv3 and NFSv4 version of nfsd_create
*/
@@ -1524,7 +1540,6 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
err = nfserrno(host_err);
goto out;
}
-#endif /* CONFIG_NFSD_V3 */
/*
* Read a symlink. On entry, *lenp must contain the maximum path length that
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index 9f56dcb22ff7..ccb87b2864f6 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -68,15 +68,13 @@ __be32 nfsd_create_locked(struct svc_rqst *, struct svc_fh *,
__be32 nfsd_create(struct svc_rqst *, struct svc_fh *,
char *name, int len, struct iattr *attrs,
int type, dev_t rdev, struct svc_fh *res);
-#ifdef CONFIG_NFSD_V3
__be32 nfsd_access(struct svc_rqst *, struct svc_fh *, u32 *, u32 *);
__be32 do_nfsd_create(struct svc_rqst *, struct svc_fh *,
char *name, int len, struct iattr *attrs,
struct svc_fh *res, int createmode,
u32 *verifier, bool *truncp, bool *created);
-__be32 nfsd_commit(struct svc_rqst *, struct svc_fh *,
- loff_t, unsigned long, __be32 *verf);
-#endif /* CONFIG_NFSD_V3 */
+__be32 nfsd_commit(struct svc_rqst *rqst, struct svc_fh *fhp,
+ u64 offset, u32 count, __be32 *verf);
#ifdef CONFIG_NFSD_V4
__be32 nfsd_getxattr(struct svc_rqst *rqstp, struct svc_fh *fhp,
char *name, void **bufp, int *lenp);
diff --git a/fs/nfsd/xdr.h b/fs/nfsd/xdr.h
index 528fb299430e..852f71580bd0 100644
--- a/fs/nfsd/xdr.h
+++ b/fs/nfsd/xdr.h
@@ -32,7 +32,7 @@ struct nfsd_readargs {
struct nfsd_writeargs {
svc_fh fh;
__u32 offset;
- int len;
+ __u32 len;
struct xdr_buf payload;
};
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index e3d807d5b83a..476a4a649f38 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -199,23 +199,22 @@ static int nilfs_writepage(struct page *page, struct writeback_control *wbc)
return 0;
}
-static int nilfs_set_page_dirty(struct page *page)
+static bool nilfs_dirty_folio(struct address_space *mapping,
+ struct folio *folio)
{
- struct inode *inode = page->mapping->host;
- int ret = __set_page_dirty_nobuffers(page);
+ struct inode *inode = mapping->host;
+ struct buffer_head *head;
+ unsigned int nr_dirty = 0;
+ bool ret = filemap_dirty_folio(mapping, folio);
- if (page_has_buffers(page)) {
- unsigned int nr_dirty = 0;
- struct buffer_head *bh, *head;
+ /*
+ * The page may not be locked, eg if called from try_to_unmap_one()
+ */
+ spin_lock(&mapping->private_lock);
+ head = folio_buffers(folio);
+ if (head) {
+ struct buffer_head *bh = head;
- /*
- * This page is locked by callers, and no other thread
- * concurrently marks its buffers dirty since they are
- * only dirtied through routines in fs/buffer.c in
- * which call sites of mark_buffer_dirty are protected
- * by page lock.
- */
- bh = head = page_buffers(page);
do {
/* Do not mark hole blocks dirty */
if (buffer_dirty(bh) || !buffer_mapped(bh))
@@ -224,14 +223,13 @@ static int nilfs_set_page_dirty(struct page *page)
set_buffer_dirty(bh);
nr_dirty++;
} while (bh = bh->b_this_page, bh != head);
-
- if (nr_dirty)
- nilfs_set_file_dirty(inode, nr_dirty);
} else if (ret) {
- unsigned int nr_dirty = 1 << (PAGE_SHIFT - inode->i_blkbits);
+ nr_dirty = 1 << (folio_shift(folio) - inode->i_blkbits);
+ }
+ spin_unlock(&mapping->private_lock);
+ if (nr_dirty)
nilfs_set_file_dirty(inode, nr_dirty);
- }
return ret;
}
@@ -299,12 +297,12 @@ const struct address_space_operations nilfs_aops = {
.writepage = nilfs_writepage,
.readpage = nilfs_readpage,
.writepages = nilfs_writepages,
- .set_page_dirty = nilfs_set_page_dirty,
+ .dirty_folio = nilfs_dirty_folio,
.readahead = nilfs_readahead,
.write_begin = nilfs_write_begin,
.write_end = nilfs_write_end,
/* .releasepage = nilfs_releasepage, */
- .invalidatepage = block_invalidatepage,
+ .invalidate_folio = block_invalidate_folio,
.direct_IO = nilfs_direct_IO,
.is_partially_uptodate = block_is_partially_uptodate,
};
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index 4b3d33cf0041..78db33decd72 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -434,7 +434,8 @@ nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc)
static const struct address_space_operations def_mdt_aops = {
- .set_page_dirty = __set_page_dirty_buffers,
+ .dirty_folio = block_dirty_folio,
+ .invalidate_folio = block_invalidate_folio,
.writepage = nilfs_mdt_write_page,
};
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index a3bb0c856ec8..1362ccb64ec7 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -340,18 +340,6 @@ static int nilfs_segbuf_submit_bio(struct nilfs_segment_buffer *segbuf,
struct nilfs_write_info *wi)
{
struct bio *bio = wi->bio;
- int err;
-
- if (segbuf->sb_nbio > 0 &&
- bdi_write_congested(segbuf->sb_super->s_bdi)) {
- wait_for_completion(&segbuf->sb_bio_event);
- segbuf->sb_nbio--;
- if (unlikely(atomic_read(&segbuf->sb_err))) {
- bio_put(bio);
- err = -EIO;
- goto failed;
- }
- }
bio->bi_end_io = nilfs_end_bio_write;
bio->bi_private = segbuf;
@@ -363,10 +351,6 @@ static int nilfs_segbuf_submit_bio(struct nilfs_segment_buffer *segbuf,
wi->nr_vecs = min(wi->max_pages, wi->rest_blocks);
wi->start = wi->end;
return 0;
-
- failed:
- wi->bio = NULL;
- return err;
}
static void nilfs_segbuf_prepare_write(struct nilfs_segment_buffer *segbuf,
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 63e5fa74016c..3e05c98631ec 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -151,7 +151,7 @@ struct inode *nilfs_alloc_inode(struct super_block *sb)
{
struct nilfs_inode_info *ii;
- ii = kmem_cache_alloc(nilfs_inode_cachep, GFP_NOFS);
+ ii = alloc_inode_sb(sb, nilfs_inode_cachep, GFP_NOFS);
if (!ii)
return NULL;
ii->i_bh = NULL;
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 1026f67b1d1e..9b32b76a9c30 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -701,9 +701,6 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
if (fanotify_is_perm_event(event->mask))
FANOTIFY_PERM(event)->fd = fd;
- if (f)
- fd_install(fd, f);
-
if (info_mode) {
ret = copy_info_records_to_user(event, info, info_mode, pidfd,
buf, count);
@@ -711,6 +708,9 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
goto out_close_fd;
}
+ if (f)
+ fd_install(fd, f);
+
return metadata.event_len;
out_close_fd:
@@ -1003,17 +1003,18 @@ static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark,
__u32 mask, unsigned int flags,
__u32 umask, int *destroy)
{
- __u32 oldmask = 0;
+ __u32 oldmask, newmask;
/* umask bits cannot be removed by user */
mask &= ~umask;
spin_lock(&fsn_mark->lock);
+ oldmask = fsnotify_calc_mask(fsn_mark);
if (!(flags & FAN_MARK_IGNORED_MASK)) {
- oldmask = fsn_mark->mask;
fsn_mark->mask &= ~mask;
} else {
fsn_mark->ignored_mask &= ~mask;
}
+ newmask = fsnotify_calc_mask(fsn_mark);
/*
* We need to keep the mark around even if remaining mask cannot
* result in any events (e.g. mask == FAN_ONDIR) to support incremenal
@@ -1023,7 +1024,7 @@ static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark,
*destroy = !((fsn_mark->mask | fsn_mark->ignored_mask) & ~umask);
spin_unlock(&fsn_mark->lock);
- return mask & oldmask;
+ return oldmask & ~newmask;
}
static int fanotify_remove_mark(struct fsnotify_group *group,
@@ -1080,24 +1081,42 @@ static int fanotify_remove_inode_mark(struct fsnotify_group *group,
flags, umask);
}
+static void fanotify_mark_add_ignored_mask(struct fsnotify_mark *fsn_mark,
+ __u32 mask, unsigned int flags,
+ __u32 *removed)
+{
+ fsn_mark->ignored_mask |= mask;
+
+ /*
+ * Setting FAN_MARK_IGNORED_SURV_MODIFY for the first time may lead to
+ * the removal of the FS_MODIFY bit in calculated mask if it was set
+ * because of an ignored mask that is now going to survive FS_MODIFY.
+ */
+ if ((flags & FAN_MARK_IGNORED_SURV_MODIFY) &&
+ !(fsn_mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)) {
+ fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY;
+ if (!(fsn_mark->mask & FS_MODIFY))
+ *removed = FS_MODIFY;
+ }
+}
+
static __u32 fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark,
- __u32 mask,
- unsigned int flags)
+ __u32 mask, unsigned int flags,
+ __u32 *removed)
{
- __u32 oldmask = -1;
+ __u32 oldmask, newmask;
spin_lock(&fsn_mark->lock);
+ oldmask = fsnotify_calc_mask(fsn_mark);
if (!(flags & FAN_MARK_IGNORED_MASK)) {
- oldmask = fsn_mark->mask;
fsn_mark->mask |= mask;
} else {
- fsn_mark->ignored_mask |= mask;
- if (flags & FAN_MARK_IGNORED_SURV_MODIFY)
- fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY;
+ fanotify_mark_add_ignored_mask(fsn_mark, mask, flags, removed);
}
+ newmask = fsnotify_calc_mask(fsn_mark);
spin_unlock(&fsn_mark->lock);
- return mask & ~oldmask;
+ return newmask & ~oldmask;
}
static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
@@ -1155,7 +1174,7 @@ static int fanotify_add_mark(struct fsnotify_group *group,
__kernel_fsid_t *fsid)
{
struct fsnotify_mark *fsn_mark;
- __u32 added;
+ __u32 added, removed = 0;
int ret = 0;
mutex_lock(&group->mark_mutex);
@@ -1178,8 +1197,8 @@ static int fanotify_add_mark(struct fsnotify_group *group,
goto out;
}
- added = fanotify_mark_add_to_mask(fsn_mark, mask, flags);
- if (added & ~fsnotify_conn_mask(fsn_mark->connector))
+ added = fanotify_mark_add_to_mask(fsn_mark, mask, flags, &removed);
+ if (removed || (added & ~fsnotify_conn_mask(fsn_mark->connector)))
fsnotify_recalc_mask(fsn_mark->connector);
out:
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index ab81a0776ece..70a8516b78bc 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -70,8 +70,7 @@ static void fsnotify_unmount_inodes(struct super_block *sb)
spin_unlock(&inode->i_lock);
spin_unlock(&sb->s_inode_list_lock);
- if (iput_inode)
- iput(iput_inode);
+ iput(iput_inode);
/* for each watch, send FS_UNMOUNT and then remove it */
fsnotify_inode(inode, FS_UNMOUNT);
@@ -85,8 +84,7 @@ static void fsnotify_unmount_inodes(struct super_block *sb)
}
spin_unlock(&sb->s_inode_list_lock);
- if (iput_inode)
- iput(iput_inode);
+ iput(iput_inode);
}
void fsnotify_sb_delete(struct super_block *sb)
@@ -531,11 +529,13 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir,
/*
- * if this is a modify event we may need to clear the ignored masks
- * otherwise return if none of the marks care about this type of event.
+ * If this is a modify event we may need to clear some ignored masks.
+ * In that case, the object with ignored masks will have the FS_MODIFY
+ * event in its mask.
+ * Otherwise, return if none of the marks care about this type of event.
*/
test_mask = (mask & ALL_FSNOTIFY_EVENTS);
- if (!(mask & FS_MODIFY) && !(test_mask & marks_mask))
+ if (!(test_mask & marks_mask))
return 0;
iter_info.srcu_idx = srcu_read_lock(&fsnotify_mark_srcu);
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index 9007d6affff3..4853184f7dde 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -127,7 +127,7 @@ static void __fsnotify_recalc_mask(struct fsnotify_mark_connector *conn)
return;
hlist_for_each_entry(mark, &conn->list, obj_list) {
if (mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)
- new_mask |= mark->mask;
+ new_mask |= fsnotify_calc_mask(mark);
}
*fsnotify_conn_mask_p(conn) = new_mask;
}
@@ -692,7 +692,7 @@ int fsnotify_add_mark_locked(struct fsnotify_mark *mark,
if (ret)
goto err;
- if (mark->mask)
+ if (mark->mask || mark->ignored_mask)
fsnotify_recalc_mask(mark->connector);
return ret;
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index bb0a43860ad2..d154dcfe06af 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -593,12 +593,12 @@ static int ntfs_write_block(struct page *page, struct writeback_control *wbc)
iblock = initialized_size >> blocksize_bits;
/*
- * Be very careful. We have no exclusion from __set_page_dirty_buffers
+ * Be very careful. We have no exclusion from block_dirty_folio
* here, and the (potentially unmapped) buffers may become dirty at
* any time. If a buffer becomes dirty here after we've inspected it
* then we just miss that fact, and the page stays dirty.
*
- * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
+ * Buffers outside i_size may be dirtied by block_dirty_folio;
* handle that here by just cleaning them.
*/
@@ -653,7 +653,7 @@ static int ntfs_write_block(struct page *page, struct writeback_control *wbc)
// Update initialized size in the attribute and
// in the inode.
// Again, for each page do:
- // __set_page_dirty_buffers();
+ // block_dirty_folio();
// put_page()
// We don't need to wait on the writes.
// Update iblock.
@@ -1350,12 +1350,13 @@ retry_writepage:
/* Is the page fully outside i_size? (truncate in progress) */
if (unlikely(page->index >= (i_size + PAGE_SIZE - 1) >>
PAGE_SHIFT)) {
+ struct folio *folio = page_folio(page);
/*
* The page may have dirty, unmapped buffers. Make them
* freeable here, so the page does not leak.
*/
- block_invalidatepage(page, 0, PAGE_SIZE);
- unlock_page(page);
+ block_invalidate_folio(folio, 0, folio_size(folio));
+ folio_unlock(folio);
ntfs_debug("Write outside i_size - truncated?");
return 0;
}
@@ -1653,7 +1654,7 @@ const struct address_space_operations ntfs_normal_aops = {
.readpage = ntfs_readpage,
#ifdef NTFS_RW
.writepage = ntfs_writepage,
- .set_page_dirty = __set_page_dirty_buffers,
+ .dirty_folio = block_dirty_folio,
#endif /* NTFS_RW */
.bmap = ntfs_bmap,
.migratepage = buffer_migrate_page,
@@ -1668,7 +1669,7 @@ const struct address_space_operations ntfs_compressed_aops = {
.readpage = ntfs_readpage,
#ifdef NTFS_RW
.writepage = ntfs_writepage,
- .set_page_dirty = __set_page_dirty_buffers,
+ .dirty_folio = block_dirty_folio,
#endif /* NTFS_RW */
.migratepage = buffer_migrate_page,
.is_partially_uptodate = block_is_partially_uptodate,
@@ -1683,9 +1684,7 @@ const struct address_space_operations ntfs_mst_aops = {
.readpage = ntfs_readpage, /* Fill page with data. */
#ifdef NTFS_RW
.writepage = ntfs_writepage, /* Write dirty page to disk. */
- .set_page_dirty = __set_page_dirty_nobuffers, /* Set the page dirty
- without touching the buffers
- belonging to the page. */
+ .dirty_folio = filemap_dirty_folio,
#endif /* NTFS_RW */
.migratepage = buffer_migrate_page,
.is_partially_uptodate = block_is_partially_uptodate,
@@ -1747,7 +1746,7 @@ void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs) {
set_buffer_dirty(bh);
} while ((bh = bh->b_this_page) != head);
spin_unlock(&mapping->private_lock);
- __set_page_dirty_nobuffers(page);
+ block_dirty_folio(mapping, page_folio(page));
if (unlikely(buffers_to_free)) {
do {
bh = buffers_to_free->b_this_page;
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index 4474adb393ca..efe0602b4e51 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -310,7 +310,7 @@ struct inode *ntfs_alloc_big_inode(struct super_block *sb)
ntfs_inode *ni;
ntfs_debug("Entering.");
- ni = kmem_cache_alloc(ntfs_big_inode_cache, GFP_NOFS);
+ ni = alloc_inode_sb(sb, ntfs_big_inode_cache, GFP_NOFS);
if (likely(ni != NULL)) {
ni->state = 0;
return VFS_I(ni);
@@ -1881,6 +1881,10 @@ int ntfs_read_inode_mount(struct inode *vi)
}
/* Now allocate memory for the attribute list. */
ni->attr_list_size = (u32)ntfs_attr_size(a);
+ if (!ni->attr_list_size) {
+ ntfs_error(sb, "Attr_list_size is zero");
+ goto put_err_out;
+ }
ni->attr_list = ntfs_malloc_nofs(ni->attr_list_size);
if (!ni->attr_list) {
ntfs_error(sb, "Not enough memory to allocate buffer "
diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c
index a87ab3ad3cd3..9eab11e3b034 100644
--- a/fs/ntfs3/inode.c
+++ b/fs/ntfs3/inode.c
@@ -1950,7 +1950,7 @@ const struct address_space_operations ntfs_aops = {
.write_end = ntfs_write_end,
.direct_IO = ntfs_direct_IO,
.bmap = ntfs_bmap,
- .set_page_dirty = __set_page_dirty_buffers,
+ .dirty_folio = block_dirty_folio,
};
const struct address_space_operations ntfs_aops_cmpr = {
diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c
index 29813200c7af..278dcf502410 100644
--- a/fs/ntfs3/super.c
+++ b/fs/ntfs3/super.c
@@ -399,7 +399,7 @@ static struct kmem_cache *ntfs_inode_cachep;
static struct inode *ntfs_alloc_inode(struct super_block *sb)
{
- struct ntfs_inode *ni = kmem_cache_alloc(ntfs_inode_cachep, GFP_NOFS);
+ struct ntfs_inode *ni = alloc_inode_sb(sb, ntfs_inode_cachep, GFP_NOFS);
if (!ni)
return NULL;
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index bf9357123bc5..49f41074baad 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -5981,7 +5981,7 @@ bail:
return status;
}
-/* Expects you to already be holding tl_inode->i_mutex */
+/* Expects you to already be holding tl_inode->i_rwsem */
int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
{
int status;
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 498da317580a..4b9af65cb61b 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -2311,7 +2311,7 @@ static int ocfs2_dio_end_io_write(struct inode *inode,
down_write(&oi->ip_alloc_sem);
- /* Delete orphan before acquire i_mutex. */
+ /* Delete orphan before acquire i_rwsem. */
if (dwc->dw_orphaned) {
BUG_ON(dwc->dw_writer_pid != task_pid_nr(current));
@@ -2453,7 +2453,7 @@ static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
}
const struct address_space_operations ocfs2_aops = {
- .set_page_dirty = __set_page_dirty_buffers,
+ .dirty_folio = block_dirty_folio,
.readpage = ocfs2_readpage,
.readahead = ocfs2_readahead,
.writepage = ocfs2_writepage,
@@ -2461,7 +2461,7 @@ const struct address_space_operations ocfs2_aops = {
.write_end = ocfs2_write_end,
.bmap = ocfs2_bmap,
.direct_IO = ocfs2_direct_IO,
- .invalidatepage = block_invalidatepage,
+ .invalidate_folio = block_invalidate_folio,
.releasepage = ocfs2_releasepage,
.migratepage = buffer_migrate_page,
.is_partially_uptodate = block_is_partially_uptodate,
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index 625c92521416..27fee68f860a 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -689,7 +689,7 @@ static struct config_group *o2nm_cluster_group_make_group(struct config_group *g
struct o2nm_node_group *ns = NULL;
struct config_group *o2hb_group = NULL, *ret = NULL;
- /* this runs under the parent dir's i_mutex; there can be only
+ /* this runs under the parent dir's i_rwsem; there can be only
* one caller in here at a time */
if (o2nm_single_cluster)
return ERR_PTR(-ENOSPC);
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index f2cc1ff29e6d..81c3d65d68fe 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -1957,7 +1957,7 @@ bail_nolock:
}
/*
- * NOTE: this should always be called with parent dir i_mutex taken.
+ * NOTE: this should always be called with parent dir i_rwsem taken.
*/
int ocfs2_find_files_on_disk(const char *name,
int namelen,
@@ -2003,7 +2003,7 @@ int ocfs2_lookup_ino_from_name(struct inode *dir, const char *name,
* Return 0 if the name does not exist
* Return -EEXIST if the directory contains the name
*
- * Callers should have i_mutex + a cluster lock on dir
+ * Callers should have i_rwsem + a cluster lock on dir
*/
int ocfs2_check_dir_for_entry(struct inode *dir,
const char *name,
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index fa0a14f199eb..e360543ad7e7 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -280,7 +280,7 @@ static struct inode *dlmfs_alloc_inode(struct super_block *sb)
{
struct dlmfs_inode_private *ip;
- ip = kmem_cache_alloc(dlmfs_inode_cache, GFP_NOFS);
+ ip = alloc_inode_sb(sb, dlmfs_inode_cache, GFP_NOFS);
if (!ip)
return NULL;
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index fc5f780fa235..01b7407a8893 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -270,7 +270,7 @@ int ocfs2_update_inode_atime(struct inode *inode,
/*
* Don't use ocfs2_mark_inode_dirty() here as we don't always
- * have i_mutex to guard against concurrent changes to other
+ * have i_rwsem to guard against concurrent changes to other
* inode fields.
*/
inode->i_atime = current_time(inode);
@@ -540,15 +540,12 @@ int ocfs2_add_inode_data(struct ocfs2_super *osb,
struct ocfs2_alloc_context *meta_ac,
enum ocfs2_alloc_restarted *reason_ret)
{
- int ret;
struct ocfs2_extent_tree et;
ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), fe_bh);
- ret = ocfs2_add_clusters_in_btree(handle, &et, logical_offset,
- clusters_to_add, mark_unwritten,
- data_ac, meta_ac, reason_ret);
-
- return ret;
+ return ocfs2_add_clusters_in_btree(handle, &et, logical_offset,
+ clusters_to_add, mark_unwritten,
+ data_ac, meta_ac, reason_ret);
}
static int ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
@@ -1068,7 +1065,7 @@ static int ocfs2_extend_file(struct inode *inode,
/*
* The alloc sem blocks people in read/write from reading our
* allocation until we're done changing it. We depend on
- * i_mutex to block other extend/truncate calls while we're
+ * i_rwsem to block other extend/truncate calls while we're
* here. We even have to hold it for sparse files because there
* might be some tail zeroing.
*/
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 6c2411c2afcf..5739dc301569 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -713,7 +713,7 @@ bail:
/*
* Serialize with orphan dir recovery. If the process doing
* recovery on this orphan dir does an iget() with the dir
- * i_mutex held, we'll deadlock here. Instead we detect this
+ * i_rwsem held, we'll deadlock here. Instead we detect this
* and exit early - recovery will wipe this inode for us.
*/
static int ocfs2_check_orphan_recovery_state(struct ocfs2_super *osb,
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 5f6bacbeef6b..c4426d12a2ad 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -606,7 +606,7 @@ out:
/*
* make sure we've got at least bits_wanted contiguous bits in the
- * local alloc. You lose them when you drop i_mutex.
+ * local alloc. You lose them when you drop i_rwsem.
*
* We will add ourselves to the transaction passed in, but may start
* our own in order to shift windows.
@@ -636,7 +636,7 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
/*
* We must double check state and allocator bits because
- * another process may have changed them while holding i_mutex.
+ * another process may have changed them while holding i_rwsem.
*/
spin_lock(&osb->osb_lock);
if (!ocfs2_la_state_enabled(osb) ||
@@ -1029,7 +1029,7 @@ enum ocfs2_la_event {
/*
* Given an event, calculate the size of our next local alloc window.
*
- * This should always be called under i_mutex of the local alloc inode
+ * This should always be called under i_rwsem of the local alloc inode
* so that local alloc disabling doesn't race with processes trying to
* use the allocator.
*
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 2c46ff6ba4ea..c75fd54b9185 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -476,7 +476,7 @@ leave:
ocfs2_free_alloc_context(meta_ac);
/*
- * We should call iput after the i_mutex of the bitmap been
+ * We should call iput after the i_rwsem of the bitmap been
* unlocked in ocfs2_free_alloc_context, or the
* ocfs2_delete_inode will mutex_lock again.
*/
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index bb62cc2e0211..337527571461 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -355,7 +355,7 @@ struct ocfs2_super
struct delayed_work la_enable_wq;
/*
- * Must hold local alloc i_mutex and osb->osb_lock to change
+ * Must hold local alloc i_rwsem and osb->osb_lock to change
* local_alloc_bits. Reads can be done under either lock.
*/
unsigned int local_alloc_bits;
@@ -430,7 +430,7 @@ struct ocfs2_super
atomic_t osb_tl_disable;
/*
* How many clusters in our truncate log.
- * It must be protected by osb_tl_inode->i_mutex.
+ * It must be protected by osb_tl_inode->i_rwsem.
*/
unsigned int truncated_clusters;
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index f033de733adb..273f65e0aaba 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -36,7 +36,7 @@
* should be obeyed by all the functions:
* - any write of quota structure (either to local or global file) is protected
* by dqio_sem or dquot->dq_lock.
- * - any modification of global quota file holds inode cluster lock, i_mutex,
+ * - any modification of global quota file holds inode cluster lock, i_rwsem,
* and ip_alloc_sem of the global quota file (achieved by
* ocfs2_lock_global_qf). It also has to hold qinfo_lock.
* - an allocation of new blocks for local quota file is protected by
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index 85a47621e0c0..a75e2b7d67f5 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -683,28 +683,22 @@ static int user_dlm_lock(struct ocfs2_cluster_connection *conn,
void *name,
unsigned int namelen)
{
- int ret;
-
if (!lksb->lksb_fsdlm.sb_lvbptr)
lksb->lksb_fsdlm.sb_lvbptr = (char *)lksb +
sizeof(struct dlm_lksb);
- ret = dlm_lock(conn->cc_lockspace, mode, &lksb->lksb_fsdlm,
- flags|DLM_LKF_NODLCKWT, name, namelen, 0,
- fsdlm_lock_ast_wrapper, lksb,
- fsdlm_blocking_ast_wrapper);
- return ret;
+ return dlm_lock(conn->cc_lockspace, mode, &lksb->lksb_fsdlm,
+ flags|DLM_LKF_NODLCKWT, name, namelen, 0,
+ fsdlm_lock_ast_wrapper, lksb,
+ fsdlm_blocking_ast_wrapper);
}
static int user_dlm_unlock(struct ocfs2_cluster_connection *conn,
struct ocfs2_dlm_lksb *lksb,
u32 flags)
{
- int ret;
-
- ret = dlm_unlock(conn->cc_lockspace, lksb->lksb_fsdlm.sb_lkid,
- flags, &lksb->lksb_fsdlm, lksb);
- return ret;
+ return dlm_unlock(conn->cc_lockspace, lksb->lksb_fsdlm.sb_lkid,
+ flags, &lksb->lksb_fsdlm, lksb);
}
static int user_dlm_lock_status(struct ocfs2_dlm_lksb *lksb)
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 2772dec9dcea..477cdf94122e 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -548,7 +548,7 @@ static struct inode *ocfs2_alloc_inode(struct super_block *sb)
{
struct ocfs2_inode_info *oi;
- oi = kmem_cache_alloc(ocfs2_inode_cachep, GFP_NOFS);
+ oi = alloc_inode_sb(sb, ocfs2_inode_cachep, GFP_NOFS);
if (!oi)
return NULL;
@@ -1105,17 +1105,6 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
goto read_super_error;
}
- root = d_make_root(inode);
- if (!root) {
- status = -ENOMEM;
- mlog_errno(status);
- goto read_super_error;
- }
-
- sb->s_root = root;
-
- ocfs2_complete_mount_recovery(osb);
-
osb->osb_dev_kset = kset_create_and_add(sb->s_id, NULL,
&ocfs2_kset->kobj);
if (!osb->osb_dev_kset) {
@@ -1133,6 +1122,17 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
goto read_super_error;
}
+ root = d_make_root(inode);
+ if (!root) {
+ status = -ENOMEM;
+ mlog_errno(status);
+ goto read_super_error;
+ }
+
+ sb->s_root = root;
+
+ ocfs2_complete_mount_recovery(osb);
+
if (ocfs2_mount_local(osb))
snprintf(nodestr, sizeof(nodestr), "local");
else
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index dd784eb0cd7c..95d0611c5fc7 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -7205,7 +7205,7 @@ out:
* Used for reflink a non-preserve-security file.
*
* It uses common api like ocfs2_xattr_set, so the caller
- * must not hold any lock expect i_mutex.
+ * must not hold any lock expect i_rwsem.
*/
int ocfs2_init_security_and_acl(struct inode *dir,
struct inode *inode,
diff --git a/fs/omfs/file.c b/fs/omfs/file.c
index 89725b15a64b..3f297b541713 100644
--- a/fs/omfs/file.c
+++ b/fs/omfs/file.c
@@ -372,7 +372,8 @@ const struct inode_operations omfs_file_inops = {
};
const struct address_space_operations omfs_aops = {
- .set_page_dirty = __set_page_dirty_buffers,
+ .dirty_folio = block_dirty_folio,
+ .invalidate_folio = block_invalidate_folio,
.readpage = omfs_readpage,
.readahead = omfs_readahead,
.writepage = omfs_writepage,
diff --git a/fs/open.c b/fs/open.c
index 9ff2f621b760..1315253e0247 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -835,7 +835,6 @@ static int do_dentry_open(struct file *f,
likely(f->f_op->write || f->f_op->write_iter))
f->f_mode |= FMODE_CAN_WRITE;
- f->f_write_hint = WRITE_LIFE_NOT_SET;
f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping);
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index f825176ff4ed..f0b7f4d51a17 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -335,7 +335,7 @@ static struct inode *openprom_alloc_inode(struct super_block *sb)
{
struct op_inode_info *oi;
- oi = kmem_cache_alloc(op_inode_cachep, GFP_KERNEL);
+ oi = alloc_inode_sb(sb, op_inode_cachep, GFP_KERNEL);
if (!oi)
return NULL;
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index e5e3e500ed46..79c1025d18ea 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -46,7 +46,7 @@ static int orangefs_writepage_locked(struct page *page,
else
wlen = PAGE_SIZE;
}
- /* Should've been handled in orangefs_invalidatepage. */
+ /* Should've been handled in orangefs_invalidate_folio. */
WARN_ON(off == len || off + wlen > len);
bv.bv_page = page;
@@ -243,7 +243,7 @@ static int orangefs_writepages(struct address_space *mapping,
return ret;
}
-static int orangefs_launder_page(struct page *);
+static int orangefs_launder_folio(struct folio *);
static void orangefs_readahead(struct readahead_control *rac)
{
@@ -290,14 +290,15 @@ static void orangefs_readahead(struct readahead_control *rac)
static int orangefs_readpage(struct file *file, struct page *page)
{
+ struct folio *folio = page_folio(page);
struct inode *inode = page->mapping->host;
struct iov_iter iter;
struct bio_vec bv;
ssize_t ret;
loff_t off; /* offset into this page */
- if (PageDirty(page))
- orangefs_launder_page(page);
+ if (folio_test_dirty(folio))
+ orangefs_launder_folio(folio);
off = page_offset(page);
bv.bv_page = page;
@@ -330,6 +331,7 @@ static int orangefs_write_begin(struct file *file,
void **fsdata)
{
struct orangefs_write_range *wr;
+ struct folio *folio;
struct page *page;
pgoff_t index;
int ret;
@@ -341,27 +343,28 @@ static int orangefs_write_begin(struct file *file,
return -ENOMEM;
*pagep = page;
+ folio = page_folio(page);
- if (PageDirty(page) && !PagePrivate(page)) {
+ if (folio_test_dirty(folio) && !folio_test_private(folio)) {
/*
* Should be impossible. If it happens, launder the page
* since we don't know what's dirty. This will WARN in
* orangefs_writepage_locked.
*/
- ret = orangefs_launder_page(page);
+ ret = orangefs_launder_folio(folio);
if (ret)
return ret;
}
- if (PagePrivate(page)) {
+ if (folio_test_private(folio)) {
struct orangefs_write_range *wr;
- wr = (struct orangefs_write_range *)page_private(page);
+ wr = folio_get_private(folio);
if (wr->pos + wr->len == pos &&
uid_eq(wr->uid, current_fsuid()) &&
gid_eq(wr->gid, current_fsgid())) {
wr->len += len;
goto okay;
} else {
- ret = orangefs_launder_page(page);
+ ret = orangefs_launder_folio(folio);
if (ret)
return ret;
}
@@ -375,7 +378,7 @@ static int orangefs_write_begin(struct file *file,
wr->len = len;
wr->uid = current_fsuid();
wr->gid = current_fsgid();
- attach_page_private(page, wr);
+ folio_attach_private(folio, wr);
okay:
return 0;
}
@@ -415,47 +418,45 @@ static int orangefs_write_end(struct file *file, struct address_space *mapping,
return copied;
}
-static void orangefs_invalidatepage(struct page *page,
- unsigned int offset,
- unsigned int length)
+static void orangefs_invalidate_folio(struct folio *folio,
+ size_t offset, size_t length)
{
- struct orangefs_write_range *wr;
- wr = (struct orangefs_write_range *)page_private(page);
+ struct orangefs_write_range *wr = folio_get_private(folio);
if (offset == 0 && length == PAGE_SIZE) {
- kfree(detach_page_private(page));
+ kfree(folio_detach_private(folio));
return;
/* write range entirely within invalidate range (or equal) */
- } else if (page_offset(page) + offset <= wr->pos &&
- wr->pos + wr->len <= page_offset(page) + offset + length) {
- kfree(detach_page_private(page));
+ } else if (folio_pos(folio) + offset <= wr->pos &&
+ wr->pos + wr->len <= folio_pos(folio) + offset + length) {
+ kfree(folio_detach_private(folio));
/* XXX is this right? only caller in fs */
- cancel_dirty_page(page);
+ folio_cancel_dirty(folio);
return;
/* invalidate range chops off end of write range */
- } else if (wr->pos < page_offset(page) + offset &&
- wr->pos + wr->len <= page_offset(page) + offset + length &&
- page_offset(page) + offset < wr->pos + wr->len) {
+ } else if (wr->pos < folio_pos(folio) + offset &&
+ wr->pos + wr->len <= folio_pos(folio) + offset + length &&
+ folio_pos(folio) + offset < wr->pos + wr->len) {
size_t x;
- x = wr->pos + wr->len - (page_offset(page) + offset);
+ x = wr->pos + wr->len - (folio_pos(folio) + offset);
WARN_ON(x > wr->len);
wr->len -= x;
wr->uid = current_fsuid();
wr->gid = current_fsgid();
/* invalidate range chops off beginning of write range */
- } else if (page_offset(page) + offset <= wr->pos &&
- page_offset(page) + offset + length < wr->pos + wr->len &&
- wr->pos < page_offset(page) + offset + length) {
+ } else if (folio_pos(folio) + offset <= wr->pos &&
+ folio_pos(folio) + offset + length < wr->pos + wr->len &&
+ wr->pos < folio_pos(folio) + offset + length) {
size_t x;
- x = page_offset(page) + offset + length - wr->pos;
+ x = folio_pos(folio) + offset + length - wr->pos;
WARN_ON(x > wr->len);
wr->pos += x;
wr->len -= x;
wr->uid = current_fsuid();
wr->gid = current_fsgid();
/* invalidate range entirely within write range (punch hole) */
- } else if (wr->pos < page_offset(page) + offset &&
- page_offset(page) + offset + length < wr->pos + wr->len) {
+ } else if (wr->pos < folio_pos(folio) + offset &&
+ folio_pos(folio) + offset + length < wr->pos + wr->len) {
/* XXX what do we do here... should not WARN_ON */
WARN_ON(1);
/* punch hole */
@@ -467,11 +468,11 @@ static void orangefs_invalidatepage(struct page *page,
/* non-overlapping ranges */
} else {
/* WARN if they do overlap */
- if (!((page_offset(page) + offset + length <= wr->pos) ^
- (wr->pos + wr->len <= page_offset(page) + offset))) {
+ if (!((folio_pos(folio) + offset + length <= wr->pos) ^
+ (wr->pos + wr->len <= folio_pos(folio) + offset))) {
WARN_ON(1);
- printk("invalidate range offset %llu length %u\n",
- page_offset(page) + offset, length);
+ printk("invalidate range offset %llu length %zu\n",
+ folio_pos(folio) + offset, length);
printk("write range offset %llu length %zu\n",
wr->pos, wr->len);
}
@@ -483,7 +484,7 @@ static void orangefs_invalidatepage(struct page *page,
* Thus the following runs if wr was modified above.
*/
- orangefs_launder_page(page);
+ orangefs_launder_folio(folio);
}
static int orangefs_releasepage(struct page *page, gfp_t foo)
@@ -496,17 +497,17 @@ static void orangefs_freepage(struct page *page)
kfree(detach_page_private(page));
}
-static int orangefs_launder_page(struct page *page)
+static int orangefs_launder_folio(struct folio *folio)
{
int r = 0;
struct writeback_control wbc = {
.sync_mode = WB_SYNC_ALL,
.nr_to_write = 0,
};
- wait_on_page_writeback(page);
- if (clear_page_dirty_for_io(page)) {
- r = orangefs_writepage_locked(page, &wbc);
- end_page_writeback(page);
+ folio_wait_writeback(folio);
+ if (folio_clear_dirty_for_io(folio)) {
+ r = orangefs_writepage_locked(&folio->page, &wbc);
+ folio_end_writeback(folio);
}
return r;
}
@@ -633,19 +634,19 @@ static const struct address_space_operations orangefs_address_operations = {
.readahead = orangefs_readahead,
.readpage = orangefs_readpage,
.writepages = orangefs_writepages,
- .set_page_dirty = __set_page_dirty_nobuffers,
+ .dirty_folio = filemap_dirty_folio,
.write_begin = orangefs_write_begin,
.write_end = orangefs_write_end,
- .invalidatepage = orangefs_invalidatepage,
+ .invalidate_folio = orangefs_invalidate_folio,
.releasepage = orangefs_releasepage,
.freepage = orangefs_freepage,
- .launder_page = orangefs_launder_page,
+ .launder_folio = orangefs_launder_folio,
.direct_IO = orangefs_direct_IO,
};
vm_fault_t orangefs_page_mkwrite(struct vm_fault *vmf)
{
- struct page *page = vmf->page;
+ struct folio *folio = page_folio(vmf->page);
struct inode *inode = file_inode(vmf->vma->vm_file);
struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
unsigned long *bitlock = &orangefs_inode->bitlock;
@@ -659,27 +660,27 @@ vm_fault_t orangefs_page_mkwrite(struct vm_fault *vmf)
goto out;
}
- lock_page(page);
- if (PageDirty(page) && !PagePrivate(page)) {
+ folio_lock(folio);
+ if (folio_test_dirty(folio) && !folio_test_private(folio)) {
/*
- * Should be impossible. If it happens, launder the page
+ * Should be impossible. If it happens, launder the folio
* since we don't know what's dirty. This will WARN in
* orangefs_writepage_locked.
*/
- if (orangefs_launder_page(page)) {
+ if (orangefs_launder_folio(folio)) {
ret = VM_FAULT_LOCKED|VM_FAULT_RETRY;
goto out;
}
}
- if (PagePrivate(page)) {
- wr = (struct orangefs_write_range *)page_private(page);
+ if (folio_test_private(folio)) {
+ wr = folio_get_private(folio);
if (uid_eq(wr->uid, current_fsuid()) &&
gid_eq(wr->gid, current_fsgid())) {
- wr->pos = page_offset(page);
+ wr->pos = page_offset(vmf->page);
wr->len = PAGE_SIZE;
goto okay;
} else {
- if (orangefs_launder_page(page)) {
+ if (orangefs_launder_folio(folio)) {
ret = VM_FAULT_LOCKED|VM_FAULT_RETRY;
goto out;
}
@@ -690,27 +691,27 @@ vm_fault_t orangefs_page_mkwrite(struct vm_fault *vmf)
ret = VM_FAULT_LOCKED|VM_FAULT_RETRY;
goto out;
}
- wr->pos = page_offset(page);
+ wr->pos = page_offset(vmf->page);
wr->len = PAGE_SIZE;
wr->uid = current_fsuid();
wr->gid = current_fsgid();
- attach_page_private(page, wr);
+ folio_attach_private(folio, wr);
okay:
file_update_time(vmf->vma->vm_file);
- if (page->mapping != inode->i_mapping) {
- unlock_page(page);
+ if (folio->mapping != inode->i_mapping) {
+ folio_unlock(folio);
ret = VM_FAULT_LOCKED|VM_FAULT_NOPAGE;
goto out;
}
/*
- * We mark the page dirty already here so that when freeze is in
+ * We mark the folio dirty already here so that when freeze is in
* progress, we are guaranteed that writeback during freezing will
- * see the dirty page and writeprotect it again.
+ * see the dirty folio and writeprotect it again.
*/
- set_page_dirty(page);
- wait_for_stable_page(page);
+ folio_mark_dirty(folio);
+ folio_wait_stable(folio);
ret = VM_FAULT_LOCKED;
out:
sb_end_pagefault(inode->i_sb);
diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c
index d90d8addbfc2..5254256a224d 100644
--- a/fs/orangefs/super.c
+++ b/fs/orangefs/super.c
@@ -107,7 +107,7 @@ static struct inode *orangefs_alloc_inode(struct super_block *sb)
{
struct orangefs_inode_s *orangefs_inode;
- orangefs_inode = kmem_cache_alloc(orangefs_inode_cache, GFP_KERNEL);
+ orangefs_inode = alloc_inode_sb(sb, orangefs_inode_cache, GFP_KERNEL);
if (!orangefs_inode)
return NULL;
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index b193d08a3dc3..e040970408d4 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -145,7 +145,7 @@ static int ovl_copy_fileattr(struct inode *inode, struct path *old,
if (err == -ENOTTY || err == -EINVAL)
return 0;
pr_warn("failed to retrieve lower fileattr (%pd2, err=%i)\n",
- old, err);
+ old->dentry, err);
return err;
}
@@ -157,7 +157,9 @@ static int ovl_copy_fileattr(struct inode *inode, struct path *old,
*/
if (oldfa.flags & OVL_PROT_FS_FLAGS_MASK) {
err = ovl_set_protattr(inode, new->dentry, &oldfa);
- if (err)
+ if (err == -EPERM)
+ pr_warn_once("copying fileattr: no xattr on upper\n");
+ else if (err)
return err;
}
@@ -167,8 +169,16 @@ static int ovl_copy_fileattr(struct inode *inode, struct path *old,
err = ovl_real_fileattr_get(new, &newfa);
if (err) {
+ /*
+ * Returning an error if upper doesn't support fileattr will
+ * result in a regression, so revert to the old behavior.
+ */
+ if (err == -ENOTTY || err == -EINVAL) {
+ pr_warn_once("copying fileattr: no support on upper\n");
+ return 0;
+ }
pr_warn("failed to retrieve upper fileattr (%pd2, err=%i)\n",
- new, err);
+ new->dentry, err);
return err;
}
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index 7bb0a47cb615..001cdbb8f015 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -174,7 +174,7 @@ static struct kmem_cache *ovl_inode_cachep;
static struct inode *ovl_alloc_inode(struct super_block *sb)
{
- struct ovl_inode *oi = kmem_cache_alloc(ovl_inode_cachep, GFP_KERNEL);
+ struct ovl_inode *oi = alloc_inode_sb(sb, ovl_inode_cachep, GFP_KERNEL);
if (!oi)
return NULL;
diff --git a/fs/pipe.c b/fs/pipe.c
index cc28623a67b6..9648ac15164a 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -253,7 +253,8 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
*/
was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage);
for (;;) {
- unsigned int head = pipe->head;
+ /* Read ->head with a barrier vs post_one_notification() */
+ unsigned int head = smp_load_acquire(&pipe->head);
unsigned int tail = pipe->tail;
unsigned int mask = pipe->ring_size - 1;
@@ -606,7 +607,7 @@ out:
static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
struct pipe_inode_info *pipe = filp->private_data;
- int count, head, tail, mask;
+ unsigned int count, head, tail, mask;
switch (cmd) {
case FIONREAD:
@@ -803,7 +804,7 @@ struct pipe_inode_info *alloc_pipe_info(void)
if (too_many_pipe_buffers_hard(user_bufs) && pipe_is_unprivileged_user())
goto out_revert_acct;
- pipe->bufs = kcalloc(pipe_bufs, sizeof(struct pipe_buffer),
+ pipe->bufs = kvcalloc(pipe_bufs, sizeof(struct pipe_buffer),
GFP_KERNEL_ACCOUNT);
if (pipe->bufs) {
@@ -828,13 +829,11 @@ out_free_uid:
void free_pipe_info(struct pipe_inode_info *pipe)
{
- int i;
+ unsigned int i;
#ifdef CONFIG_WATCH_QUEUE
- if (pipe->watch_queue) {
+ if (pipe->watch_queue)
watch_queue_clear(pipe->watch_queue);
- put_watch_queue(pipe->watch_queue);
- }
#endif
(void) account_pipe_buffers(pipe->user, pipe->nr_accounted, 0);
@@ -844,9 +843,13 @@ void free_pipe_info(struct pipe_inode_info *pipe)
if (buf->ops)
pipe_buf_release(pipe, buf);
}
+#ifdef CONFIG_WATCH_QUEUE
+ if (pipe->watch_queue)
+ put_watch_queue(pipe->watch_queue);
+#endif
if (pipe->tmp_page)
__free_page(pipe->tmp_page);
- kfree(pipe->bufs);
+ kvfree(pipe->bufs);
kfree(pipe);
}
@@ -1261,8 +1264,7 @@ int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots)
if (nr_slots < n)
return -EBUSY;
- bufs = kcalloc(nr_slots, sizeof(*bufs),
- GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
+ bufs = kvcalloc(nr_slots, sizeof(*bufs), GFP_KERNEL_ACCOUNT);
if (unlikely(!bufs))
return -ENOMEM;
@@ -1289,7 +1291,7 @@ int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots)
head = n;
tail = 0;
- kfree(pipe->bufs);
+ kvfree(pipe->bufs);
pipe->bufs = bufs;
pipe->ring_size = nr_slots;
if (pipe->max_usage > nr_slots)
diff --git a/fs/proc/base.c b/fs/proc/base.c
index d654ce7150fd..76bf1aa3cfe8 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1764,25 +1764,25 @@ out:
static int do_proc_readlink(struct path *path, char __user *buffer, int buflen)
{
- char *tmp = (char *)__get_free_page(GFP_KERNEL);
+ char *tmp = kmalloc(PATH_MAX, GFP_KERNEL);
char *pathname;
int len;
if (!tmp)
return -ENOMEM;
- pathname = d_path(path, tmp, PAGE_SIZE);
+ pathname = d_path(path, tmp, PATH_MAX);
len = PTR_ERR(pathname);
if (IS_ERR(pathname))
goto out;
- len = tmp + PAGE_SIZE - 1 - pathname;
+ len = tmp + PATH_MAX - 1 - pathname;
if (len > buflen)
len = buflen;
if (copy_to_user(buffer, pathname, len))
len = -EFAULT;
out:
- free_page((unsigned long)tmp);
+ kfree(tmp);
return len;
}
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index f84355c5a36d..73aeb4e6d32e 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -66,7 +66,7 @@ static struct inode *proc_alloc_inode(struct super_block *sb)
{
struct proc_inode *ei;
- ei = kmem_cache_alloc(proc_inode_cachep, GFP_KERNEL);
+ ei = alloc_inode_sb(sb, proc_inode_cachep, GFP_KERNEL);
if (!ei)
return NULL;
ei->pid = NULL;
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 9f1077d94cde..a2873a617ae8 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -10,6 +10,7 @@
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/hugetlb.h>
+#include <linux/memremap.h>
#include <linux/memcontrol.h>
#include <linux/mmu_notifier.h>
#include <linux/page_idle.h>
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 18f8c3acbb85..f46060eb91b5 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -309,7 +309,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
name = arch_vma_name(vma);
if (!name) {
- const char *anon_name;
+ struct anon_vma_name *anon_name;
if (!mm) {
name = "[vdso]";
@@ -327,10 +327,10 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
goto done;
}
- anon_name = vma_anon_name(vma);
+ anon_name = anon_vma_name(vma);
if (anon_name) {
seq_pad(m, ' ');
- seq_printf(m, "[anon:%s]", anon_name);
+ seq_printf(m, "[anon:%s]", anon_name->name);
}
}
@@ -440,7 +440,8 @@ static void smaps_page_accumulate(struct mem_size_stats *mss,
}
static void smaps_account(struct mem_size_stats *mss, struct page *page,
- bool compound, bool young, bool dirty, bool locked)
+ bool compound, bool young, bool dirty, bool locked,
+ bool migration)
{
int i, nr = compound ? compound_nr(page) : 1;
unsigned long size = nr * PAGE_SIZE;
@@ -467,8 +468,15 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page,
* page_count(page) == 1 guarantees the page is mapped exactly once.
* If any subpage of the compound page mapped with PTE it would elevate
* page_count().
+ *
+ * The page_mapcount() is called to get a snapshot of the mapcount.
+ * Without holding the page lock this snapshot can be slightly wrong as
+ * we cannot always read the mapcount atomically. It is not safe to
+ * call page_mapcount() even with PTL held if the page is not mapped,
+ * especially for migration entries. Treat regular migration entries
+ * as mapcount == 1.
*/
- if (page_count(page) == 1) {
+ if ((page_count(page) == 1) || migration) {
smaps_page_accumulate(mss, page, size, size << PSS_SHIFT, dirty,
locked, true);
return;
@@ -517,6 +525,7 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
struct vm_area_struct *vma = walk->vma;
bool locked = !!(vma->vm_flags & VM_LOCKED);
struct page *page = NULL;
+ bool migration = false;
if (pte_present(*pte)) {
page = vm_normal_page(vma, addr, *pte);
@@ -536,8 +545,11 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
} else {
mss->swap_pss += (u64)PAGE_SIZE << PSS_SHIFT;
}
- } else if (is_pfn_swap_entry(swpent))
+ } else if (is_pfn_swap_entry(swpent)) {
+ if (is_migration_entry(swpent))
+ migration = true;
page = pfn_swap_entry_to_page(swpent);
+ }
} else {
smaps_pte_hole_lookup(addr, walk);
return;
@@ -546,7 +558,8 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
if (!page)
return;
- smaps_account(mss, page, false, pte_young(*pte), pte_dirty(*pte), locked);
+ smaps_account(mss, page, false, pte_young(*pte), pte_dirty(*pte),
+ locked, migration);
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -557,6 +570,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
struct vm_area_struct *vma = walk->vma;
bool locked = !!(vma->vm_flags & VM_LOCKED);
struct page *page = NULL;
+ bool migration = false;
if (pmd_present(*pmd)) {
/* FOLL_DUMP will return -EFAULT on huge zero page */
@@ -564,8 +578,10 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
} else if (unlikely(thp_migration_supported() && is_swap_pmd(*pmd))) {
swp_entry_t entry = pmd_to_swp_entry(*pmd);
- if (is_migration_entry(entry))
+ if (is_migration_entry(entry)) {
+ migration = true;
page = pfn_swap_entry_to_page(entry);
+ }
}
if (IS_ERR_OR_NULL(page))
return;
@@ -577,7 +593,9 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
/* pass */;
else
mss->file_thp += HPAGE_PMD_SIZE;
- smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd), locked);
+
+ smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd),
+ locked, migration);
}
#else
static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
@@ -1378,6 +1396,7 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
{
u64 frame = 0, flags = 0;
struct page *page = NULL;
+ bool migration = false;
if (pte_present(pte)) {
if (pm->show_pfn)
@@ -1399,13 +1418,14 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
frame = swp_type(entry) |
(swp_offset(entry) << MAX_SWAPFILES_SHIFT);
flags |= PM_SWAP;
+ migration = is_migration_entry(entry);
if (is_pfn_swap_entry(entry))
page = pfn_swap_entry_to_page(entry);
}
if (page && !PageAnon(page))
flags |= PM_FILE;
- if (page && page_mapcount(page) == 1)
+ if (page && !migration && page_mapcount(page) == 1)
flags |= PM_MMAP_EXCLUSIVE;
if (vma->vm_flags & VM_SOFTDIRTY)
flags |= PM_SOFT_DIRTY;
@@ -1421,8 +1441,9 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
spinlock_t *ptl;
pte_t *pte, *orig_pte;
int err = 0;
-
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ bool migration = false;
+
ptl = pmd_trans_huge_lock(pmdp, vma);
if (ptl) {
u64 flags = 0, frame = 0;
@@ -1461,11 +1482,12 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
if (pmd_swp_uffd_wp(pmd))
flags |= PM_UFFD_WP;
VM_BUG_ON(!is_pmd_migration_entry(pmd));
+ migration = is_migration_entry(entry);
page = pfn_swap_entry_to_page(entry);
}
#endif
- if (page && page_mapcount(page) == 1)
+ if (page && !migration && page_mapcount(page) == 1)
flags |= PM_MMAP_EXCLUSIVE;
for (; addr != end; addr += PAGE_SIZE) {
@@ -1575,7 +1597,8 @@ static const struct mm_walk_ops pagemap_ops = {
* Bits 5-54 swap offset if swapped
* Bit 55 pte is soft-dirty (see Documentation/admin-guide/mm/soft-dirty.rst)
* Bit 56 page exclusively mapped
- * Bits 57-60 zero
+ * Bit 57 pte is uffd-wp write-protected
+ * Bits 58-60 zero
* Bit 61 page is file-page or shared-anon
* Bit 62 page swapped
* Bit 63 page present
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 702754dd1daf..6f1b8ddc6f7a 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -62,7 +62,8 @@ core_param(novmcoredd, vmcoredd_disabled, bool, 0);
/* Device Dump Size */
static size_t vmcoredd_orig_sz;
-static DECLARE_RWSEM(vmcore_cb_rwsem);
+static DEFINE_SPINLOCK(vmcore_cb_lock);
+DEFINE_STATIC_SRCU(vmcore_cb_srcu);
/* List of registered vmcore callbacks. */
static LIST_HEAD(vmcore_cb_list);
/* Whether the vmcore has been opened once. */
@@ -70,8 +71,8 @@ static bool vmcore_opened;
void register_vmcore_cb(struct vmcore_cb *cb)
{
- down_write(&vmcore_cb_rwsem);
INIT_LIST_HEAD(&cb->next);
+ spin_lock(&vmcore_cb_lock);
list_add_tail(&cb->next, &vmcore_cb_list);
/*
* Registering a vmcore callback after the vmcore was opened is
@@ -79,14 +80,14 @@ void register_vmcore_cb(struct vmcore_cb *cb)
*/
if (vmcore_opened)
pr_warn_once("Unexpected vmcore callback registration\n");
- up_write(&vmcore_cb_rwsem);
+ spin_unlock(&vmcore_cb_lock);
}
EXPORT_SYMBOL_GPL(register_vmcore_cb);
void unregister_vmcore_cb(struct vmcore_cb *cb)
{
- down_write(&vmcore_cb_rwsem);
- list_del(&cb->next);
+ spin_lock(&vmcore_cb_lock);
+ list_del_rcu(&cb->next);
/*
* Unregistering a vmcore callback after the vmcore was opened is
* very unusual (e.g., forced driver removal), but we cannot stop
@@ -94,7 +95,9 @@ void unregister_vmcore_cb(struct vmcore_cb *cb)
*/
if (vmcore_opened)
pr_warn_once("Unexpected vmcore callback unregistration\n");
- up_write(&vmcore_cb_rwsem);
+ spin_unlock(&vmcore_cb_lock);
+
+ synchronize_srcu(&vmcore_cb_srcu);
}
EXPORT_SYMBOL_GPL(unregister_vmcore_cb);
@@ -103,9 +106,8 @@ static bool pfn_is_ram(unsigned long pfn)
struct vmcore_cb *cb;
bool ret = true;
- lockdep_assert_held_read(&vmcore_cb_rwsem);
-
- list_for_each_entry(cb, &vmcore_cb_list, next) {
+ list_for_each_entry_srcu(cb, &vmcore_cb_list, next,
+ srcu_read_lock_held(&vmcore_cb_srcu)) {
if (unlikely(!cb->pfn_is_ram))
continue;
ret = cb->pfn_is_ram(cb, pfn);
@@ -118,9 +120,9 @@ static bool pfn_is_ram(unsigned long pfn)
static int open_vmcore(struct inode *inode, struct file *file)
{
- down_read(&vmcore_cb_rwsem);
+ spin_lock(&vmcore_cb_lock);
vmcore_opened = true;
- up_read(&vmcore_cb_rwsem);
+ spin_unlock(&vmcore_cb_lock);
return 0;
}
@@ -133,6 +135,7 @@ ssize_t read_from_oldmem(char *buf, size_t count,
unsigned long pfn, offset;
size_t nr_bytes;
ssize_t read = 0, tmp;
+ int idx;
if (!count)
return 0;
@@ -140,7 +143,7 @@ ssize_t read_from_oldmem(char *buf, size_t count,
offset = (unsigned long)(*ppos % PAGE_SIZE);
pfn = (unsigned long)(*ppos / PAGE_SIZE);
- down_read(&vmcore_cb_rwsem);
+ idx = srcu_read_lock(&vmcore_cb_srcu);
do {
if (count > (PAGE_SIZE - offset))
nr_bytes = PAGE_SIZE - offset;
@@ -165,7 +168,7 @@ ssize_t read_from_oldmem(char *buf, size_t count,
offset, userbuf);
}
if (tmp < 0) {
- up_read(&vmcore_cb_rwsem);
+ srcu_read_unlock(&vmcore_cb_srcu, idx);
return tmp;
}
@@ -176,8 +179,8 @@ ssize_t read_from_oldmem(char *buf, size_t count,
++pfn;
offset = 0;
} while (count);
+ srcu_read_unlock(&vmcore_cb_srcu, idx);
- up_read(&vmcore_cb_rwsem);
return read;
}
@@ -477,7 +480,7 @@ static const struct vm_operations_struct vmcore_mmap_ops = {
/**
* vmcore_alloc_buf - allocate buffer in vmalloc memory
- * @sizez: size of buffer
+ * @size: size of buffer
*
* If CONFIG_MMU is defined, use vmalloc_user() to allow users to mmap
* the buffer to user-space by means of remap_vmalloc_range().
@@ -568,18 +571,18 @@ static int vmcore_remap_oldmem_pfn(struct vm_area_struct *vma,
unsigned long from, unsigned long pfn,
unsigned long size, pgprot_t prot)
{
- int ret;
+ int ret, idx;
/*
- * Check if oldmem_pfn_is_ram was registered to avoid
- * looping over all pages without a reason.
+ * Check if a callback was registered to avoid looping over all
+ * pages without a reason.
*/
- down_read(&vmcore_cb_rwsem);
+ idx = srcu_read_lock(&vmcore_cb_srcu);
if (!list_empty(&vmcore_cb_list))
ret = remap_oldmem_pfn_checked(vma, from, pfn, size, prot);
else
ret = remap_oldmem_pfn_range(vma, from, pfn, size, prot);
- up_read(&vmcore_cb_rwsem);
+ srcu_read_unlock(&vmcore_cb_srcu, idx);
return ret;
}
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index f243cb5e6a4f..e26162f102ff 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -143,21 +143,22 @@ static void pstore_timer_kick(void)
mod_timer(&pstore_timer, jiffies + msecs_to_jiffies(pstore_update_ms));
}
-/*
- * Should pstore_dump() wait for a concurrent pstore_dump()? If
- * not, the current pstore_dump() will report a failure to dump
- * and return.
- */
-static bool pstore_cannot_wait(enum kmsg_dump_reason reason)
+static bool pstore_cannot_block_path(enum kmsg_dump_reason reason)
{
- /* In NMI path, pstore shouldn't block regardless of reason. */
+ /*
+ * In case of NMI path, pstore shouldn't be blocked
+ * regardless of reason.
+ */
if (in_nmi())
return true;
switch (reason) {
/* In panic case, other cpus are stopped by smp_send_stop(). */
case KMSG_DUMP_PANIC:
- /* Emergency restart shouldn't be blocked. */
+ /*
+ * Emergency restart shouldn't be blocked by spinning on
+ * pstore_info::buf_lock.
+ */
case KMSG_DUMP_EMERG:
return true;
default:
@@ -389,21 +390,19 @@ static void pstore_dump(struct kmsg_dumper *dumper,
unsigned long total = 0;
const char *why;
unsigned int part = 1;
+ unsigned long flags = 0;
int ret;
why = kmsg_dump_reason_str(reason);
- if (down_trylock(&psinfo->buf_lock)) {
- /* Failed to acquire lock: give up if we cannot wait. */
- if (pstore_cannot_wait(reason)) {
- pr_err("dump skipped in %s path: may corrupt error record\n",
- in_nmi() ? "NMI" : why);
- return;
- }
- if (down_interruptible(&psinfo->buf_lock)) {
- pr_err("could not grab semaphore?!\n");
+ if (pstore_cannot_block_path(reason)) {
+ if (!spin_trylock_irqsave(&psinfo->buf_lock, flags)) {
+ pr_err("dump skipped in %s path because of concurrent dump\n",
+ in_nmi() ? "NMI" : why);
return;
}
+ } else {
+ spin_lock_irqsave(&psinfo->buf_lock, flags);
}
kmsg_dump_rewind(&iter);
@@ -467,8 +466,7 @@ static void pstore_dump(struct kmsg_dumper *dumper,
total += record.size;
part++;
}
-
- up(&psinfo->buf_lock);
+ spin_unlock_irqrestore(&psinfo->buf_lock, flags);
}
static struct kmsg_dumper pstore_dumper = {
@@ -594,7 +592,7 @@ int pstore_register(struct pstore_info *psi)
psi->write_user = pstore_write_user_compat;
psinfo = psi;
mutex_init(&psinfo->read_mutex);
- sema_init(&psinfo->buf_lock, 1);
+ spin_lock_init(&psinfo->buf_lock);
if (psi->flags & PSTORE_FLAGS_DMESG)
allocate_buf_for_compression();
diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c
index fe5305028c6e..a89e33719fcf 100644
--- a/fs/pstore/ram_core.c
+++ b/fs/pstore/ram_core.c
@@ -263,10 +263,10 @@ ssize_t persistent_ram_ecc_string(struct persistent_ram_zone *prz,
if (prz->corrected_bytes || prz->bad_blocks)
ret = snprintf(str, len, ""
- "\n%d Corrected bytes, %d unrecoverable blocks\n",
+ "\nECC: %d Corrected bytes, %d unrecoverable blocks\n",
prz->corrected_bytes, prz->bad_blocks);
else
- ret = snprintf(str, len, "\nNo errors detected\n");
+ ret = snprintf(str, len, "\nECC: No errors detected\n");
return ret;
}
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index 3fb7fc819b4f..a635bb6615e9 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -338,7 +338,7 @@ static struct kmem_cache *qnx4_inode_cachep;
static struct inode *qnx4_alloc_inode(struct super_block *sb)
{
struct qnx4_inode_info *ei;
- ei = kmem_cache_alloc(qnx4_inode_cachep, GFP_KERNEL);
+ ei = alloc_inode_sb(sb, qnx4_inode_cachep, GFP_KERNEL);
if (!ei)
return NULL;
return &ei->vfs_inode;
diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c
index 61191f7bdf62..9d8e7e9788a1 100644
--- a/fs/qnx6/inode.c
+++ b/fs/qnx6/inode.c
@@ -597,7 +597,7 @@ static struct kmem_cache *qnx6_inode_cachep;
static struct inode *qnx6_alloc_inode(struct super_block *sb)
{
struct qnx6_inode_info *ei;
- ei = kmem_cache_alloc(qnx6_inode_cachep, GFP_KERNEL);
+ ei = alloc_inode_sb(sb, qnx6_inode_cachep, GFP_KERNEL);
if (!ei)
return NULL;
return &ei->vfs_inode;
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 22d904bde6ab..a74aef99bd3d 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -690,9 +690,14 @@ int dquot_quota_sync(struct super_block *sb, int type)
/* This is not very clever (and fast) but currently I don't know about
* any other simple way of getting quota data to disk and we must get
* them there for userspace to be visible... */
- if (sb->s_op->sync_fs)
- sb->s_op->sync_fs(sb, 1);
- sync_blockdev(sb->s_bdev);
+ if (sb->s_op->sync_fs) {
+ ret = sb->s_op->sync_fs(sb, 1);
+ if (ret)
+ return ret;
+ }
+ ret = sync_blockdev(sb->s_bdev);
+ if (ret)
+ return ret;
/*
* Now when everything is written we can discard the pagecache so
diff --git a/fs/read_write.c b/fs/read_write.c
index 0074afa7ecb3..dc5000173b80 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -385,6 +385,7 @@ int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t
return security_file_permission(file,
read_write == READ ? MAY_READ : MAY_WRITE);
}
+EXPORT_SYMBOL(rw_verify_area);
static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
{
@@ -1617,24 +1618,16 @@ int generic_write_check_limits(struct file *file, loff_t pos, loff_t *count)
return 0;
}
-/*
- * Performs necessary checks before doing a write
- *
- * Can adjust writing position or amount of bytes to write.
- * Returns appropriate error code that caller should return or
- * zero in case that write should be allowed.
- */
-ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
+/* Like generic_write_checks(), but takes size of write instead of iter. */
+int generic_write_checks_count(struct kiocb *iocb, loff_t *count)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
- loff_t count;
- int ret;
if (IS_SWAPFILE(inode))
return -ETXTBSY;
- if (!iov_iter_count(from))
+ if (!*count)
return 0;
/* FIXME: this is for backwards compatibility with 2.4 */
@@ -1644,8 +1637,23 @@ ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
return -EINVAL;
- count = iov_iter_count(from);
- ret = generic_write_check_limits(file, iocb->ki_pos, &count);
+ return generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, count);
+}
+EXPORT_SYMBOL(generic_write_checks_count);
+
+/*
+ * Performs necessary checks before doing a write
+ *
+ * Can adjust writing position or amount of bytes to write.
+ * Returns appropriate error code that caller should return or
+ * zero in case that write should be allowed.
+ */
+ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
+{
+ loff_t count = iov_iter_count(from);
+ int ret;
+
+ ret = generic_write_checks_count(iocb, &count);
if (ret)
return ret;
diff --git a/fs/reiserfs/Kconfig b/fs/reiserfs/Kconfig
index 8fd54ed8f844..33c8b0dd07a2 100644
--- a/fs/reiserfs/Kconfig
+++ b/fs/reiserfs/Kconfig
@@ -1,10 +1,14 @@
# SPDX-License-Identifier: GPL-2.0-only
config REISERFS_FS
- tristate "Reiserfs support"
+ tristate "Reiserfs support (deprecated)"
select CRC32
help
- Stores not just filenames but the files themselves in a balanced
- tree. Uses journalling.
+ Reiserfs is deprecated and scheduled to be removed from the kernel
+ in 2025. If you are still using it, please migrate to another
+ filesystem or tell us your usecase for reiserfs.
+
+ Reiserfs stores not just filenames but the files themselves in a
+ balanced tree. Uses journalling.
Balanced trees are more efficient than traditional file system
architectural foundations.
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index f49b72ccac4c..36c59b25486c 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -2763,13 +2763,6 @@ static int reiserfs_write_begin(struct file *file,
int old_ref = 0;
inode = mapping->host;
- *fsdata = NULL;
- if (flags & AOP_FLAG_CONT_EXPAND &&
- (pos & (inode->i_sb->s_blocksize - 1)) == 0) {
- pos ++;
- *fsdata = (void *)(unsigned long)flags;
- }
-
index = pos >> PAGE_SHIFT;
page = grab_cache_page_write_begin(mapping, index, flags);
if (!page)
@@ -2896,9 +2889,6 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping,
unsigned start;
bool locked = false;
- if ((unsigned long)fsdata & AOP_FLAG_CONT_EXPAND)
- pos ++;
-
reiserfs_wait_on_write_block(inode->i_sb);
if (reiserfs_transaction_running(inode->i_sb))
th = current->journal_info;
@@ -3094,7 +3084,7 @@ void sd_attrs_to_i_attrs(__u16 sd_attrs, struct inode *inode)
* decide if this buffer needs to stay around for data logging or ordered
* write purposes
*/
-static int invalidatepage_can_drop(struct inode *inode, struct buffer_head *bh)
+static int invalidate_folio_can_drop(struct inode *inode, struct buffer_head *bh)
{
int ret = 1;
struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb);
@@ -3147,26 +3137,26 @@ free_jh:
return ret;
}
-/* clm -- taken from fs/buffer.c:block_invalidate_page */
-static void reiserfs_invalidatepage(struct page *page, unsigned int offset,
- unsigned int length)
+/* clm -- taken from fs/buffer.c:block_invalidate_folio */
+static void reiserfs_invalidate_folio(struct folio *folio, size_t offset,
+ size_t length)
{
struct buffer_head *head, *bh, *next;
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
unsigned int curr_off = 0;
unsigned int stop = offset + length;
- int partial_page = (offset || length < PAGE_SIZE);
+ int partial_page = (offset || length < folio_size(folio));
int ret = 1;
- BUG_ON(!PageLocked(page));
+ BUG_ON(!folio_test_locked(folio));
if (!partial_page)
- ClearPageChecked(page);
+ folio_clear_checked(folio);
- if (!page_has_buffers(page))
+ head = folio_buffers(folio);
+ if (!head)
goto out;
- head = page_buffers(page);
bh = head;
do {
unsigned int next_off = curr_off + bh->b_size;
@@ -3179,7 +3169,7 @@ static void reiserfs_invalidatepage(struct page *page, unsigned int offset,
* is this block fully invalidated?
*/
if (offset <= curr_off) {
- if (invalidatepage_can_drop(inode, bh))
+ if (invalidate_folio_can_drop(inode, bh))
reiserfs_unmap_buffer(bh);
else
ret = 0;
@@ -3194,21 +3184,21 @@ static void reiserfs_invalidatepage(struct page *page, unsigned int offset,
* so real IO is not possible anymore.
*/
if (!partial_page && ret) {
- ret = try_to_release_page(page, 0);
+ ret = filemap_release_folio(folio, 0);
/* maybe should BUG_ON(!ret); - neilb */
}
out:
return;
}
-static int reiserfs_set_page_dirty(struct page *page)
+static bool reiserfs_dirty_folio(struct address_space *mapping,
+ struct folio *folio)
{
- struct inode *inode = page->mapping->host;
- if (reiserfs_file_data_log(inode)) {
- SetPageChecked(page);
- return __set_page_dirty_nobuffers(page);
+ if (reiserfs_file_data_log(mapping->host)) {
+ folio_set_checked(folio);
+ return filemap_dirty_folio(mapping, folio);
}
- return __set_page_dirty_buffers(page);
+ return block_dirty_folio(mapping, folio);
}
/*
@@ -3316,7 +3306,11 @@ int reiserfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
/* fill in hole pointers in the expanding truncate case. */
if (attr->ia_size > inode->i_size) {
- error = generic_cont_expand_simple(inode, attr->ia_size);
+ loff_t pos = attr->ia_size;
+
+ if ((pos & (inode->i_sb->s_blocksize - 1)) == 0)
+ pos++;
+ error = generic_cont_expand_simple(inode, pos);
if (REISERFS_I(inode)->i_prealloc_count > 0) {
int err;
struct reiserfs_transaction_handle th;
@@ -3430,10 +3424,10 @@ const struct address_space_operations reiserfs_address_space_operations = {
.readpage = reiserfs_readpage,
.readahead = reiserfs_readahead,
.releasepage = reiserfs_releasepage,
- .invalidatepage = reiserfs_invalidatepage,
+ .invalidate_folio = reiserfs_invalidate_folio,
.write_begin = reiserfs_write_begin,
.write_end = reiserfs_write_end,
.bmap = reiserfs_aop_bmap,
.direct_IO = reiserfs_direct_IO,
- .set_page_dirty = reiserfs_set_page_dirty,
+ .dirty_folio = reiserfs_dirty_folio,
};
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index a3e21160b634..b5b6f6201bed 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -858,8 +858,8 @@ loop_next:
ret = -EIO;
}
/*
- * ugly interaction with invalidatepage here.
- * reiserfs_invalidate_page will pin any buffer that has a
+ * ugly interaction with invalidate_folio here.
+ * reiserfs_invalidate_folio will pin any buffer that has a
* valid journal head from an older transaction. If someone
* else sets our buffer dirty after we write it in the first
* loop, and then someone truncates the page away, nobody
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 82e09901462e..cfb7c44c7366 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -639,7 +639,7 @@ static struct kmem_cache *reiserfs_inode_cachep;
static struct inode *reiserfs_alloc_inode(struct super_block *sb)
{
struct reiserfs_inode_info *ei;
- ei = kmem_cache_alloc(reiserfs_inode_cachep, GFP_KERNEL);
+ ei = alloc_inode_sb(sb, reiserfs_inode_cachep, GFP_KERNEL);
if (!ei)
return NULL;
atomic_set(&ei->openers, 0);
@@ -1652,6 +1652,8 @@ static int read_super_block(struct super_block *s, int offset)
return 1;
}
+ reiserfs_warning(NULL, "", "reiserfs filesystem is deprecated and "
+ "scheduled to be removed from the kernel in 2025");
SB_BUFFER_WITH_SB(s) = bh;
SB_DISK_SUPER_BLOCK(s) = rs;
diff --git a/fs/remap_range.c b/fs/remap_range.c
index 231159682907..e112b5424cdb 100644
--- a/fs/remap_range.c
+++ b/fs/remap_range.c
@@ -146,11 +146,11 @@ static int generic_remap_check_len(struct inode *inode_in,
}
/* Read a page's worth of file data into the page cache. */
-static struct folio *vfs_dedupe_get_folio(struct inode *inode, loff_t pos)
+static struct folio *vfs_dedupe_get_folio(struct file *file, loff_t pos)
{
struct folio *folio;
- folio = read_mapping_folio(inode->i_mapping, pos >> PAGE_SHIFT, NULL);
+ folio = read_mapping_folio(file->f_mapping, pos >> PAGE_SHIFT, file);
if (IS_ERR(folio))
return folio;
if (!folio_test_uptodate(folio)) {
@@ -187,8 +187,8 @@ static void vfs_unlock_two_folios(struct folio *folio1, struct folio *folio2)
* Compare extents of two files to see if they are the same.
* Caller must have locked both inodes to prevent write races.
*/
-static int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
- struct inode *dest, loff_t dstoff,
+static int vfs_dedupe_file_range_compare(struct file *src, loff_t srcoff,
+ struct file *dest, loff_t dstoff,
loff_t len, bool *is_same)
{
bool same = true;
@@ -224,8 +224,8 @@ static int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
* someone is invalidating pages on us and we lose.
*/
if (!folio_test_uptodate(src_folio) || !folio_test_uptodate(dst_folio) ||
- src_folio->mapping != src->i_mapping ||
- dst_folio->mapping != dest->i_mapping) {
+ src_folio->mapping != src->f_mapping ||
+ dst_folio->mapping != dest->f_mapping) {
same = false;
goto unlock;
}
@@ -333,8 +333,8 @@ int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
if (remap_flags & REMAP_FILE_DEDUP) {
bool is_same = false;
- ret = vfs_dedupe_file_range_compare(inode_in, pos_in,
- inode_out, pos_out, *len, &is_same);
+ ret = vfs_dedupe_file_range_compare(file_in, pos_in,
+ file_out, pos_out, *len, &is_same);
if (ret)
return ret;
if (!is_same)
@@ -362,11 +362,6 @@ loff_t do_clone_file_range(struct file *file_in, loff_t pos_in,
WARN_ON_ONCE(remap_flags & REMAP_FILE_DEDUP);
- /*
- * FICLONE/FICLONERANGE ioctls enforce that src and dest files are on
- * the same mount. Practically, they only need to be on the same file
- * system.
- */
if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb)
return -EXDEV;
@@ -458,7 +453,7 @@ loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos,
goto out_drop_write;
ret = -EXDEV;
- if (src_file->f_path.mnt != dst_file->f_path.mnt)
+ if (file_inode(src_file)->i_sb != file_inode(dst_file)->i_sb)
goto out_drop_write;
ret = -EISDIR;
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index 259f684d9236..9e6bbb4219de 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -375,7 +375,7 @@ static struct inode *romfs_alloc_inode(struct super_block *sb)
{
struct romfs_inode_info *inode;
- inode = kmem_cache_alloc(romfs_inode_cachep, GFP_KERNEL);
+ inode = alloc_inode_sb(sb, romfs_inode_cachep, GFP_KERNEL);
return inode ? &inode->vfs_inode : NULL;
}
diff --git a/fs/splice.c b/fs/splice.c
index 5dbce4dcc1a7..047b79db8eb5 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -46,45 +46,45 @@
static bool page_cache_pipe_buf_try_steal(struct pipe_inode_info *pipe,
struct pipe_buffer *buf)
{
- struct page *page = buf->page;
+ struct folio *folio = page_folio(buf->page);
struct address_space *mapping;
- lock_page(page);
+ folio_lock(folio);
- mapping = page_mapping(page);
+ mapping = folio_mapping(folio);
if (mapping) {
- WARN_ON(!PageUptodate(page));
+ WARN_ON(!folio_test_uptodate(folio));
/*
* At least for ext2 with nobh option, we need to wait on
- * writeback completing on this page, since we'll remove it
+ * writeback completing on this folio, since we'll remove it
* from the pagecache. Otherwise truncate wont wait on the
- * page, allowing the disk blocks to be reused by someone else
+ * folio, allowing the disk blocks to be reused by someone else
* before we actually wrote our data to them. fs corruption
* ensues.
*/
- wait_on_page_writeback(page);
+ folio_wait_writeback(folio);
- if (page_has_private(page) &&
- !try_to_release_page(page, GFP_KERNEL))
+ if (folio_has_private(folio) &&
+ !filemap_release_folio(folio, GFP_KERNEL))
goto out_unlock;
/*
* If we succeeded in removing the mapping, set LRU flag
* and return good.
*/
- if (remove_mapping(mapping, page)) {
+ if (remove_mapping(mapping, folio)) {
buf->flags |= PIPE_BUF_FLAG_LRU;
return true;
}
}
/*
- * Raced with truncate or failed to remove page from current
+ * Raced with truncate or failed to remove folio from current
* address space, unlock and return failure.
*/
out_unlock:
- unlock_page(page);
+ folio_unlock(folio);
return false;
}
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index b1b556dbce12..4f74abbc1a54 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -584,7 +584,7 @@ static void __exit exit_squashfs_fs(void)
static struct inode *squashfs_alloc_inode(struct super_block *sb)
{
struct squashfs_inode_info *ei =
- kmem_cache_alloc(squashfs_inode_cachep, GFP_KERNEL);
+ alloc_inode_sb(sb, squashfs_inode_cachep, GFP_KERNEL);
return ei ? &ei->vfs_inode : NULL;
}
diff --git a/fs/stat.c b/fs/stat.c
index 28d2020ba1f4..7f734be0e57e 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -184,6 +184,20 @@ int vfs_fstat(int fd, struct kstat *stat)
return error;
}
+int getname_statx_lookup_flags(int flags)
+{
+ int lookup_flags = 0;
+
+ if (!(flags & AT_SYMLINK_NOFOLLOW))
+ lookup_flags |= LOOKUP_FOLLOW;
+ if (!(flags & AT_NO_AUTOMOUNT))
+ lookup_flags |= LOOKUP_AUTOMOUNT;
+ if (flags & AT_EMPTY_PATH)
+ lookup_flags |= LOOKUP_EMPTY;
+
+ return lookup_flags;
+}
+
/**
* vfs_statx - Get basic and extra attributes by filename
* @dfd: A file descriptor representing the base dir for a relative filename
@@ -199,26 +213,19 @@ int vfs_fstat(int fd, struct kstat *stat)
*
* 0 will be returned on success, and a -ve error code if unsuccessful.
*/
-static int vfs_statx(int dfd, const char __user *filename, int flags,
+static int vfs_statx(int dfd, struct filename *filename, int flags,
struct kstat *stat, u32 request_mask)
{
struct path path;
- unsigned lookup_flags = 0;
+ unsigned int lookup_flags = getname_statx_lookup_flags(flags);
int error;
if (flags & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT | AT_EMPTY_PATH |
AT_STATX_SYNC_TYPE))
return -EINVAL;
- if (!(flags & AT_SYMLINK_NOFOLLOW))
- lookup_flags |= LOOKUP_FOLLOW;
- if (!(flags & AT_NO_AUTOMOUNT))
- lookup_flags |= LOOKUP_AUTOMOUNT;
- if (flags & AT_EMPTY_PATH)
- lookup_flags |= LOOKUP_EMPTY;
-
retry:
- error = user_path_at(dfd, filename, lookup_flags, &path);
+ error = filename_lookup(dfd, filename, lookup_flags, &path, NULL);
if (error)
goto out;
@@ -240,8 +247,15 @@ out:
int vfs_fstatat(int dfd, const char __user *filename,
struct kstat *stat, int flags)
{
- return vfs_statx(dfd, filename, flags | AT_NO_AUTOMOUNT,
- stat, STATX_BASIC_STATS);
+ int ret;
+ int statx_flags = flags | AT_NO_AUTOMOUNT;
+ struct filename *name;
+
+ name = getname_flags(filename, getname_statx_lookup_flags(statx_flags), NULL);
+ ret = vfs_statx(dfd, name, statx_flags, stat, STATX_BASIC_STATS);
+ putname(name);
+
+ return ret;
}
#ifdef __ARCH_WANT_OLD_STAT
@@ -602,7 +616,7 @@ cp_statx(const struct kstat *stat, struct statx __user *buffer)
return copy_to_user(buffer, &tmp, sizeof(tmp)) ? -EFAULT : 0;
}
-int do_statx(int dfd, const char __user *filename, unsigned flags,
+int do_statx(int dfd, struct filename *filename, unsigned int flags,
unsigned int mask, struct statx __user *buffer)
{
struct kstat stat;
@@ -636,7 +650,14 @@ SYSCALL_DEFINE5(statx,
unsigned int, mask,
struct statx __user *, buffer)
{
- return do_statx(dfd, filename, flags, mask, buffer);
+ int ret;
+ struct filename *name;
+
+ name = getname_flags(filename, getname_statx_lookup_flags(flags), NULL);
+ ret = do_statx(dfd, name, flags, mask, buffer);
+ putname(name);
+
+ return ret;
}
#ifdef CONFIG_COMPAT
diff --git a/fs/super.c b/fs/super.c
index 7af820ba5ad5..f1d4a193602d 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1616,11 +1616,9 @@ static void lockdep_sb_freeze_acquire(struct super_block *sb)
percpu_rwsem_acquire(sb->s_writers.rw_sem + level, 0, _THIS_IP_);
}
-static void sb_freeze_unlock(struct super_block *sb)
+static void sb_freeze_unlock(struct super_block *sb, int level)
{
- int level;
-
- for (level = SB_FREEZE_LEVELS - 1; level >= 0; level--)
+ for (level--; level >= 0; level--)
percpu_up_write(sb->s_writers.rw_sem + level);
}
@@ -1691,7 +1689,14 @@ int freeze_super(struct super_block *sb)
sb_wait_write(sb, SB_FREEZE_PAGEFAULT);
/* All writers are done so after syncing there won't be dirty data */
- sync_filesystem(sb);
+ ret = sync_filesystem(sb);
+ if (ret) {
+ sb->s_writers.frozen = SB_UNFROZEN;
+ sb_freeze_unlock(sb, SB_FREEZE_PAGEFAULT);
+ wake_up(&sb->s_writers.wait_unfrozen);
+ deactivate_locked_super(sb);
+ return ret;
+ }
/* Now wait for internal filesystem counter */
sb->s_writers.frozen = SB_FREEZE_FS;
@@ -1703,7 +1708,7 @@ int freeze_super(struct super_block *sb)
printk(KERN_ERR
"VFS:Filesystem freeze failed\n");
sb->s_writers.frozen = SB_UNFROZEN;
- sb_freeze_unlock(sb);
+ sb_freeze_unlock(sb, SB_FREEZE_FS);
wake_up(&sb->s_writers.wait_unfrozen);
deactivate_locked_super(sb);
return ret;
@@ -1748,7 +1753,7 @@ static int thaw_super_locked(struct super_block *sb)
}
sb->s_writers.frozen = SB_UNFROZEN;
- sb_freeze_unlock(sb);
+ sb_freeze_unlock(sb, SB_FREEZE_FS);
out:
wake_up(&sb->s_writers.wait_unfrozen);
deactivate_locked_super(sb);
diff --git a/fs/sync.c b/fs/sync.c
index 3ce8e2137f31..c7690016453e 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -29,7 +29,7 @@
*/
int sync_filesystem(struct super_block *sb)
{
- int ret;
+ int ret = 0;
/*
* We need to be protected against the filesystem going from
@@ -52,15 +52,21 @@ int sync_filesystem(struct super_block *sb)
* at a time.
*/
writeback_inodes_sb(sb, WB_REASON_SYNC);
- if (sb->s_op->sync_fs)
- sb->s_op->sync_fs(sb, 0);
+ if (sb->s_op->sync_fs) {
+ ret = sb->s_op->sync_fs(sb, 0);
+ if (ret)
+ return ret;
+ }
ret = sync_blockdev_nowait(sb->s_bdev);
- if (ret < 0)
+ if (ret)
return ret;
sync_inodes_sb(sb);
- if (sb->s_op->sync_fs)
- sb->s_op->sync_fs(sb, 1);
+ if (sb->s_op->sync_fs) {
+ ret = sb->s_op->sync_fs(sb, 1);
+ if (ret)
+ return ret;
+ }
return sync_blockdev(sb->s_bdev);
}
EXPORT_SYMBOL(sync_filesystem);
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index be47263b8605..9e8d4a6fb2f3 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -306,7 +306,7 @@ static struct inode *sysv_alloc_inode(struct super_block *sb)
{
struct sysv_inode_info *si;
- si = kmem_cache_alloc(sysv_inode_cachep, GFP_KERNEL);
+ si = alloc_inode_sb(sb, sysv_inode_cachep, GFP_KERNEL);
if (!si)
return NULL;
return &si->vfs_inode;
diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c
index 749385015a8d..409ab5e17803 100644
--- a/fs/sysv/itree.c
+++ b/fs/sysv/itree.c
@@ -495,7 +495,8 @@ static sector_t sysv_bmap(struct address_space *mapping, sector_t block)
}
const struct address_space_operations sysv_aops = {
- .set_page_dirty = __set_page_dirty_buffers,
+ .dirty_folio = block_dirty_folio,
+ .invalidate_folio = block_invalidate_folio,
.readpage = sysv_readpage,
.writepage = sysv_writepage,
.write_begin = sysv_write_begin,
diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
index bafc02bf8220..de7252715b12 100644
--- a/fs/tracefs/inode.c
+++ b/fs/tracefs/inode.c
@@ -264,7 +264,6 @@ static int tracefs_parse_options(char *data, struct tracefs_mount_opts *opts)
if (!gid_valid(gid))
return -EINVAL;
opts->gid = gid;
- set_gid(tracefs_mount->mnt_root, gid);
break;
case Opt_mode:
if (match_octal(&args[0], &option))
@@ -291,7 +290,9 @@ static int tracefs_apply_options(struct super_block *sb)
inode->i_mode |= opts->mode;
inode->i_uid = opts->uid;
- inode->i_gid = opts->gid;
+
+ /* Set all the group ids to the mount option */
+ set_gid(sb->s_root, opts->gid);
return 0;
}
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 5cfa28cd00cd..8a9ffc2d4167 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1287,25 +1287,25 @@ int ubifs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
return err;
}
-static void ubifs_invalidatepage(struct page *page, unsigned int offset,
- unsigned int length)
+static void ubifs_invalidate_folio(struct folio *folio, size_t offset,
+ size_t length)
{
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
struct ubifs_info *c = inode->i_sb->s_fs_info;
- ubifs_assert(c, PagePrivate(page));
- if (offset || length < PAGE_SIZE)
- /* Partial page remains dirty */
+ ubifs_assert(c, folio_test_private(folio));
+ if (offset || length < folio_size(folio))
+ /* Partial folio remains dirty */
return;
- if (PageChecked(page))
+ if (folio_test_checked(folio))
release_new_page_budget(c);
else
release_existing_page_budget(c);
atomic_long_dec(&c->dirty_pg_cnt);
- ClearPagePrivate(page);
- ClearPageChecked(page);
+ folio_clear_private(folio);
+ folio_clear_checked(folio);
}
int ubifs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
@@ -1445,18 +1445,18 @@ static ssize_t ubifs_write_iter(struct kiocb *iocb, struct iov_iter *from)
return generic_file_write_iter(iocb, from);
}
-static int ubifs_set_page_dirty(struct page *page)
+static bool ubifs_dirty_folio(struct address_space *mapping,
+ struct folio *folio)
{
- int ret;
- struct inode *inode = page->mapping->host;
- struct ubifs_info *c = inode->i_sb->s_fs_info;
+ bool ret;
+ struct ubifs_info *c = mapping->host->i_sb->s_fs_info;
- ret = __set_page_dirty_nobuffers(page);
+ ret = filemap_dirty_folio(mapping, folio);
/*
* An attempt to dirty a page without budgeting for it - should not
* happen.
*/
- ubifs_assert(c, ret == 0);
+ ubifs_assert(c, ret == false);
return ret;
}
@@ -1646,8 +1646,8 @@ const struct address_space_operations ubifs_file_address_operations = {
.writepage = ubifs_writepage,
.write_begin = ubifs_write_begin,
.write_end = ubifs_write_end,
- .invalidatepage = ubifs_invalidatepage,
- .set_page_dirty = ubifs_set_page_dirty,
+ .invalidate_folio = ubifs_invalidate_folio,
+ .dirty_folio = ubifs_dirty_folio,
#ifdef CONFIG_MIGRATION
.migratepage = ubifs_migrate_page,
#endif
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index aa7a1381c457..bad67455215f 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -268,7 +268,7 @@ static struct inode *ubifs_alloc_inode(struct super_block *sb)
{
struct ubifs_inode *ui;
- ui = kmem_cache_alloc(ubifs_inode_slab, GFP_NOFS);
+ ui = alloc_inode_sb(sb, ubifs_inode_slab, GFP_NOFS);
if (!ui)
return NULL;
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 1baff8ddb754..0f6bf2504437 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -125,7 +125,8 @@ static int udf_adinicb_write_end(struct file *file, struct address_space *mappin
}
const struct address_space_operations udf_adinicb_aops = {
- .set_page_dirty = __set_page_dirty_buffers,
+ .dirty_folio = block_dirty_folio,
+ .invalidate_folio = block_invalidate_folio,
.readpage = udf_adinicb_readpage,
.writepage = udf_adinicb_writepage,
.write_begin = udf_adinicb_write_begin,
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index ea8f6cd01f50..ca4fa710e562 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -235,7 +235,8 @@ static sector_t udf_bmap(struct address_space *mapping, sector_t block)
}
const struct address_space_operations udf_aops = {
- .set_page_dirty = __set_page_dirty_buffers,
+ .dirty_folio = block_dirty_folio,
+ .invalidate_folio = block_invalidate_folio,
.readpage = udf_readpage,
.readahead = udf_readahead,
.writepage = udf_writepage,
diff --git a/fs/udf/super.c b/fs/udf/super.c
index f26b5e0b84b6..4042d9739fb7 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -136,7 +136,7 @@ static struct kmem_cache *udf_inode_cachep;
static struct inode *udf_alloc_inode(struct super_block *sb)
{
struct udf_inode_info *ei;
- ei = kmem_cache_alloc(udf_inode_cachep, GFP_KERNEL);
+ ei = alloc_inode_sb(sb, udf_inode_cachep, GFP_KERNEL);
if (!ei)
return NULL;
@@ -2474,7 +2474,6 @@ static unsigned int udf_count_free_table(struct super_block *sb,
unsigned int accum = 0;
uint32_t elen;
struct kernel_lb_addr eloc;
- int8_t etype;
struct extent_position epos;
mutex_lock(&UDF_SB(sb)->s_alloc_mutex);
@@ -2482,7 +2481,7 @@ static unsigned int udf_count_free_table(struct super_block *sb,
epos.offset = sizeof(struct unallocSpaceEntry);
epos.bh = NULL;
- while ((etype = udf_next_aext(table, &epos, &eloc, &elen, 1)) != -1)
+ while (udf_next_aext(table, &epos, &eloc, &elen, 1) != -1)
accum += (elen >> table->i_sb->s_blocksize_bits);
brelse(epos.bh);
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index ac628de69601..d0dda01620f0 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -526,7 +526,8 @@ static sector_t ufs_bmap(struct address_space *mapping, sector_t block)
}
const struct address_space_operations ufs_aops = {
- .set_page_dirty = __set_page_dirty_buffers,
+ .dirty_folio = block_dirty_folio,
+ .invalidate_folio = block_invalidate_folio,
.readpage = ufs_readpage,
.writepage = ufs_writepage,
.write_begin = ufs_write_begin,
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 00a01471ea05..23377c1baed9 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -1443,7 +1443,7 @@ static struct inode *ufs_alloc_inode(struct super_block *sb)
{
struct ufs_inode_info *ei;
- ei = kmem_cache_alloc(ufs_inode_cachep, GFP_NOFS);
+ ei = alloc_inode_sb(sb, ufs_inode_cachep, GFP_NOFS);
if (!ei)
return NULL;
diff --git a/fs/unicode/Kconfig b/fs/unicode/Kconfig
index 610d7bc05d6e..da786a687fdc 100644
--- a/fs/unicode/Kconfig
+++ b/fs/unicode/Kconfig
@@ -3,21 +3,13 @@
# UTF-8 normalization
#
config UNICODE
- bool "UTF-8 normalization and casefolding support"
+ tristate "UTF-8 normalization and casefolding support"
help
Say Y here to enable UTF-8 NFD normalization and NFD+CF casefolding
- support.
-
-config UNICODE_UTF8_DATA
- tristate "UTF-8 normalization and casefolding tables"
- depends on UNICODE
- default UNICODE
- help
- This contains a large table of case foldings, which can be loaded as
- a separate module if you say M here. To be on the safe side stick
- to the default of Y. Saying N here makes no sense, if you do not want
- utf8 casefolding support, disable CONFIG_UNICODE instead.
+ support. If you say M here the large table of case foldings will
+ be a separate loadable module that gets requested only when a file
+ system actually use it.
config UNICODE_NORMALIZATION_SELFTEST
tristate "Test UTF-8 normalization support"
- depends on UNICODE_UTF8_DATA
+ depends on UNICODE
diff --git a/fs/unicode/Makefile b/fs/unicode/Makefile
index 2f9d9188852b..0cc87423de82 100644
--- a/fs/unicode/Makefile
+++ b/fs/unicode/Makefile
@@ -1,8 +1,10 @@
# SPDX-License-Identifier: GPL-2.0
-obj-$(CONFIG_UNICODE) += unicode.o
+ifneq ($(CONFIG_UNICODE),)
+obj-y += unicode.o
+endif
+obj-$(CONFIG_UNICODE) += utf8data.o
obj-$(CONFIG_UNICODE_NORMALIZATION_SELFTEST) += utf8-selftest.o
-obj-$(CONFIG_UNICODE_UTF8_DATA) += utf8data.o
unicode-y := utf8-norm.o utf8-core.o
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index e26b10132d47..aa0c47cb0d16 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -198,6 +198,9 @@ static inline struct uffd_msg userfault_msg(unsigned long address,
struct uffd_msg msg;
msg_init(&msg);
msg.event = UFFD_EVENT_PAGEFAULT;
+
+ if (!(features & UFFD_FEATURE_EXACT_ADDRESS))
+ address &= PAGE_MASK;
msg.arg.pagefault.address = address;
/*
* These flags indicate why the userfault occurred:
@@ -482,7 +485,7 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function);
uwq.wq.private = current;
- uwq.msg = userfault_msg(vmf->address, vmf->flags, reason,
+ uwq.msg = userfault_msg(vmf->real_address, vmf->flags, reason,
ctx->features);
uwq.ctx = ctx;
uwq.waken = false;
@@ -878,7 +881,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
new_flags, vma->anon_vma,
vma->vm_file, vma->vm_pgoff,
vma_policy(vma),
- NULL_VM_UFFD_CTX, vma_anon_name(vma));
+ NULL_VM_UFFD_CTX, anon_vma_name(vma));
if (prev)
vma = prev;
else
@@ -1438,7 +1441,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
vma->anon_vma, vma->vm_file, vma->vm_pgoff,
vma_policy(vma),
((struct vm_userfaultfd_ctx){ ctx }),
- vma_anon_name(vma));
+ anon_vma_name(vma));
if (prev) {
vma = prev;
goto next;
@@ -1615,7 +1618,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
prev = vma_merge(mm, prev, start, vma_end, new_flags,
vma->anon_vma, vma->vm_file, vma->vm_pgoff,
vma_policy(vma),
- NULL_VM_UFFD_CTX, vma_anon_name(vma));
+ NULL_VM_UFFD_CTX, anon_vma_name(vma));
if (prev) {
vma = prev;
goto next;
diff --git a/fs/vboxsf/file.c b/fs/vboxsf/file.c
index 864c2fad23be..d74e0d336995 100644
--- a/fs/vboxsf/file.c
+++ b/fs/vboxsf/file.c
@@ -354,7 +354,7 @@ out:
const struct address_space_operations vboxsf_reg_aops = {
.readpage = vboxsf_readpage,
.writepage = vboxsf_writepage,
- .set_page_dirty = __set_page_dirty_nobuffers,
+ .dirty_folio = filemap_dirty_folio,
.write_begin = simple_write_begin,
.write_end = vboxsf_write_end,
};
diff --git a/fs/vboxsf/super.c b/fs/vboxsf/super.c
index 37dd3fe5b1e9..d2f6df69f611 100644
--- a/fs/vboxsf/super.c
+++ b/fs/vboxsf/super.c
@@ -241,7 +241,7 @@ static struct inode *vboxsf_alloc_inode(struct super_block *sb)
{
struct vboxsf_inode *sf_i;
- sf_i = kmem_cache_alloc(vboxsf_inode_cachep, GFP_NOFS);
+ sf_i = alloc_inode_sb(sb, vboxsf_inode_cachep, GFP_NOFS);
if (!sf_i)
return NULL;
diff --git a/fs/vboxsf/utils.c b/fs/vboxsf/utils.c
index aec2ebf7d25a..e1db0f3f7e5e 100644
--- a/fs/vboxsf/utils.c
+++ b/fs/vboxsf/utils.c
@@ -9,6 +9,7 @@
#include <linux/namei.h>
#include <linux/nls.h>
#include <linux/sizes.h>
+#include <linux/pagemap.h>
#include <linux/vfs.h>
#include "vfsmod.h"
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index f18a875f51c6..c1500b238520 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -2818,7 +2818,7 @@ xfs_btree_split_worker(
* in any way.
*/
if (args->kswapd)
- new_pflags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
+ new_pflags |= PF_MEMALLOC | PF_KSWAPD;
current_set_flags_nested(&pflags, new_pflags);
xfs_trans_set_context(args->cur->bc_tp);
diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c
index 50546eadaae2..5f1e4799e8fa 100644
--- a/fs/xfs/libxfs/xfs_dir2.c
+++ b/fs/xfs/libxfs/xfs_dir2.c
@@ -19,7 +19,11 @@
#include "xfs_error.h"
#include "xfs_trace.h"
-struct xfs_name xfs_name_dotdot = { (unsigned char *)"..", 2, XFS_DIR3_FT_DIR };
+const struct xfs_name xfs_name_dotdot = {
+ .name = (const unsigned char *)"..",
+ .len = 2,
+ .type = XFS_DIR3_FT_DIR,
+};
/*
* Convert inode mode to directory entry filetype
@@ -54,10 +58,10 @@ xfs_mode_to_ftype(
*/
xfs_dahash_t
xfs_ascii_ci_hashname(
- struct xfs_name *name)
+ const struct xfs_name *name)
{
- xfs_dahash_t hash;
- int i;
+ xfs_dahash_t hash;
+ int i;
for (i = 0, hash = 0; i < name->len; i++)
hash = tolower(name->name[i]) ^ rol32(hash, 7);
@@ -243,7 +247,7 @@ int
xfs_dir_createname(
struct xfs_trans *tp,
struct xfs_inode *dp,
- struct xfs_name *name,
+ const struct xfs_name *name,
xfs_ino_t inum, /* new entry inode number */
xfs_extlen_t total) /* bmap's total block count */
{
@@ -337,16 +341,16 @@ xfs_dir_cilookup_result(
int
xfs_dir_lookup(
- xfs_trans_t *tp,
- xfs_inode_t *dp,
- struct xfs_name *name,
- xfs_ino_t *inum, /* out: inode number */
- struct xfs_name *ci_name) /* out: actual name if CI match */
+ struct xfs_trans *tp,
+ struct xfs_inode *dp,
+ const struct xfs_name *name,
+ xfs_ino_t *inum, /* out: inode number */
+ struct xfs_name *ci_name) /* out: actual name if CI match */
{
- struct xfs_da_args *args;
- int rval;
- int v; /* type-checking value */
- int lock_mode;
+ struct xfs_da_args *args;
+ int rval;
+ int v; /* type-checking value */
+ int lock_mode;
ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
XFS_STATS_INC(dp->i_mount, xs_dir_lookup);
@@ -475,7 +479,7 @@ int
xfs_dir_replace(
struct xfs_trans *tp,
struct xfs_inode *dp,
- struct xfs_name *name, /* name of entry to replace */
+ const struct xfs_name *name, /* name of entry to replace */
xfs_ino_t inum, /* new inode number */
xfs_extlen_t total) /* bmap's total block count */
{
@@ -728,7 +732,7 @@ xfs_dir2_namecheck(
xfs_dahash_t
xfs_dir2_hashname(
struct xfs_mount *mp,
- struct xfs_name *name)
+ const struct xfs_name *name)
{
if (unlikely(xfs_has_asciici(mp)))
return xfs_ascii_ci_hashname(name);
diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h
index d03e6098ded9..b6df3c34b26a 100644
--- a/fs/xfs/libxfs/xfs_dir2.h
+++ b/fs/xfs/libxfs/xfs_dir2.h
@@ -21,7 +21,7 @@ struct xfs_dir2_data_unused;
struct xfs_dir3_icfree_hdr;
struct xfs_dir3_icleaf_hdr;
-extern struct xfs_name xfs_name_dotdot;
+extern const struct xfs_name xfs_name_dotdot;
/*
* Convert inode mode to directory entry filetype
@@ -39,16 +39,16 @@ extern int xfs_dir_isempty(struct xfs_inode *dp);
extern int xfs_dir_init(struct xfs_trans *tp, struct xfs_inode *dp,
struct xfs_inode *pdp);
extern int xfs_dir_createname(struct xfs_trans *tp, struct xfs_inode *dp,
- struct xfs_name *name, xfs_ino_t inum,
+ const struct xfs_name *name, xfs_ino_t inum,
xfs_extlen_t tot);
extern int xfs_dir_lookup(struct xfs_trans *tp, struct xfs_inode *dp,
- struct xfs_name *name, xfs_ino_t *inum,
+ const struct xfs_name *name, xfs_ino_t *inum,
struct xfs_name *ci_name);
extern int xfs_dir_removename(struct xfs_trans *tp, struct xfs_inode *dp,
struct xfs_name *name, xfs_ino_t ino,
xfs_extlen_t tot);
extern int xfs_dir_replace(struct xfs_trans *tp, struct xfs_inode *dp,
- struct xfs_name *name, xfs_ino_t inum,
+ const struct xfs_name *name, xfs_ino_t inum,
xfs_extlen_t tot);
extern int xfs_dir_canenter(struct xfs_trans *tp, struct xfs_inode *dp,
struct xfs_name *name);
diff --git a/fs/xfs/libxfs/xfs_dir2_priv.h b/fs/xfs/libxfs/xfs_dir2_priv.h
index 711709a2aa53..7404a9ff1a92 100644
--- a/fs/xfs/libxfs/xfs_dir2_priv.h
+++ b/fs/xfs/libxfs/xfs_dir2_priv.h
@@ -40,7 +40,7 @@ struct xfs_dir3_icfree_hdr {
};
/* xfs_dir2.c */
-xfs_dahash_t xfs_ascii_ci_hashname(struct xfs_name *name);
+xfs_dahash_t xfs_ascii_ci_hashname(const struct xfs_name *name);
enum xfs_dacmp xfs_ascii_ci_compname(struct xfs_da_args *args,
const unsigned char *name, int len);
extern int xfs_dir2_grow_inode(struct xfs_da_args *args, int space,
@@ -201,7 +201,8 @@ xfs_dir2_data_entsize(
return round_up(len, XFS_DIR2_DATA_ALIGN);
}
-xfs_dahash_t xfs_dir2_hashname(struct xfs_mount *mp, struct xfs_name *name);
+xfs_dahash_t xfs_dir2_hashname(struct xfs_mount *mp,
+ const struct xfs_name *name);
enum xfs_dacmp xfs_dir2_compname(struct xfs_da_args *args,
const unsigned char *name, int len);
diff --git a/fs/xfs/scrub/attr.h b/fs/xfs/scrub/attr.h
index 1719e1c4da59..3590e10e3e62 100644
--- a/fs/xfs/scrub/attr.h
+++ b/fs/xfs/scrub/attr.h
@@ -24,7 +24,7 @@ struct xchk_xattr_buf {
* space bitmap follows immediately after; and we have a third buffer
* for storing intermediate bitmap results.
*/
- uint8_t buf[0];
+ uint8_t buf[];
};
/* A place to store attribute values. */
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 2705f91bdd0d..90b7f4d127de 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -136,7 +136,20 @@ done:
memalloc_nofs_restore(nofs_flag);
}
-/* Finish all pending io completions. */
+/*
+ * Finish all pending IO completions that require transactional modifications.
+ *
+ * We try to merge physical and logically contiguous ioends before completion to
+ * minimise the number of transactions we need to perform during IO completion.
+ * Both unwritten extent conversion and COW remapping need to iterate and modify
+ * one physical extent at a time, so we gain nothing by merging physically
+ * discontiguous extents here.
+ *
+ * The ioend chain length that we can be processing here is largely unbound in
+ * length and we may have to perform significant amounts of work on each ioend
+ * to complete it. Hence we have to be careful about holding the CPU for too
+ * long in this loop.
+ */
void
xfs_end_io(
struct work_struct *work)
@@ -157,6 +170,7 @@ xfs_end_io(
list_del_init(&ioend->io_list);
iomap_ioend_try_merge(ioend, &tmp);
xfs_end_ioend(ioend);
+ cond_resched();
}
}
@@ -553,9 +567,9 @@ const struct address_space_operations xfs_address_space_operations = {
.readpage = xfs_vm_readpage,
.readahead = xfs_vm_readahead,
.writepages = xfs_vm_writepages,
- .set_page_dirty = __set_page_dirty_nobuffers,
+ .dirty_folio = filemap_dirty_folio,
.releasepage = iomap_releasepage,
- .invalidatepage = iomap_invalidatepage,
+ .invalidate_folio = iomap_invalidate_folio,
.bmap = xfs_vm_bmap,
.direct_IO = noop_direct_IO,
.migratepage = iomap_migrate_page,
@@ -567,7 +581,6 @@ const struct address_space_operations xfs_address_space_operations = {
const struct address_space_operations xfs_dax_aops = {
.writepages = xfs_dax_writepages,
.direct_IO = noop_direct_IO,
- .set_page_dirty = __set_page_dirty_no_writeback,
- .invalidatepage = noop_invalidatepage,
+ .dirty_folio = noop_dirty_folio,
.swap_activate = xfs_iomap_swapfile_activate,
};
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index e1f4d7d5a011..761dde155099 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -463,7 +463,7 @@ xfs_bui_item_recover(
struct xfs_bui_log_item *buip = BUI_ITEM(lip);
struct xfs_trans *tp;
struct xfs_inode *ip = NULL;
- struct xfs_mount *mp = lip->li_mountp;
+ struct xfs_mount *mp = lip->li_log->l_mp;
struct xfs_map_extent *bmap;
struct xfs_bud_log_item *budp;
xfs_filblks_t count;
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index d4a387d3d0ce..eb2e387ba528 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -850,9 +850,6 @@ xfs_alloc_file_space(
rblocks = 0;
}
- /*
- * Allocate and setup the transaction.
- */
error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write,
dblocks, rblocks, false, &tp);
if (error)
@@ -869,9 +866,9 @@ xfs_alloc_file_space(
if (error)
goto error;
- /*
- * Complete the transaction
- */
+ ip->i_diflags |= XFS_DIFLAG_PREALLOC;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
error = xfs_trans_commit(tp);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
if (error)
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index ae87fd95b17e..e1afb9e503e1 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -14,6 +14,7 @@
#include "xfs_trace.h"
#include "xfs_log.h"
#include "xfs_log_recover.h"
+#include "xfs_log_priv.h"
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_errortag.h"
@@ -813,7 +814,15 @@ xfs_buf_read_map(
* buffer.
*/
if (error) {
- if (!xfs_is_shutdown(target->bt_mount))
+ /*
+ * Check against log shutdown for error reporting because
+ * metadata writeback may require a read first and we need to
+ * report errors in metadata writeback until the log is shut
+ * down. High level transaction read functions already check
+ * against mount shutdown, anyway, so we only need to be
+ * concerned about low level IO interactions here.
+ */
+ if (!xlog_is_shutdown(target->bt_mount->m_log))
xfs_buf_ioerror_alert(bp, fa);
bp->b_flags &= ~XBF_DONE;
@@ -843,9 +852,6 @@ xfs_buf_readahead_map(
{
struct xfs_buf *bp;
- if (bdi_read_congested(target->bt_bdev->bd_disk->bdi))
- return;
-
xfs_buf_read_map(target, map, nmaps,
XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD, &bp, ops,
__this_address);
@@ -1177,10 +1183,10 @@ xfs_buf_ioend_handle_error(
struct xfs_error_cfg *cfg;
/*
- * If we've already decided to shutdown the filesystem because of I/O
- * errors, there's no point in giving this a retry.
+ * If we've already shutdown the journal because of I/O errors, there's
+ * no point in giving this a retry.
*/
- if (xfs_is_shutdown(mp))
+ if (xlog_is_shutdown(mp->m_log))
goto out_stale;
xfs_buf_ioerror_alert_ratelimited(bp);
@@ -1591,8 +1597,23 @@ __xfs_buf_submit(
ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
- /* on shutdown we stale and complete the buffer immediately */
- if (xfs_is_shutdown(bp->b_mount)) {
+ /*
+ * On log shutdown we stale and complete the buffer immediately. We can
+ * be called to read the superblock before the log has been set up, so
+ * be careful checking the log state.
+ *
+ * Checking the mount shutdown state here can result in the log tail
+ * moving inappropriately on disk as the log may not yet be shut down.
+ * i.e. failing this buffer on mount shutdown can remove it from the AIL
+ * and move the tail of the log forwards without having written this
+ * buffer to disk. This corrupts the log tail state in memory, and
+ * because the log may not be shut down yet, it can then be propagated
+ * to disk before the log is shutdown. Hence we check log shutdown
+ * state here rather than mount state to avoid corrupting the log tail
+ * on shutdown.
+ */
+ if (bp->b_mount->m_log &&
+ xlog_is_shutdown(bp->b_mount->m_log)) {
xfs_buf_ioend_fail(bp);
return -EIO;
}
@@ -1806,10 +1827,10 @@ xfs_buftarg_drain(
* If one or more failed buffers were freed, that means dirty metadata
* was thrown away. This should only ever happen after I/O completion
* handling has elevated I/O error(s) to permanent failures and shuts
- * down the fs.
+ * down the journal.
*/
if (write_fail) {
- ASSERT(xfs_is_shutdown(btp->bt_mount));
+ ASSERT(xlog_is_shutdown(btp->bt_mount->m_log));
xfs_alert(btp->bt_mount,
"Please run xfs_repair to determine the extent of the problem.");
}
@@ -2092,12 +2113,13 @@ xfs_buf_delwri_submit_buffers(
blk_start_plug(&plug);
list_for_each_entry_safe(bp, n, buffer_list, b_list) {
if (!wait_list) {
+ if (!xfs_buf_trylock(bp))
+ continue;
if (xfs_buf_ispinned(bp)) {
+ xfs_buf_unlock(bp);
pinned++;
continue;
}
- if (!xfs_buf_trylock(bp))
- continue;
} else {
xfs_buf_lock(bp);
}
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index a7a8e4528881..522d450a94b1 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -21,6 +21,7 @@
#include "xfs_dquot.h"
#include "xfs_trace.h"
#include "xfs_log.h"
+#include "xfs_log_priv.h"
struct kmem_cache *xfs_buf_item_cache;
@@ -428,7 +429,7 @@ xfs_buf_item_format(
* occurs during recovery.
*/
if (bip->bli_flags & XFS_BLI_INODE_BUF) {
- if (xfs_has_v3inodes(lip->li_mountp) ||
+ if (xfs_has_v3inodes(lip->li_log->l_mp) ||
!((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) &&
xfs_log_item_in_current_chkpt(lip)))
bip->__bli_format.blf_flags |= XFS_BLF_INODE_BUF;
@@ -616,7 +617,7 @@ xfs_buf_item_put(
* that case, the bli is freed on buffer writeback completion.
*/
aborted = test_bit(XFS_LI_ABORTED, &lip->li_flags) ||
- xfs_is_shutdown(lip->li_mountp);
+ xlog_is_shutdown(lip->li_log);
dirty = bip->bli_flags & XFS_BLI_DIRTY;
if (dirty && !aborted)
return false;
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 47ef9c9c5c17..0e50f2c9348e 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -604,7 +604,7 @@ xfs_efi_item_recover(
struct list_head *capture_list)
{
struct xfs_efi_log_item *efip = EFI_ITEM(lip);
- struct xfs_mount *mp = lip->li_mountp;
+ struct xfs_mount *mp = lip->li_log->l_mp;
struct xfs_efd_log_item *efdp;
struct xfs_trans *tp;
struct xfs_extent *extp;
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 22ad207bedf4..5bddb1e9e0b3 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -66,40 +66,6 @@ xfs_is_falloc_aligned(
return !((pos | len) & mask);
}
-int
-xfs_update_prealloc_flags(
- struct xfs_inode *ip,
- enum xfs_prealloc_flags flags)
-{
- struct xfs_trans *tp;
- int error;
-
- error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_writeid,
- 0, 0, 0, &tp);
- if (error)
- return error;
-
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-
- if (!(flags & XFS_PREALLOC_INVISIBLE)) {
- VFS_I(ip)->i_mode &= ~S_ISUID;
- if (VFS_I(ip)->i_mode & S_IXGRP)
- VFS_I(ip)->i_mode &= ~S_ISGID;
- xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
- }
-
- if (flags & XFS_PREALLOC_SET)
- ip->i_diflags |= XFS_DIFLAG_PREALLOC;
- if (flags & XFS_PREALLOC_CLEAR)
- ip->i_diflags &= ~XFS_DIFLAG_PREALLOC;
-
- xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- if (flags & XFS_PREALLOC_SYNC)
- xfs_trans_set_sync(tp);
- return xfs_trans_commit(tp);
-}
-
/*
* Fsync operations on directories are much simpler than on regular files,
* as there is no file data to flush, and thus also no need for explicit
@@ -895,6 +861,21 @@ xfs_break_layouts(
return error;
}
+/* Does this file, inode, or mount want synchronous writes? */
+static inline bool xfs_file_sync_writes(struct file *filp)
+{
+ struct xfs_inode *ip = XFS_I(file_inode(filp));
+
+ if (xfs_has_wsync(ip->i_mount))
+ return true;
+ if (filp->f_flags & (__O_SYNC | O_DSYNC))
+ return true;
+ if (IS_SYNC(file_inode(filp)))
+ return true;
+
+ return false;
+}
+
#define XFS_FALLOC_FL_SUPPORTED \
(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \
FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE | \
@@ -910,7 +891,6 @@ xfs_file_fallocate(
struct inode *inode = file_inode(file);
struct xfs_inode *ip = XFS_I(inode);
long error;
- enum xfs_prealloc_flags flags = 0;
uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
loff_t new_size = 0;
bool do_file_insert = false;
@@ -955,6 +935,10 @@ xfs_file_fallocate(
goto out_unlock;
}
+ error = file_modified(file);
+ if (error)
+ goto out_unlock;
+
if (mode & FALLOC_FL_PUNCH_HOLE) {
error = xfs_free_file_space(ip, offset, len);
if (error)
@@ -1004,8 +988,6 @@ xfs_file_fallocate(
}
do_file_insert = true;
} else {
- flags |= XFS_PREALLOC_SET;
-
if (!(mode & FALLOC_FL_KEEP_SIZE) &&
offset + len > i_size_read(inode)) {
new_size = offset + len;
@@ -1057,13 +1039,6 @@ xfs_file_fallocate(
}
}
- if (file->f_flags & O_DSYNC)
- flags |= XFS_PREALLOC_SYNC;
-
- error = xfs_update_prealloc_flags(ip, flags);
- if (error)
- goto out_unlock;
-
/* Change file size if needed */
if (new_size) {
struct iattr iattr;
@@ -1082,8 +1057,14 @@ xfs_file_fallocate(
* leave shifted extents past EOF and hence losing access to
* the data that is contained within them.
*/
- if (do_file_insert)
+ if (do_file_insert) {
error = xfs_insert_file_space(ip, offset, len);
+ if (error)
+ goto out_unlock;
+ }
+
+ if (xfs_file_sync_writes(file))
+ error = xfs_log_force_inode(ip);
out_unlock:
xfs_iunlock(ip, iolock);
@@ -1115,21 +1096,6 @@ xfs_file_fadvise(
return ret;
}
-/* Does this file, inode, or mount want synchronous writes? */
-static inline bool xfs_file_sync_writes(struct file *filp)
-{
- struct xfs_inode *ip = XFS_I(file_inode(filp));
-
- if (xfs_has_wsync(ip->i_mount))
- return true;
- if (filp->f_flags & (__O_SYNC | O_DSYNC))
- return true;
- if (IS_SYNC(file_inode(filp)))
- return true;
-
- return false;
-}
-
STATIC loff_t
xfs_file_remap_range(
struct file *file_in,
diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c
index 48287caad28b..10e1cb71439e 100644
--- a/fs/xfs/xfs_fsmap.c
+++ b/fs/xfs/xfs_fsmap.c
@@ -864,8 +864,8 @@ xfs_getfsmap(
!xfs_getfsmap_is_valid_device(mp, &head->fmh_keys[1]))
return -EINVAL;
- use_rmap = capable(CAP_SYS_ADMIN) &&
- xfs_has_rmapbt(mp);
+ use_rmap = xfs_has_rmapbt(mp) &&
+ has_capability_noaudit(current, CAP_SYS_ADMIN);
head->fmh_entries = 0;
/* Set up our device handlers. */
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 9644f938990c..20186c584c7d 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -23,6 +23,7 @@
#include "xfs_reflink.h"
#include "xfs_ialloc.h"
#include "xfs_ag.h"
+#include "xfs_log_priv.h"
#include <linux/iversion.h>
@@ -77,7 +78,7 @@ xfs_inode_alloc(
* XXX: If this didn't occur in transactions, we could drop GFP_NOFAIL
* and return NULL here on ENOMEM.
*/
- ip = kmem_cache_alloc(xfs_inode_cache, GFP_KERNEL | __GFP_NOFAIL);
+ ip = alloc_inode_sb(mp->m_super, xfs_inode_cache, GFP_KERNEL | __GFP_NOFAIL);
if (inode_init_always(mp->m_super, VFS_I(ip))) {
kmem_cache_free(xfs_inode_cache, ip);
@@ -873,7 +874,14 @@ xfs_reclaim_inode(
if (xfs_iflags_test_and_set(ip, XFS_IFLUSHING))
goto out_iunlock;
- if (xfs_is_shutdown(ip->i_mount)) {
+ /*
+ * Check for log shutdown because aborting the inode can move the log
+ * tail and corrupt in memory state. This is fine if the log is shut
+ * down, but if the log is still active and only the mount is shut down
+ * then the in-memory log tail movement caused by the abort can be
+ * incorrectly propagated to disk.
+ */
+ if (xlog_is_shutdown(ip->i_mount->m_log)) {
xfs_iunpin_wait(ip);
xfs_iflush_abort(ip);
goto reclaim;
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 04bf467b1090..26227d26f274 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -35,6 +35,7 @@
#include "xfs_bmap_btree.h"
#include "xfs_reflink.h"
#include "xfs_ag.h"
+#include "xfs_log_priv.h"
struct kmem_cache *xfs_inode_cache;
@@ -658,9 +659,9 @@ xfs_ip2xflags(
*/
int
xfs_lookup(
- xfs_inode_t *dp,
- struct xfs_name *name,
- xfs_inode_t **ipp,
+ struct xfs_inode *dp,
+ const struct xfs_name *name,
+ struct xfs_inode **ipp,
struct xfs_name *ci_name)
{
xfs_ino_t inum;
@@ -1217,7 +1218,7 @@ xfs_link(
{
xfs_mount_t *mp = tdp->i_mount;
xfs_trans_t *tp;
- int error;
+ int error, nospace_error = 0;
int resblks;
trace_xfs_link(tdp, target_name);
@@ -1236,19 +1237,11 @@ xfs_link(
goto std_return;
resblks = XFS_LINK_SPACE_RES(mp, target_name->len);
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, resblks, 0, 0, &tp);
- if (error == -ENOSPC) {
- resblks = 0;
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, 0, 0, 0, &tp);
- }
+ error = xfs_trans_alloc_dir(tdp, &M_RES(mp)->tr_link, sip, &resblks,
+ &tp, &nospace_error);
if (error)
goto std_return;
- xfs_lock_two_inodes(sip, XFS_ILOCK_EXCL, tdp, XFS_ILOCK_EXCL);
-
- xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);
-
error = xfs_iext_count_may_overflow(tdp, XFS_DATA_FORK,
XFS_IEXT_DIR_MANIP_CNT(mp));
if (error)
@@ -1306,6 +1299,8 @@ xfs_link(
error_return:
xfs_trans_cancel(tp);
std_return:
+ if (error == -ENOSPC && nospace_error)
+ error = nospace_error;
return error;
}
@@ -2755,6 +2750,7 @@ xfs_remove(
xfs_mount_t *mp = dp->i_mount;
xfs_trans_t *tp = NULL;
int is_dir = S_ISDIR(VFS_I(ip)->i_mode);
+ int dontcare;
int error = 0;
uint resblks;
@@ -2772,31 +2768,24 @@ xfs_remove(
goto std_return;
/*
- * We try to get the real space reservation first,
- * allowing for directory btree deletion(s) implying
- * possible bmap insert(s). If we can't get the space
- * reservation then we use 0 instead, and avoid the bmap
- * btree insert(s) in the directory code by, if the bmap
- * insert tries to happen, instead trimming the LAST
- * block from the directory.
+ * We try to get the real space reservation first, allowing for
+ * directory btree deletion(s) implying possible bmap insert(s). If we
+ * can't get the space reservation then we use 0 instead, and avoid the
+ * bmap btree insert(s) in the directory code by, if the bmap insert
+ * tries to happen, instead trimming the LAST block from the directory.
+ *
+ * Ignore EDQUOT and ENOSPC being returned via nospace_error because
+ * the directory code can handle a reservationless update and we don't
+ * want to prevent a user from trying to free space by deleting things.
*/
resblks = XFS_REMOVE_SPACE_RES(mp);
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_remove, resblks, 0, 0, &tp);
- if (error == -ENOSPC) {
- resblks = 0;
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_remove, 0, 0, 0,
- &tp);
- }
+ error = xfs_trans_alloc_dir(dp, &M_RES(mp)->tr_remove, ip, &resblks,
+ &tp, &dontcare);
if (error) {
ASSERT(error != -ENOSPC);
goto std_return;
}
- xfs_lock_two_inodes(dp, XFS_ILOCK_EXCL, ip, XFS_ILOCK_EXCL);
-
- xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-
/*
* If we're removing a directory perform some additional validation.
*/
@@ -3109,7 +3098,8 @@ xfs_rename(
bool new_parent = (src_dp != target_dp);
bool src_is_directory = S_ISDIR(VFS_I(src_ip)->i_mode);
int spaceres;
- int error;
+ bool retried = false;
+ int error, nospace_error = 0;
trace_xfs_rename(src_dp, target_dp, src_name, target_name);
@@ -3133,9 +3123,12 @@ xfs_rename(
xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, wip,
inodes, &num_inodes);
+retry:
+ nospace_error = 0;
spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len);
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_rename, spaceres, 0, 0, &tp);
if (error == -ENOSPC) {
+ nospace_error = error;
spaceres = 0;
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_rename, 0, 0, 0,
&tp);
@@ -3190,6 +3183,31 @@ xfs_rename(
spaceres);
/*
+ * Try to reserve quota to handle an expansion of the target directory.
+ * We'll allow the rename to continue in reservationless mode if we hit
+ * a space usage constraint. If we trigger reservationless mode, save
+ * the errno if there isn't any free space in the target directory.
+ */
+ if (spaceres != 0) {
+ error = xfs_trans_reserve_quota_nblks(tp, target_dp, spaceres,
+ 0, false);
+ if (error == -EDQUOT || error == -ENOSPC) {
+ if (!retried) {
+ xfs_trans_cancel(tp);
+ xfs_blockgc_free_quota(target_dp, 0);
+ retried = true;
+ goto retry;
+ }
+
+ nospace_error = error;
+ spaceres = 0;
+ error = 0;
+ }
+ if (error)
+ goto out_trans_cancel;
+ }
+
+ /*
* Check for expected errors before we dirty the transaction
* so we can return an error without a transaction abort.
*
@@ -3435,6 +3453,8 @@ out_trans_cancel:
out_release_wip:
if (wip)
xfs_irele(wip);
+ if (error == -ENOSPC && nospace_error)
+ error = nospace_error;
return error;
}
@@ -3659,7 +3679,7 @@ xfs_iflush_cluster(
* AIL, leaving a dirty/unpinned inode attached to the buffer
* that otherwise looks like it should be flushed.
*/
- if (xfs_is_shutdown(mp)) {
+ if (xlog_is_shutdown(mp->m_log)) {
xfs_iunpin_wait(ip);
xfs_iflush_abort(ip);
xfs_iunlock(ip, XFS_ILOCK_SHARED);
@@ -3685,9 +3705,19 @@ xfs_iflush_cluster(
}
if (error) {
+ /*
+ * Shutdown first so we kill the log before we release this
+ * buffer. If it is an INODE_ALLOC buffer and pins the tail
+ * of the log, failing it before the _log_ is shut down can
+ * result in the log tail being moved forward in the journal
+ * on disk because log writes can still be taking place. Hence
+ * unpinning the tail will allow the ICREATE intent to be
+ * removed from the log an recovery will fail with uninitialised
+ * inode cluster buffers.
+ */
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
bp->b_flags |= XBF_ASYNC;
xfs_buf_ioend_fail(bp);
- xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
return error;
}
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index c447bf04205a..740ab13d1aa2 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -402,7 +402,7 @@ enum layout_break_reason {
int xfs_release(struct xfs_inode *ip);
void xfs_inactive(struct xfs_inode *ip);
-int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name,
+int xfs_lookup(struct xfs_inode *dp, const struct xfs_name *name,
struct xfs_inode **ipp, struct xfs_name *ci_name);
int xfs_create(struct user_namespace *mnt_userns,
struct xfs_inode *dp, struct xfs_name *name,
@@ -462,15 +462,6 @@ xfs_itruncate_extents(
}
/* from xfs_file.c */
-enum xfs_prealloc_flags {
- XFS_PREALLOC_SET = (1 << 1),
- XFS_PREALLOC_CLEAR = (1 << 2),
- XFS_PREALLOC_SYNC = (1 << 3),
- XFS_PREALLOC_INVISIBLE = (1 << 4),
-};
-
-int xfs_update_prealloc_flags(struct xfs_inode *ip,
- enum xfs_prealloc_flags flags);
int xfs_break_layouts(struct inode *inode, uint *iolock,
enum layout_break_reason reason);
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 90d8e591baf8..11158fa81a09 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -17,6 +17,7 @@
#include "xfs_trans_priv.h"
#include "xfs_buf_item.h"
#include "xfs_log.h"
+#include "xfs_log_priv.h"
#include "xfs_error.h"
#include <linux/iversion.h>
@@ -720,6 +721,17 @@ xfs_iflush_ail_updates(
if (INODE_ITEM(lip)->ili_flush_lsn != lip->li_lsn)
continue;
+ /*
+ * dgc: Not sure how this happens, but it happens very
+ * occassionaly via generic/388. xfs_iflush_abort() also
+ * silently handles this same "under writeback but not in AIL at
+ * shutdown" condition via xfs_trans_ail_delete().
+ */
+ if (!test_bit(XFS_LI_IN_AIL, &lip->li_flags)) {
+ ASSERT(xlog_is_shutdown(lip->li_log));
+ continue;
+ }
+
lsn = xfs_ail_delete_one(ailp, lip);
if (!tail_lsn && lsn)
tail_lsn = lsn;
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 03a6198c97f6..83481005317a 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -1189,7 +1189,7 @@ xfs_ioctl_setattr_get_trans(
goto out_error;
error = xfs_trans_alloc_ichange(ip, NULL, NULL, pdqp,
- capable(CAP_FOWNER), &tp);
+ has_capability_noaudit(current, CAP_FOWNER), &tp);
if (error)
goto out_error;
@@ -1464,7 +1464,7 @@ xfs_ioc_getbmap(
if (bmx.bmv_count < 2)
return -EINVAL;
- if (bmx.bmv_count > ULONG_MAX / recsize)
+ if (bmx.bmv_count >= INT_MAX / recsize)
return -ENOMEM;
buf = kvcalloc(bmx.bmv_count, sizeof(*buf), GFP_KERNEL);
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index b79b3846e71b..b34e8e4344a8 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -613,37 +613,6 @@ xfs_vn_getattr(
return 0;
}
-static void
-xfs_setattr_mode(
- struct xfs_inode *ip,
- struct iattr *iattr)
-{
- struct inode *inode = VFS_I(ip);
- umode_t mode = iattr->ia_mode;
-
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-
- inode->i_mode &= S_IFMT;
- inode->i_mode |= mode & ~S_IFMT;
-}
-
-void
-xfs_setattr_time(
- struct xfs_inode *ip,
- struct iattr *iattr)
-{
- struct inode *inode = VFS_I(ip);
-
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-
- if (iattr->ia_valid & ATTR_ATIME)
- inode->i_atime = iattr->ia_atime;
- if (iattr->ia_valid & ATTR_CTIME)
- inode->i_ctime = iattr->ia_ctime;
- if (iattr->ia_valid & ATTR_MTIME)
- inode->i_mtime = iattr->ia_mtime;
-}
-
static int
xfs_vn_change_ok(
struct user_namespace *mnt_userns,
@@ -678,10 +647,10 @@ xfs_setattr_nonsize(
int mask = iattr->ia_valid;
xfs_trans_t *tp;
int error;
- kuid_t uid = GLOBAL_ROOT_UID, iuid = GLOBAL_ROOT_UID;
- kgid_t gid = GLOBAL_ROOT_GID, igid = GLOBAL_ROOT_GID;
+ kuid_t uid = GLOBAL_ROOT_UID;
+ kgid_t gid = GLOBAL_ROOT_GID;
struct xfs_dquot *udqp = NULL, *gdqp = NULL;
- struct xfs_dquot *olddquot1 = NULL, *olddquot2 = NULL;
+ struct xfs_dquot *old_udqp = NULL, *old_gdqp = NULL;
ASSERT((mask & ATTR_SIZE) == 0);
@@ -723,66 +692,30 @@ xfs_setattr_nonsize(
}
error = xfs_trans_alloc_ichange(ip, udqp, gdqp, NULL,
- capable(CAP_FOWNER), &tp);
+ has_capability_noaudit(current, CAP_FOWNER), &tp);
if (error)
goto out_dqrele;
/*
- * Change file ownership. Must be the owner or privileged.
+ * Register quota modifications in the transaction. Must be the owner
+ * or privileged. These IDs could have changed since we last looked at
+ * them. But, we're assured that if the ownership did change while we
+ * didn't have the inode locked, inode's dquot(s) would have changed
+ * also.
*/
- if (mask & (ATTR_UID|ATTR_GID)) {
- /*
- * These IDs could have changed since we last looked at them.
- * But, we're assured that if the ownership did change
- * while we didn't have the inode locked, inode's dquot(s)
- * would have changed also.
- */
- iuid = inode->i_uid;
- igid = inode->i_gid;
- gid = (mask & ATTR_GID) ? iattr->ia_gid : igid;
- uid = (mask & ATTR_UID) ? iattr->ia_uid : iuid;
-
- /*
- * CAP_FSETID overrides the following restrictions:
- *
- * The set-user-ID and set-group-ID bits of a file will be
- * cleared upon successful return from chown()
- */
- if ((inode->i_mode & (S_ISUID|S_ISGID)) &&
- !capable(CAP_FSETID))
- inode->i_mode &= ~(S_ISUID|S_ISGID);
-
- /*
- * Change the ownerships and register quota modifications
- * in the transaction.
- */
- if (!uid_eq(iuid, uid)) {
- if (XFS_IS_UQUOTA_ON(mp)) {
- ASSERT(mask & ATTR_UID);
- ASSERT(udqp);
- olddquot1 = xfs_qm_vop_chown(tp, ip,
- &ip->i_udquot, udqp);
- }
- inode->i_uid = uid;
- }
- if (!gid_eq(igid, gid)) {
- if (XFS_IS_GQUOTA_ON(mp)) {
- ASSERT(xfs_has_pquotino(mp) ||
- !XFS_IS_PQUOTA_ON(mp));
- ASSERT(mask & ATTR_GID);
- ASSERT(gdqp);
- olddquot2 = xfs_qm_vop_chown(tp, ip,
- &ip->i_gdquot, gdqp);
- }
- inode->i_gid = gid;
- }
+ if ((mask & ATTR_UID) && XFS_IS_UQUOTA_ON(mp) &&
+ !uid_eq(inode->i_uid, iattr->ia_uid)) {
+ ASSERT(udqp);
+ old_udqp = xfs_qm_vop_chown(tp, ip, &ip->i_udquot, udqp);
+ }
+ if ((mask & ATTR_GID) && XFS_IS_GQUOTA_ON(mp) &&
+ !gid_eq(inode->i_gid, iattr->ia_gid)) {
+ ASSERT(xfs_has_pquotino(mp) || !XFS_IS_PQUOTA_ON(mp));
+ ASSERT(gdqp);
+ old_gdqp = xfs_qm_vop_chown(tp, ip, &ip->i_gdquot, gdqp);
}
- if (mask & ATTR_MODE)
- xfs_setattr_mode(ip, iattr);
- if (mask & (ATTR_ATIME|ATTR_CTIME|ATTR_MTIME))
- xfs_setattr_time(ip, iattr);
-
+ setattr_copy(mnt_userns, inode, iattr);
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
XFS_STATS_INC(mp, xs_ig_attrchg);
@@ -794,8 +727,8 @@ xfs_setattr_nonsize(
/*
* Release any dquot(s) the inode had kept before chown.
*/
- xfs_qm_dqrele(olddquot1);
- xfs_qm_dqrele(olddquot2);
+ xfs_qm_dqrele(old_udqp);
+ xfs_qm_dqrele(old_gdqp);
xfs_qm_dqrele(udqp);
xfs_qm_dqrele(gdqp);
@@ -1006,11 +939,8 @@ xfs_setattr_size(
xfs_inode_clear_eofblocks_tag(ip);
}
- if (iattr->ia_valid & ATTR_MODE)
- xfs_setattr_mode(ip, iattr);
- if (iattr->ia_valid & (ATTR_ATIME|ATTR_CTIME|ATTR_MTIME))
- xfs_setattr_time(ip, iattr);
-
+ ASSERT(!(iattr->ia_valid & (ATTR_UID | ATTR_GID)));
+ setattr_copy(mnt_userns, inode, iattr);
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
XFS_STATS_INC(mp, xs_ig_attrchg);
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 16f9edbda4eb..a8034c0afdf2 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -812,10 +812,9 @@ xfs_log_mount_finish(
* mount failure occurs.
*/
mp->m_super->s_flags |= SB_ACTIVE;
+ xfs_log_work_queue(mp);
if (xlog_recovery_needed(log))
error = xlog_recover_finish(log);
- if (!error)
- xfs_log_work_queue(mp);
mp->m_super->s_flags &= ~SB_ACTIVE;
evict_inodes(mp->m_super);
@@ -1102,7 +1101,7 @@ xfs_log_item_init(
int type,
const struct xfs_item_ops *ops)
{
- item->li_mountp = mp;
+ item->li_log = mp->m_log;
item->li_ailp = mp->m_ail;
item->li_type = type;
item->li_ops = ops;
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 83a039762b81..796e4464f809 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -1243,18 +1243,27 @@ xlog_cil_push_now(
if (!async)
flush_workqueue(cil->xc_push_wq);
+ spin_lock(&cil->xc_push_lock);
+
+ /*
+ * If this is an async flush request, we always need to set the
+ * xc_push_commit_stable flag even if something else has already queued
+ * a push. The flush caller is asking for the CIL to be on stable
+ * storage when the next push completes, so regardless of who has queued
+ * the push, the flush requires stable semantics from it.
+ */
+ cil->xc_push_commit_stable = async;
+
/*
* If the CIL is empty or we've already pushed the sequence then
- * there's no work we need to do.
+ * there's no more work that we need to do.
*/
- spin_lock(&cil->xc_push_lock);
if (list_empty(&cil->xc_cil) || push_seq <= cil->xc_push_seq) {
spin_unlock(&cil->xc_push_lock);
return;
}
cil->xc_push_seq = push_seq;
- cil->xc_push_commit_stable = async;
queue_work(cil->xc_push_wq, &cil->xc_ctx->push_work);
spin_unlock(&cil->xc_push_lock);
}
@@ -1352,6 +1361,13 @@ xlog_cil_flush(
trace_xfs_log_force(log->l_mp, seq, _RET_IP_);
xlog_cil_push_now(log, seq, true);
+
+ /*
+ * If the CIL is empty, make sure that any previous checkpoint that may
+ * still be in an active iclog is pushed to stable storage.
+ */
+ if (list_empty(&log->l_cilp->xc_cil))
+ xfs_log_force(log->l_mp, 0);
}
/*
@@ -1468,7 +1484,7 @@ bool
xfs_log_item_in_current_chkpt(
struct xfs_log_item *lip)
{
- struct xfs_cil *cil = lip->li_mountp->m_log->l_cilp;
+ struct xfs_cil *cil = lip->li_log->l_cilp;
if (list_empty(&lip->li_cil))
return false;
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
index d6334abbc0b3..37a24f0f7cd4 100644
--- a/fs/xfs/xfs_pnfs.c
+++ b/fs/xfs/xfs_pnfs.c
@@ -71,6 +71,40 @@ xfs_fs_get_uuid(
}
/*
+ * We cannot use file based VFS helpers such as file_modified() to update
+ * inode state as we modify the data/metadata in the inode here. Hence we have
+ * to open code the timestamp updates and SUID/SGID stripping. We also need
+ * to set the inode prealloc flag to ensure that the extents we allocate are not
+ * removed if the inode is reclaimed from memory before xfs_fs_block_commit()
+ * is from the client to indicate that data has been written and the file size
+ * can be extended.
+ */
+static int
+xfs_fs_map_update_inode(
+ struct xfs_inode *ip)
+{
+ struct xfs_trans *tp;
+ int error;
+
+ error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_writeid,
+ 0, 0, 0, &tp);
+ if (error)
+ return error;
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+
+ VFS_I(ip)->i_mode &= ~S_ISUID;
+ if (VFS_I(ip)->i_mode & S_IXGRP)
+ VFS_I(ip)->i_mode &= ~S_ISGID;
+ xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+ ip->i_diflags |= XFS_DIFLAG_PREALLOC;
+
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+ return xfs_trans_commit(tp);
+}
+
+/*
* Get a layout for the pNFS client.
*/
int
@@ -164,10 +198,12 @@ xfs_fs_map_blocks(
* that the blocks allocated and handed out to the client are
* guaranteed to be present even after a server crash.
*/
- error = xfs_update_prealloc_flags(ip,
- XFS_PREALLOC_SET | XFS_PREALLOC_SYNC);
+ error = xfs_fs_map_update_inode(ip);
+ if (!error)
+ error = xfs_log_force_inode(ip);
if (error)
goto out_unlock;
+
} else {
xfs_iunlock(ip, lock_flags);
}
@@ -255,7 +291,7 @@ xfs_fs_commit_blocks(
length = end - start;
if (!length)
continue;
-
+
/*
* Make sure reads through the pagecache see the new data.
*/
@@ -283,7 +319,8 @@ xfs_fs_commit_blocks(
xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- xfs_setattr_time(ip, iattr);
+ ASSERT(!(iattr->ia_valid & (ATTR_UID | ATTR_GID)));
+ setattr_copy(&init_user_ns, inode, iattr);
if (update_isize) {
i_size_write(inode, iattr->ia_size);
ip->i_disk_size = iattr->ia_size;
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 32ac8d9c8940..f165d1a3de1d 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -25,6 +25,7 @@
#include "xfs_error.h"
#include "xfs_ag.h"
#include "xfs_ialloc.h"
+#include "xfs_log_priv.h"
/*
* The global quota manager. There is only one of these for the entire
@@ -121,8 +122,7 @@ xfs_qm_dqpurge(
struct xfs_dquot *dqp,
void *data)
{
- struct xfs_mount *mp = dqp->q_mount;
- struct xfs_quotainfo *qi = mp->m_quotainfo;
+ struct xfs_quotainfo *qi = dqp->q_mount->m_quotainfo;
int error = -EAGAIN;
xfs_dqlock(dqp);
@@ -157,7 +157,7 @@ xfs_qm_dqpurge(
}
ASSERT(atomic_read(&dqp->q_pincount) == 0);
- ASSERT(xfs_is_shutdown(mp) ||
+ ASSERT(xlog_is_shutdown(dqp->q_logitem.qli_item.li_log) ||
!test_bit(XFS_LI_IN_AIL, &dqp->q_logitem.qli_item.li_flags));
xfs_dqfunlock(dqp);
@@ -172,7 +172,7 @@ xfs_qm_dqpurge(
*/
ASSERT(!list_empty(&dqp->q_lru));
list_lru_del(&qi->qi_lru, &dqp->q_lru);
- XFS_STATS_DEC(mp, xs_qm_dquot_unused);
+ XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot_unused);
xfs_qm_dqdestroy(dqp);
return 0;
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index d3da67772d57..0d868c93144d 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -457,7 +457,7 @@ xfs_cui_item_recover(
struct xfs_cud_log_item *cudp;
struct xfs_trans *tp;
struct xfs_btree_cur *rcur = NULL;
- struct xfs_mount *mp = lip->li_mountp;
+ struct xfs_mount *mp = lip->li_log->l_mp;
xfs_fsblock_t new_fsb;
xfs_extlen_t new_len;
unsigned int refc_type;
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index db70060e7bf6..54e68e5693fd 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -425,7 +425,10 @@ convert:
if (!convert_now || cmap->br_state == XFS_EXT_NORM)
return 0;
trace_xfs_reflink_convert_cow(ip, cmap);
- return xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb);
+ error = xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb);
+ if (!error)
+ cmap->br_state = XFS_EXT_NORM;
+ return error;
out_trans_cancel:
xfs_trans_cancel(tp);
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index c3966b4c58ef..a22b2d19ef91 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -510,7 +510,7 @@ xfs_rui_item_recover(
struct xfs_rud_log_item *rudp;
struct xfs_trans *tp;
struct xfs_btree_cur *rcur = NULL;
- struct xfs_mount *mp = lip->li_mountp;
+ struct xfs_mount *mp = lip->li_log->l_mp;
enum xfs_rmap_intent_type type;
xfs_exntst_t state;
int i;
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index e8f37bdc8354..d84714e4e46a 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -735,6 +735,7 @@ xfs_fs_sync_fs(
int wait)
{
struct xfs_mount *mp = XFS_M(sb);
+ int error;
trace_xfs_fs_sync_fs(mp, __return_address);
@@ -744,7 +745,10 @@ xfs_fs_sync_fs(
if (!wait)
return 0;
- xfs_log_force(mp, XFS_LOG_SYNC);
+ error = xfs_log_force(mp, XFS_LOG_SYNC);
+ if (error)
+ return error;
+
if (laptop_mode) {
/*
* The disk must be active because we're syncing.
@@ -1749,6 +1753,11 @@ xfs_remount_ro(
};
int error;
+ /* Flush all the dirty data to disk. */
+ error = sync_filesystem(mp->m_super);
+ if (error)
+ return error;
+
/*
* Cancel background eofb scanning so it cannot race with the final
* log force+buftarg wait and deadlock the remount.
@@ -1827,8 +1836,6 @@ xfs_fs_reconfigure(
if (error)
return error;
- sync_filesystem(mp->m_super);
-
/* inode32 -> inode64 */
if (xfs_has_small_inums(mp) && !xfs_has_small_inums(new_mp)) {
mp->m_features &= ~XFS_FEAT_SMALL_INUMS;
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 4a8076ef8cb4..b141ef78c755 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -933,7 +933,7 @@ DEFINE_IREF_EVENT(xfs_inode_unpin);
DEFINE_IREF_EVENT(xfs_inode_unpin_nowait);
DECLARE_EVENT_CLASS(xfs_namespace_class,
- TP_PROTO(struct xfs_inode *dp, struct xfs_name *name),
+ TP_PROTO(struct xfs_inode *dp, const struct xfs_name *name),
TP_ARGS(dp, name),
TP_STRUCT__entry(
__field(dev_t, dev)
@@ -956,7 +956,7 @@ DECLARE_EVENT_CLASS(xfs_namespace_class,
#define DEFINE_NAMESPACE_EVENT(name) \
DEFINE_EVENT(xfs_namespace_class, name, \
- TP_PROTO(struct xfs_inode *dp, struct xfs_name *name), \
+ TP_PROTO(struct xfs_inode *dp, const struct xfs_name *name), \
TP_ARGS(dp, name))
DEFINE_NAMESPACE_EVENT(xfs_remove);
DEFINE_NAMESPACE_EVENT(xfs_link);
@@ -1308,7 +1308,7 @@ DECLARE_EVENT_CLASS(xfs_log_item_class,
__field(xfs_lsn_t, lsn)
),
TP_fast_assign(
- __entry->dev = lip->li_mountp->m_super->s_dev;
+ __entry->dev = lip->li_log->l_mp->m_super->s_dev;
__entry->lip = lip;
__entry->type = lip->li_type;
__entry->flags = lip->li_flags;
@@ -1361,7 +1361,7 @@ DECLARE_EVENT_CLASS(xfs_ail_class,
__field(xfs_lsn_t, new_lsn)
),
TP_fast_assign(
- __entry->dev = lip->li_mountp->m_super->s_dev;
+ __entry->dev = lip->li_log->l_mp->m_super->s_dev;
__entry->lip = lip;
__entry->type = lip->li_type;
__entry->flags = lip->li_flags;
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 59e2f9031b9f..917a69f0a6ff 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -648,7 +648,7 @@ xfs_trans_add_item(
struct xfs_trans *tp,
struct xfs_log_item *lip)
{
- ASSERT(lip->li_mountp == tp->t_mountp);
+ ASSERT(lip->li_log == tp->t_mountp->m_log);
ASSERT(lip->li_ailp == tp->t_mountp->m_ail);
ASSERT(list_empty(&lip->li_trans));
ASSERT(!test_bit(XFS_LI_DIRTY, &lip->li_flags));
@@ -775,7 +775,7 @@ xfs_trans_committed_bulk(
* object into the AIL as we are in a shutdown situation.
*/
if (aborted) {
- ASSERT(xfs_is_shutdown(ailp->ail_mount));
+ ASSERT(xlog_is_shutdown(ailp->ail_log));
if (lip->li_ops->iop_unpin)
lip->li_ops->iop_unpin(lip, 1);
continue;
@@ -1210,3 +1210,89 @@ out_cancel:
xfs_trans_cancel(tp);
return error;
}
+
+/*
+ * Allocate an transaction, lock and join the directory and child inodes to it,
+ * and reserve quota for a directory update. If there isn't sufficient space,
+ * @dblocks will be set to zero for a reservationless directory update and
+ * @nospace_error will be set to a negative errno describing the space
+ * constraint we hit.
+ *
+ * The caller must ensure that the on-disk dquots attached to this inode have
+ * already been allocated and initialized. The ILOCKs will be dropped when the
+ * transaction is committed or cancelled.
+ */
+int
+xfs_trans_alloc_dir(
+ struct xfs_inode *dp,
+ struct xfs_trans_res *resv,
+ struct xfs_inode *ip,
+ unsigned int *dblocks,
+ struct xfs_trans **tpp,
+ int *nospace_error)
+{
+ struct xfs_trans *tp;
+ struct xfs_mount *mp = ip->i_mount;
+ unsigned int resblks;
+ bool retried = false;
+ int error;
+
+retry:
+ *nospace_error = 0;
+ resblks = *dblocks;
+ error = xfs_trans_alloc(mp, resv, resblks, 0, 0, &tp);
+ if (error == -ENOSPC) {
+ *nospace_error = error;
+ resblks = 0;
+ error = xfs_trans_alloc(mp, resv, resblks, 0, 0, &tp);
+ }
+ if (error)
+ return error;
+
+ xfs_lock_two_inodes(dp, XFS_ILOCK_EXCL, ip, XFS_ILOCK_EXCL);
+
+ xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+
+ error = xfs_qm_dqattach_locked(dp, false);
+ if (error) {
+ /* Caller should have allocated the dquots! */
+ ASSERT(error != -ENOENT);
+ goto out_cancel;
+ }
+
+ error = xfs_qm_dqattach_locked(ip, false);
+ if (error) {
+ /* Caller should have allocated the dquots! */
+ ASSERT(error != -ENOENT);
+ goto out_cancel;
+ }
+
+ if (resblks == 0)
+ goto done;
+
+ error = xfs_trans_reserve_quota_nblks(tp, dp, resblks, 0, false);
+ if (error == -EDQUOT || error == -ENOSPC) {
+ if (!retried) {
+ xfs_trans_cancel(tp);
+ xfs_blockgc_free_quota(dp, 0);
+ retried = true;
+ goto retry;
+ }
+
+ *nospace_error = error;
+ resblks = 0;
+ error = 0;
+ }
+ if (error)
+ goto out_cancel;
+
+done:
+ *tpp = tp;
+ *dblocks = resblks;
+ return 0;
+
+out_cancel:
+ xfs_trans_cancel(tp);
+ return error;
+}
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index a487b264a9eb..de177842b951 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -8,6 +8,7 @@
/* kernel only transaction subsystem defines */
+struct xlog;
struct xfs_buf;
struct xfs_buftarg;
struct xfs_efd_log_item;
@@ -31,7 +32,7 @@ struct xfs_log_item {
struct list_head li_ail; /* AIL pointers */
struct list_head li_trans; /* transaction list */
xfs_lsn_t li_lsn; /* last on-disk lsn */
- struct xfs_mount *li_mountp; /* ptr to fs mount */
+ struct xlog *li_log;
struct xfs_ail *li_ailp; /* ptr to AIL */
uint li_type; /* item type */
unsigned long li_flags; /* misc flags */
@@ -259,6 +260,9 @@ int xfs_trans_alloc_icreate(struct xfs_mount *mp, struct xfs_trans_res *resv,
int xfs_trans_alloc_ichange(struct xfs_inode *ip, struct xfs_dquot *udqp,
struct xfs_dquot *gdqp, struct xfs_dquot *pdqp, bool force,
struct xfs_trans **tpp);
+int xfs_trans_alloc_dir(struct xfs_inode *dp, struct xfs_trans_res *resv,
+ struct xfs_inode *ip, unsigned int *dblocks,
+ struct xfs_trans **tpp, int *nospace_error);
static inline void
xfs_trans_set_context(
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 2a8c8dc54c95..c2ccb98c7bcd 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -398,7 +398,7 @@ xfsaild_push_item(
* If log item pinning is enabled, skip the push and track the item as
* pinned. This can help induce head-behind-tail conditions.
*/
- if (XFS_TEST_ERROR(false, ailp->ail_mount, XFS_ERRTAG_LOG_ITEM_PIN))
+ if (XFS_TEST_ERROR(false, ailp->ail_log->l_mp, XFS_ERRTAG_LOG_ITEM_PIN))
return XFS_ITEM_PINNED;
/*
@@ -418,7 +418,7 @@ static long
xfsaild_push(
struct xfs_ail *ailp)
{
- xfs_mount_t *mp = ailp->ail_mount;
+ struct xfs_mount *mp = ailp->ail_log->l_mp;
struct xfs_ail_cursor cur;
struct xfs_log_item *lip;
xfs_lsn_t lsn;
@@ -443,15 +443,27 @@ xfsaild_push(
ailp->ail_log_flush = 0;
XFS_STATS_INC(mp, xs_push_ail_flush);
- xlog_cil_flush(mp->m_log);
+ xlog_cil_flush(ailp->ail_log);
}
spin_lock(&ailp->ail_lock);
- /* barrier matches the ail_target update in xfs_ail_push() */
- smp_rmb();
- target = ailp->ail_target;
- ailp->ail_target_prev = target;
+ /*
+ * If we have a sync push waiter, we always have to push till the AIL is
+ * empty. Update the target to point to the end of the AIL so that
+ * capture updates that occur after the sync push waiter has gone to
+ * sleep.
+ */
+ if (waitqueue_active(&ailp->ail_empty)) {
+ lip = xfs_ail_max(ailp);
+ if (lip)
+ target = lip->li_lsn;
+ } else {
+ /* barrier matches the ail_target update in xfs_ail_push() */
+ smp_rmb();
+ target = ailp->ail_target;
+ ailp->ail_target_prev = target;
+ }
/* we're done if the AIL is empty or our push has reached the end */
lip = xfs_trans_ail_cursor_first(ailp, &cur, ailp->ail_last_pushed_lsn);
@@ -620,7 +632,7 @@ xfsaild(
* opportunity to release such buffers from the queue.
*/
ASSERT(list_empty(&ailp->ail_buf_list) ||
- xfs_is_shutdown(ailp->ail_mount));
+ xlog_is_shutdown(ailp->ail_log));
xfs_buf_delwri_cancel(&ailp->ail_buf_list);
break;
}
@@ -683,7 +695,7 @@ xfs_ail_push(
struct xfs_log_item *lip;
lip = xfs_ail_min(ailp);
- if (!lip || xfs_is_shutdown(ailp->ail_mount) ||
+ if (!lip || xlog_is_shutdown(ailp->ail_log) ||
XFS_LSN_CMP(threshold_lsn, ailp->ail_target) <= 0)
return;
@@ -724,7 +736,6 @@ xfs_ail_push_all_sync(
spin_lock(&ailp->ail_lock);
while ((lip = xfs_ail_max(ailp)) != NULL) {
prepare_to_wait(&ailp->ail_empty, &wait, TASK_UNINTERRUPTIBLE);
- ailp->ail_target = lip->li_lsn;
wake_up_process(ailp->ail_task);
spin_unlock(&ailp->ail_lock);
schedule();
@@ -740,7 +751,7 @@ xfs_ail_update_finish(
struct xfs_ail *ailp,
xfs_lsn_t old_lsn) __releases(ailp->ail_lock)
{
- struct xfs_mount *mp = ailp->ail_mount;
+ struct xlog *log = ailp->ail_log;
/* if the tail lsn hasn't changed, don't do updates or wakeups. */
if (!old_lsn || old_lsn == __xfs_ail_min_lsn(ailp)) {
@@ -748,13 +759,13 @@ xfs_ail_update_finish(
return;
}
- if (!xfs_is_shutdown(mp))
- xlog_assign_tail_lsn_locked(mp);
+ if (!xlog_is_shutdown(log))
+ xlog_assign_tail_lsn_locked(log->l_mp);
if (list_empty(&ailp->ail_head))
wake_up_all(&ailp->ail_empty);
spin_unlock(&ailp->ail_lock);
- xfs_log_space_wake(mp);
+ xfs_log_space_wake(log->l_mp);
}
/*
@@ -862,13 +873,13 @@ xfs_trans_ail_delete(
int shutdown_type)
{
struct xfs_ail *ailp = lip->li_ailp;
- struct xfs_mount *mp = ailp->ail_mount;
+ struct xfs_mount *mp = ailp->ail_log->l_mp;
xfs_lsn_t tail_lsn;
spin_lock(&ailp->ail_lock);
if (!test_bit(XFS_LI_IN_AIL, &lip->li_flags)) {
spin_unlock(&ailp->ail_lock);
- if (shutdown_type && !xfs_is_shutdown(mp)) {
+ if (shutdown_type && !xlog_is_shutdown(ailp->ail_log)) {
xfs_alert_tag(mp, XFS_PTAG_AILDELETE,
"%s: attempting to delete a log item that is not in the AIL",
__func__);
@@ -893,7 +904,7 @@ xfs_trans_ail_init(
if (!ailp)
return -ENOMEM;
- ailp->ail_mount = mp;
+ ailp->ail_log = mp->m_log;
INIT_LIST_HEAD(&ailp->ail_head);
INIT_LIST_HEAD(&ailp->ail_cursors);
spin_lock_init(&ailp->ail_lock);
@@ -901,7 +912,7 @@ xfs_trans_ail_init(
init_waitqueue_head(&ailp->ail_empty);
ailp->ail_task = kthread_run(xfsaild, ailp, "xfsaild/%s",
- ailp->ail_mount->m_super->s_id);
+ mp->m_super->s_id);
if (IS_ERR(ailp->ail_task))
goto out_free_ailp;
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index 3004aeac9110..f0d79a9050ba 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -6,6 +6,7 @@
#ifndef __XFS_TRANS_PRIV_H__
#define __XFS_TRANS_PRIV_H__
+struct xlog;
struct xfs_log_item;
struct xfs_mount;
struct xfs_trans;
@@ -50,7 +51,7 @@ struct xfs_ail_cursor {
* Eventually we need to drive the locking in here as well.
*/
struct xfs_ail {
- struct xfs_mount *ail_mount;
+ struct xlog *ail_log;
struct task_struct *ail_task;
struct list_head ail_head;
xfs_lsn_t ail_target;
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index d331b52592a0..3614c7834007 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -185,9 +185,9 @@ static const struct address_space_operations zonefs_file_aops = {
.readahead = zonefs_readahead,
.writepage = zonefs_writepage,
.writepages = zonefs_writepages,
- .set_page_dirty = __set_page_dirty_nobuffers,
+ .dirty_folio = filemap_dirty_folio,
.releasepage = iomap_releasepage,
- .invalidatepage = iomap_invalidatepage,
+ .invalidate_folio = iomap_invalidate_folio,
.migratepage = iomap_migrate_page,
.is_partially_uptodate = iomap_is_partially_uptodate,
.error_remove_page = generic_error_remove_page,
@@ -695,7 +695,6 @@ static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from)
bio = bio_alloc(bdev, nr_pages,
REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE, GFP_NOFS);
bio->bi_iter.bi_sector = zi->i_zsector;
- bio->bi_write_hint = iocb->ki_hint;
bio->bi_ioprio = iocb->ki_ioprio;
if (iocb->ki_flags & IOCB_DSYNC)
bio->bi_opf |= REQ_FUA;
@@ -1136,7 +1135,7 @@ static struct inode *zonefs_alloc_inode(struct super_block *sb)
{
struct zonefs_inode_info *zi;
- zi = kmem_cache_alloc(zonefs_inode_cachep, GFP_KERNEL);
+ zi = alloc_inode_sb(sb, zonefs_inode_cachep, GFP_KERNEL);
if (!zi)
return NULL;