summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/cache.c2
-rw-r--r--fs/9p/vfs_inode.c8
-rw-r--r--fs/9p/vfs_inode_dotl.c12
-rw-r--r--fs/Kconfig27
-rw-r--r--fs/Kconfig.binfmt2
-rw-r--r--fs/Makefile2
-rw-r--r--fs/adfs/Kconfig1
-rw-r--r--fs/adfs/dir_f.h5
-rw-r--r--fs/adfs/inode.c4
-rw-r--r--fs/affs/Kconfig1
-rw-r--r--fs/affs/amigaffs.c6
-rw-r--r--fs/affs/file.c91
-rw-r--r--fs/affs/inode.c16
-rw-r--r--fs/affs/namei.c20
-rw-r--r--fs/affs/symlink.c12
-rw-r--r--fs/afs/dynroot.c2
-rw-r--r--fs/afs/inode.c8
-rw-r--r--fs/afs/internal.h2
-rw-r--r--fs/aio.c24
-rw-r--r--fs/attr.c22
-rw-r--r--fs/autofs/inode.c2
-rw-r--r--fs/autofs/root.c6
-rw-r--r--fs/autofs/waitq.c5
-rw-r--r--fs/bad_inode.c6
-rw-r--r--fs/befs/Kconfig1
-rw-r--r--fs/befs/linuxvfs.c2
-rw-r--r--fs/bfs/Kconfig1
-rw-r--r--fs/bfs/dir.c16
-rw-r--r--fs/bfs/inode.c5
-rw-r--r--fs/binfmt_elf_fdpic.c43
-rw-r--r--fs/binfmt_misc.c3
-rw-r--r--fs/btrfs/delayed-inode.c8
-rw-r--r--fs/btrfs/file.c27
-rw-r--r--fs/btrfs/inode.c66
-rw-r--r--fs/btrfs/ioctl.c2
-rw-r--r--fs/btrfs/misc.h2
-rw-r--r--fs/btrfs/reflink.c3
-rw-r--r--fs/btrfs/transaction.c3
-rw-r--r--fs/btrfs/tree-log.c4
-rw-r--r--fs/btrfs/volumes.c4
-rw-r--r--fs/btrfs/xattr.c4
-rw-r--r--fs/buffer.c96
-rw-r--r--fs/cachefiles/io.c16
-rw-r--r--fs/cachefiles/namei.c2
-rw-r--r--fs/ceph/Makefile1
-rw-r--r--fs/ceph/acl.c6
-rw-r--r--fs/ceph/addr.c196
-rw-r--r--fs/ceph/cache.c2
-rw-r--r--fs/ceph/caps.c237
-rw-r--r--fs/ceph/crypto.c671
-rw-r--r--fs/ceph/crypto.h288
-rw-r--r--fs/ceph/dir.c194
-rw-r--r--fs/ceph/export.c44
-rw-r--r--fs/ceph/file.c604
-rw-r--r--fs/ceph/inode.c647
-rw-r--r--fs/ceph/ioctl.c127
-rw-r--r--fs/ceph/mds_client.c676
-rw-r--r--fs/ceph/mds_client.h35
-rw-r--r--fs/ceph/quota.c14
-rw-r--r--fs/ceph/snap.c12
-rw-r--r--fs/ceph/super.c191
-rw-r--r--fs/ceph/super.h49
-rw-r--r--fs/ceph/xattr.c32
-rw-r--r--fs/coda/coda_linux.c3
-rw-r--r--fs/coda/dir.c2
-rw-r--r--fs/coda/file.c2
-rw-r--r--fs/coda/inode.c5
-rw-r--r--fs/configfs/inode.c7
-rw-r--r--fs/cramfs/inode.c11
-rw-r--r--fs/dax.c33
-rw-r--r--fs/dcache.c5
-rw-r--r--fs/debugfs/file.c48
-rw-r--r--fs/debugfs/inode.c3
-rw-r--r--fs/devpts/inode.c16
-rw-r--r--fs/dlm/config.c2
-rw-r--r--fs/dlm/debug_fs.c101
-rw-r--r--fs/dlm/dir.c14
-rw-r--r--fs/dlm/dir.h6
-rw-r--r--fs/dlm/dlm_internal.h1
-rw-r--r--fs/dlm/lock.c120
-rw-r--r--fs/dlm/lock.h16
-rw-r--r--fs/dlm/lowcomms.c1
-rw-r--r--fs/dlm/member.c15
-rw-r--r--fs/dlm/member.h2
-rw-r--r--fs/dlm/midcomms.c304
-rw-r--r--fs/dlm/midcomms.h1
-rw-r--r--fs/dlm/plock.c178
-rw-r--r--fs/dlm/rcom.c102
-rw-r--r--fs/dlm/rcom.h15
-rw-r--r--fs/dlm/recover.c60
-rw-r--r--fs/dlm/recover.h14
-rw-r--r--fs/dlm/recoverd.c16
-rw-r--r--fs/dlm/requestqueue.c3
-rw-r--r--fs/dlm/requestqueue.h3
-rw-r--r--fs/drop_caches.c2
-rw-r--r--fs/ecryptfs/crypto.c8
-rw-r--r--fs/ecryptfs/inode.c7
-rw-r--r--fs/ecryptfs/mmap.c5
-rw-r--r--fs/ecryptfs/read_write.c12
-rw-r--r--fs/efivarfs/file.c2
-rw-r--r--fs/efivarfs/inode.c2
-rw-r--r--fs/efivarfs/super.c14
-rw-r--r--fs/efs/Kconfig1
-rw-r--r--fs/efs/efs.h5
-rw-r--r--fs/efs/inode.c4
-rw-r--r--fs/erofs/Kconfig16
-rw-r--r--fs/erofs/Makefile1
-rw-r--r--fs/erofs/compress.h2
-rw-r--r--fs/erofs/data.c6
-rw-r--r--fs/erofs/decompressor.c6
-rw-r--r--fs/erofs/decompressor_deflate.c247
-rw-r--r--fs/erofs/decompressor_lzma.c5
-rw-r--r--fs/erofs/erofs_fs.h17
-rw-r--r--fs/erofs/inode.c14
-rw-r--r--fs/erofs/internal.h23
-rw-r--r--fs/erofs/super.c46
-rw-r--r--fs/erofs/xattr.c14
-rw-r--r--fs/erofs/zdata.c274
-rw-r--r--fs/erofs/zmap.c5
-rw-r--r--fs/eventfd.c2
-rw-r--r--fs/eventpoll.c12
-rw-r--r--fs/exec.c5
-rw-r--r--fs/exfat/Kconfig1
-rw-r--r--fs/exfat/exfat_fs.h2
-rw-r--r--fs/exfat/file.c6
-rw-r--r--fs/exfat/inode.c6
-rw-r--r--fs/exfat/namei.c26
-rw-r--r--fs/exfat/super.c42
-rw-r--r--fs/exportfs/expfs.c1
-rw-r--r--fs/ext2/Kconfig1
-rw-r--r--fs/ext2/acl.c2
-rw-r--r--fs/ext2/balloc.c138
-rw-r--r--fs/ext2/dir.c6
-rw-r--r--fs/ext2/ext2.h14
-rw-r--r--fs/ext2/file.c2
-rw-r--r--fs/ext2/ialloc.c5
-rw-r--r--fs/ext2/inode.c36
-rw-r--r--fs/ext2/ioctl.c4
-rw-r--r--fs/ext2/namei.c8
-rw-r--r--fs/ext2/super.c2
-rw-r--r--fs/ext2/xattr.c9
-rw-r--r--fs/ext4/Kconfig1
-rw-r--r--fs/ext4/acl.c2
-rw-r--r--fs/ext4/balloc.c15
-rw-r--r--fs/ext4/block_validity.c8
-rw-r--r--fs/ext4/crypto.c4
-rw-r--r--fs/ext4/ext4.h124
-rw-r--r--fs/ext4/ext4_jbd2.c8
-rw-r--r--fs/ext4/extents.c12
-rw-r--r--fs/ext4/extents_status.c44
-rw-r--r--fs/ext4/file.c49
-rw-r--r--fs/ext4/fsync.c9
-rw-r--r--fs/ext4/hash.c2
-rw-r--r--fs/ext4/ialloc.c10
-rw-r--r--fs/ext4/inline.c6
-rw-r--r--fs/ext4/inode-test.c6
-rw-r--r--fs/ext4/inode.c148
-rw-r--r--fs/ext4/ioctl.c11
-rw-r--r--fs/ext4/mballoc.c254
-rw-r--r--fs/ext4/mballoc.h14
-rw-r--r--fs/ext4/mmp.c2
-rw-r--r--fs/ext4/move_extent.c19
-rw-r--r--fs/ext4/namei.c69
-rw-r--r--fs/ext4/page-io.c2
-rw-r--r--fs/ext4/super.c377
-rw-r--r--fs/ext4/xattr.c8
-rw-r--r--fs/f2fs/Kconfig1
-rw-r--r--fs/f2fs/checkpoint.c2
-rw-r--r--fs/f2fs/compress.c16
-rw-r--r--fs/f2fs/data.c14
-rw-r--r--fs/f2fs/debug.c33
-rw-r--r--fs/f2fs/dir.c8
-rw-r--r--fs/f2fs/f2fs.h102
-rw-r--r--fs/f2fs/file.c84
-rw-r--r--fs/f2fs/gc.c26
-rw-r--r--fs/f2fs/inline.c5
-rw-r--r--fs/f2fs/inode.c45
-rw-r--r--fs/f2fs/namei.c12
-rw-r--r--fs/f2fs/recovery.c5
-rw-r--r--fs/f2fs/segment.c89
-rw-r--r--fs/f2fs/super.c43
-rw-r--r--fs/f2fs/sysfs.c18
-rw-r--r--fs/f2fs/xattr.c6
-rw-r--r--fs/fat/Kconfig1
-rw-r--r--fs/fat/fat.h3
-rw-r--r--fs/fat/file.c2
-rw-r--r--fs/fat/inode.c5
-rw-r--r--fs/fat/misc.c10
-rw-r--r--fs/fcntl.c29
-rw-r--r--fs/file.c30
-rw-r--r--fs/file_table.c5
-rw-r--r--fs/freevxfs/Kconfig1
-rw-r--r--fs/freevxfs/vxfs_inode.c3
-rw-r--r--fs/fs-writeback.c15
-rw-r--r--fs/fs_context.c70
-rw-r--r--fs/fs_struct.c4
-rw-r--r--fs/fsopen.c106
-rw-r--r--fs/fuse/control.c2
-rw-r--r--fs/fuse/dax.c20
-rw-r--r--fs/fuse/dir.c169
-rw-r--r--fs/fuse/file.c115
-rw-r--r--fs/fuse/fuse_i.h18
-rw-r--r--fs/fuse/inode.c50
-rw-r--r--fs/fuse/readdir.c16
-rw-r--r--fs/gfs2/Kconfig1
-rw-r--r--fs/gfs2/acl.c2
-rw-r--r--fs/gfs2/aops.c9
-rw-r--r--fs/gfs2/bmap.c15
-rw-r--r--fs/gfs2/dir.c15
-rw-r--r--fs/gfs2/file.c27
-rw-r--r--fs/gfs2/glock.c51
-rw-r--r--fs/gfs2/glock.h9
-rw-r--r--fs/gfs2/glops.c15
-rw-r--r--fs/gfs2/incore.h7
-rw-r--r--fs/gfs2/inode.c30
-rw-r--r--fs/gfs2/lock_dlm.c5
-rw-r--r--fs/gfs2/log.c69
-rw-r--r--fs/gfs2/lops.c7
-rw-r--r--fs/gfs2/main.c10
-rw-r--r--fs/gfs2/ops_fstype.c42
-rw-r--r--fs/gfs2/quota.c370
-rw-r--r--fs/gfs2/quota.h3
-rw-r--r--fs/gfs2/recovery.c4
-rw-r--r--fs/gfs2/recovery.h2
-rw-r--r--fs/gfs2/super.c44
-rw-r--r--fs/gfs2/super.h1
-rw-r--r--fs/gfs2/sys.c16
-rw-r--r--fs/gfs2/util.c34
-rw-r--r--fs/gfs2/xattr.c8
-rw-r--r--fs/hfs/Kconfig1
-rw-r--r--fs/hfs/catalog.c8
-rw-r--r--fs/hfs/dir.c2
-rw-r--r--fs/hfs/inode.c13
-rw-r--r--fs/hfs/sysdep.c4
-rw-r--r--fs/hfsplus/Kconfig1
-rw-r--r--fs/hfsplus/catalog.c8
-rw-r--r--fs/hfsplus/dir.c6
-rw-r--r--fs/hfsplus/extents.c6
-rw-r--r--fs/hfsplus/inode.c18
-rw-r--r--fs/hostfs/hostfs_kern.c3
-rw-r--r--fs/hpfs/Kconfig1
-rw-r--r--fs/hpfs/dir.c8
-rw-r--r--fs/hpfs/inode.c6
-rw-r--r--fs/hpfs/namei.c29
-rw-r--r--fs/hpfs/super.c5
-rw-r--r--fs/hugetlbfs/inode.c69
-rw-r--r--fs/inode.c159
-rw-r--r--fs/internal.h10
-rw-r--r--fs/ioctl.c18
-rw-r--r--fs/iomap/buffered-io.c501
-rw-r--r--fs/iomap/direct-io.c163
-rw-r--r--fs/isofs/Kconfig1
-rw-r--r--fs/isofs/inode.c9
-rw-r--r--fs/isofs/rock.c16
-rw-r--r--fs/jbd2/checkpoint.c34
-rw-r--r--fs/jbd2/commit.c16
-rw-r--r--fs/jbd2/journal.c521
-rw-r--r--fs/jbd2/recovery.c12
-rw-r--r--fs/jbd2/transaction.c12
-rw-r--r--fs/jffs2/dir.c24
-rw-r--r--fs/jffs2/file.c3
-rw-r--r--fs/jffs2/fs.c10
-rw-r--r--fs/jffs2/os-linux.h2
-rw-r--r--fs/jfs/Kconfig2
-rw-r--r--fs/jfs/Makefile2
-rw-r--r--fs/jfs/acl.c2
-rw-r--r--fs/jfs/inode.c2
-rw-r--r--fs/jfs/ioctl.c2
-rw-r--r--fs/jfs/jfs_dmap.c1
-rw-r--r--fs/jfs/jfs_extent.c7
-rw-r--r--fs/jfs/jfs_imap.c9
-rw-r--r--fs/jfs/jfs_inode.c4
-rw-r--r--fs/jfs/jfs_unicode.h17
-rw-r--r--fs/jfs/jfs_uniupr.c121
-rw-r--r--fs/jfs/namei.c26
-rw-r--r--fs/jfs/super.c2
-rw-r--r--fs/jfs/xattr.c2
-rw-r--r--fs/kernel_read_file.c12
-rw-r--r--fs/kernfs/dir.c6
-rw-r--r--fs/kernfs/inode.c53
-rw-r--r--fs/kernfs/mount.c13
-rw-r--r--fs/libfs.c364
-rw-r--r--fs/lockd/mon.c3
-rw-r--r--fs/lockd/svc.c52
-rw-r--r--fs/lockd/svclock.c18
-rw-r--r--fs/locks.c54
-rw-r--r--fs/minix/Kconfig1
-rw-r--r--fs/minix/bitmap.c2
-rw-r--r--fs/minix/dir.c6
-rw-r--r--fs/minix/inode.c12
-rw-r--r--fs/minix/itree_common.c4
-rw-r--r--fs/minix/namei.c6
-rw-r--r--fs/namei.c14
-rw-r--r--fs/netfs/buffered_read.c6
-rw-r--r--fs/nfs/Kconfig6
-rw-r--r--fs/nfs/blocklayout/dev.c4
-rw-r--r--fs/nfs/callback.c23
-rw-r--r--fs/nfs/callback_proc.c2
-rw-r--r--fs/nfs/client.c2
-rw-r--r--fs/nfs/dir.c15
-rw-r--r--fs/nfs/direct.c174
-rw-r--r--fs/nfs/dns_resolve.c12
-rw-r--r--fs/nfs/file.c2
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.c18
-rw-r--r--fs/nfs/fscache.c3
-rw-r--r--fs/nfs/fscache.h4
-rw-r--r--fs/nfs/inode.c22
-rw-r--r--fs/nfs/internal.h3
-rw-r--r--fs/nfs/namespace.c3
-rw-r--r--fs/nfs/nfs2xdr.c2
-rw-r--r--fs/nfs/nfs3client.c3
-rw-r--r--fs/nfs/nfs3xdr.c2
-rw-r--r--fs/nfs/nfs42.h1
-rw-r--r--fs/nfs/nfs42proc.c13
-rw-r--r--fs/nfs/nfs42xdr.c17
-rw-r--r--fs/nfs/nfs4_fs.h2
-rw-r--r--fs/nfs/nfs4client.c9
-rw-r--r--fs/nfs/nfs4file.c2
-rw-r--r--fs/nfs/nfs4proc.c43
-rw-r--r--fs/nfs/nfs4state.c45
-rw-r--r--fs/nfs/pnfs.c33
-rw-r--r--fs/nfs/pnfs_dev.c2
-rw-r--r--fs/nfs/pnfs_nfs.c5
-rw-r--r--fs/nfs/read.c10
-rw-r--r--fs/nfs/super.c4
-rw-r--r--fs/nfs/sysfs.c4
-rw-r--r--fs/nfs/write.c27
-rw-r--r--fs/nfsd/blocklayoutxdr.c9
-rw-r--r--fs/nfsd/cache.h8
-rw-r--r--fs/nfsd/flexfilelayoutxdr.c9
-rw-r--r--fs/nfsd/nfs3proc.c4
-rw-r--r--fs/nfsd/nfs4acl.c34
-rw-r--r--fs/nfsd/nfs4proc.c55
-rw-r--r--fs/nfsd/nfs4state.c164
-rw-r--r--fs/nfsd/nfs4xdr.c43
-rw-r--r--fs/nfsd/nfscache.c204
-rw-r--r--fs/nfsd/nfsctl.c4
-rw-r--r--fs/nfsd/nfsd.h7
-rw-r--r--fs/nfsd/nfsfh.c26
-rw-r--r--fs/nfsd/nfsfh.h6
-rw-r--r--fs/nfsd/nfssvc.c116
-rw-r--r--fs/nfsd/state.h3
-rw-r--r--fs/nfsd/stats.c2
-rw-r--r--fs/nfsd/stats.h7
-rw-r--r--fs/nfsd/trace.h27
-rw-r--r--fs/nfsd/vfs.c54
-rw-r--r--fs/nfsd/xdr4.h11
-rw-r--r--fs/nilfs2/Kconfig1
-rw-r--r--fs/nilfs2/alloc.c3
-rw-r--r--fs/nilfs2/dir.c6
-rw-r--r--fs/nilfs2/file.c2
-rw-r--r--fs/nilfs2/gcinode.c6
-rw-r--r--fs/nilfs2/inode.c19
-rw-r--r--fs/nilfs2/ioctl.c2
-rw-r--r--fs/nilfs2/namei.c8
-rw-r--r--fs/nilfs2/segment.c5
-rw-r--r--fs/nilfs2/super.c81
-rw-r--r--fs/nls/Kconfig3
-rw-r--r--fs/nls/Makefile1
-rw-r--r--fs/nls/nls_ucs2_data.h15
-rw-r--r--fs/nls/nls_ucs2_utils.c (renamed from fs/smb/server/uniupr.h)156
-rw-r--r--fs/nls/nls_ucs2_utils.h285
-rw-r--r--fs/notify/dnotify/dnotify.c4
-rw-r--r--fs/notify/fanotify/fanotify_user.c25
-rw-r--r--fs/nsfs.c2
-rw-r--r--fs/ntfs/Kconfig1
-rw-r--r--fs/ntfs/inode.c15
-rw-r--r--fs/ntfs/mft.c3
-rw-r--r--fs/ntfs3/Kconfig1
-rw-r--r--fs/ntfs3/attrib.c12
-rw-r--r--fs/ntfs3/attrlist.c15
-rw-r--r--fs/ntfs3/bitmap.c4
-rw-r--r--fs/ntfs3/dir.c6
-rw-r--r--fs/ntfs3/file.c12
-rw-r--r--fs/ntfs3/frecord.c11
-rw-r--r--fs/ntfs3/fslog.c6
-rw-r--r--fs/ntfs3/fsntfs.c19
-rw-r--r--fs/ntfs3/index.c3
-rw-r--r--fs/ntfs3/inode.c25
-rw-r--r--fs/ntfs3/namei.c13
-rw-r--r--fs/ntfs3/ntfs.h2
-rw-r--r--fs/ntfs3/ntfs_fs.h4
-rw-r--r--fs/ntfs3/record.c74
-rw-r--r--fs/ntfs3/super.c154
-rw-r--r--fs/ntfs3/xattr.c11
-rw-r--r--fs/ocfs2/Kconfig1
-rw-r--r--fs/ocfs2/acl.c6
-rw-r--r--fs/ocfs2/alloc.c6
-rw-r--r--fs/ocfs2/aops.c2
-rw-r--r--fs/ocfs2/cluster/netdebug.c40
-rw-r--r--fs/ocfs2/cluster/quorum.c26
-rw-r--r--fs/ocfs2/dir.c8
-rw-r--r--fs/ocfs2/dlmfs/dlmfs.c4
-rw-r--r--fs/ocfs2/dlmglue.c7
-rw-r--r--fs/ocfs2/file.c25
-rw-r--r--fs/ocfs2/inode.c12
-rw-r--r--fs/ocfs2/journal.c25
-rw-r--r--fs/ocfs2/journal.h2
-rw-r--r--fs/ocfs2/move_extents.c6
-rw-r--r--fs/ocfs2/namei.c25
-rw-r--r--fs/ocfs2/refcounttree.c14
-rw-r--r--fs/ocfs2/stack_user.c13
-rw-r--r--fs/ocfs2/super.c3
-rw-r--r--fs/ocfs2/xattr.c6
-rw-r--r--fs/omfs/Kconfig1
-rw-r--r--fs/omfs/dir.c4
-rw-r--r--fs/omfs/file.c12
-rw-r--r--fs/omfs/inode.c9
-rw-r--r--fs/omfs/omfs_fs.h2
-rw-r--r--fs/open.c54
-rw-r--r--fs/openpromfs/inode.c5
-rw-r--r--fs/orangefs/inode.c7
-rw-r--r--fs/orangefs/namei.c2
-rw-r--r--fs/orangefs/orangefs-kernel.h2
-rw-r--r--fs/orangefs/orangefs-utils.c6
-rw-r--r--fs/overlayfs/Kconfig9
-rw-r--r--fs/overlayfs/copy_up.c59
-rw-r--r--fs/overlayfs/export.c36
-rw-r--r--fs/overlayfs/file.c42
-rw-r--r--fs/overlayfs/inode.c10
-rw-r--r--fs/overlayfs/namei.c89
-rw-r--r--fs/overlayfs/overlayfs.h68
-rw-r--r--fs/overlayfs/ovl_entry.h19
-rw-r--r--fs/overlayfs/params.c222
-rw-r--r--fs/overlayfs/super.c67
-rw-r--r--fs/overlayfs/util.c235
-rw-r--r--fs/pipe.c11
-rw-r--r--fs/posix_acl.c2
-rw-r--r--fs/proc/array.c6
-rw-r--r--fs/proc/base.c15
-rw-r--r--fs/proc/fd.c2
-rw-r--r--fs/proc/generic.c2
-rw-r--r--fs/proc/inode.c2
-rw-r--r--fs/proc/internal.h2
-rw-r--r--fs/proc/meminfo.c13
-rw-r--r--fs/proc/proc_net.c3
-rw-r--r--fs/proc/proc_sysctl.c94
-rw-r--r--fs/proc/root.c3
-rw-r--r--fs/proc/self.c2
-rw-r--r--fs/proc/task_mmu.c43
-rw-r--r--fs/proc/task_nommu.c79
-rw-r--r--fs/proc/thread_self.c2
-rw-r--r--fs/pstore/Kconfig100
-rw-r--r--fs/pstore/inode.c6
-rw-r--r--fs/pstore/platform.c371
-rw-r--r--fs/pstore/ram.c11
-rw-r--r--fs/pstore/ram_core.c17
-rw-r--r--fs/qnx4/Kconfig1
-rw-r--r--fs/qnx4/inode.c3
-rw-r--r--fs/qnx6/Kconfig1
-rw-r--r--fs/qnx6/inode.c3
-rw-r--r--fs/quota/dquot.c275
-rw-r--r--fs/ramfs/inode.c6
-rw-r--r--fs/read_write.c2
-rw-r--r--fs/reiserfs/Kconfig1
-rw-r--r--fs/reiserfs/fix_node.c5
-rw-r--r--fs/reiserfs/inode.c12
-rw-r--r--fs/reiserfs/ioctl.c4
-rw-r--r--fs/reiserfs/journal.c4
-rw-r--r--fs/reiserfs/namei.c18
-rw-r--r--fs/reiserfs/reiserfs.h8
-rw-r--r--fs/reiserfs/stree.c4
-rw-r--r--fs/reiserfs/super.c2
-rw-r--r--fs/reiserfs/xattr.c5
-rw-r--r--fs/reiserfs/xattr_acl.c2
-rw-r--r--fs/romfs/Kconfig1
-rw-r--r--fs/romfs/super.c13
-rw-r--r--fs/smb/client/Kconfig1
-rw-r--r--fs/smb/client/Makefile5
-rw-r--r--fs/smb/client/cached_dir.c158
-rw-r--r--fs/smb/client/cached_dir.h4
-rw-r--r--fs/smb/client/cifs_debug.c2
-rw-r--r--fs/smb/client/cifs_unicode.c1
-rw-r--r--fs/smb/client/cifs_unicode.h330
-rw-r--r--fs/smb/client/cifs_uniupr.h239
-rw-r--r--fs/smb/client/cifsfs.c16
-rw-r--r--fs/smb/client/cifsfs.h15
-rw-r--r--fs/smb/client/cifsglob.h77
-rw-r--r--fs/smb/client/cifsproto.h11
-rw-r--r--fs/smb/client/connect.c50
-rw-r--r--fs/smb/client/dfs.c271
-rw-r--r--fs/smb/client/dfs.h141
-rw-r--r--fs/smb/client/dfs_cache.c10
-rw-r--r--fs/smb/client/dfs_cache.h12
-rw-r--r--fs/smb/client/dir.c4
-rw-r--r--fs/smb/client/file.c4
-rw-r--r--fs/smb/client/fs_context.c12
-rw-r--r--fs/smb/client/fs_context.h4
-rw-r--r--fs/smb/client/fscache.c4
-rw-r--r--fs/smb/client/fscache.h5
-rw-r--r--fs/smb/client/inode.c516
-rw-r--r--fs/smb/client/misc.c16
-rw-r--r--fs/smb/client/namespace.c (renamed from fs/smb/client/cifs_dfs_ref.c)113
-rw-r--r--fs/smb/client/readdir.c23
-rw-r--r--fs/smb/client/sess.c72
-rw-r--r--fs/smb/client/smb1ops.c26
-rw-r--r--fs/smb/client/smb2inode.c206
-rw-r--r--fs/smb/client/smb2maperror.c2
-rw-r--r--fs/smb/client/smb2misc.c6
-rw-r--r--fs/smb/client/smb2ops.c308
-rw-r--r--fs/smb/client/smb2pdu.c34
-rw-r--r--fs/smb/client/smb2proto.h17
-rw-r--r--fs/smb/client/smb2transport.c4
-rw-r--r--fs/smb/client/smbdirect.c9
-rw-r--r--fs/smb/client/trace.h4
-rw-r--r--fs/smb/client/transport.c65
-rw-r--r--fs/smb/common/smb2pdu.h24
-rw-r--r--fs/smb/server/Kconfig3
-rw-r--r--fs/smb/server/asn1.c4
-rw-r--r--fs/smb/server/auth.c14
-rw-r--r--fs/smb/server/connection.c58
-rw-r--r--fs/smb/server/connection.h3
-rw-r--r--fs/smb/server/ksmbd_work.c93
-rw-r--r--fs/smb/server/ksmbd_work.h34
-rw-r--r--fs/smb/server/mgmt/share_config.h29
-rw-r--r--fs/smb/server/mgmt/tree_connect.c42
-rw-r--r--fs/smb/server/mgmt/tree_connect.h11
-rw-r--r--fs/smb/server/mgmt/user_session.c11
-rw-r--r--fs/smb/server/mgmt/user_session.h1
-rw-r--r--fs/smb/server/oplock.c31
-rw-r--r--fs/smb/server/server.c16
-rw-r--r--fs/smb/server/smb2misc.c4
-rw-r--r--fs/smb/server/smb2pdu.c630
-rw-r--r--fs/smb/server/smb2pdu.h2
-rw-r--r--fs/smb/server/smb_common.c13
-rw-r--r--fs/smb/server/smbacl.c1
-rw-r--r--fs/smb/server/transport_rdma.c29
-rw-r--r--fs/smb/server/unicode.c1
-rw-r--r--fs/smb/server/unicode.h325
-rw-r--r--fs/smb/server/vfs.c7
-rw-r--r--fs/smb/server/vfs.h4
-rw-r--r--fs/smb/server/vfs_cache.c30
-rw-r--r--fs/smb/server/vfs_cache.h9
-rw-r--r--fs/splice.c67
-rw-r--r--fs/squashfs/inode.c2
-rw-r--r--fs/stack.c2
-rw-r--r--fs/stat.c49
-rw-r--r--fs/super.c888
-rw-r--r--fs/sysv/Kconfig1
-rw-r--r--fs/sysv/dir.c6
-rw-r--r--fs/sysv/ialloc.c2
-rw-r--r--fs/sysv/inode.c5
-rw-r--r--fs/sysv/itree.c7
-rw-r--r--fs/sysv/namei.c6
-rw-r--r--fs/tracefs/Makefile1
-rw-r--r--fs/tracefs/event_inode.c905
-rw-r--r--fs/tracefs/inode.c162
-rw-r--r--fs/tracefs/internal.h30
-rw-r--r--fs/ubifs/debug.c4
-rw-r--r--fs/ubifs/dir.c41
-rw-r--r--fs/ubifs/file.c31
-rw-r--r--fs/ubifs/ioctl.c2
-rw-r--r--fs/ubifs/journal.c4
-rw-r--r--fs/ubifs/super.c4
-rw-r--r--fs/ubifs/ubifs.h2
-rw-r--r--fs/ubifs/xattr.c6
-rw-r--r--fs/udf/Kconfig1
-rw-r--r--fs/udf/directory.c2
-rw-r--r--fs/udf/file.c8
-rw-r--r--fs/udf/ialloc.c2
-rw-r--r--fs/udf/inode.c19
-rw-r--r--fs/udf/namei.c24
-rw-r--r--fs/udf/symlink.c2
-rw-r--r--fs/ufs/Kconfig1
-rw-r--r--fs/ufs/dir.c6
-rw-r--r--fs/ufs/ialloc.c2
-rw-r--r--fs/ufs/inode.c23
-rw-r--r--fs/ufs/namei.c8
-rw-r--r--fs/ufs/util.h6
-rw-r--r--fs/userfaultfd.c140
-rw-r--r--fs/vboxsf/utils.c6
-rw-r--r--fs/verity/fsverity_private.h12
-rw-r--r--fs/verity/hash_algs.c8
-rw-r--r--fs/verity/init.c56
-rw-r--r--fs/verity/open.c18
-rw-r--r--fs/verity/signature.c77
-rw-r--r--fs/verity/verify.c11
-rw-r--r--fs/xattr.c83
-rw-r--r--fs/xfs/Kconfig18
-rw-r--r--fs/xfs/Makefile11
-rw-r--r--fs/xfs/libxfs/xfs_ag.c6
-rw-r--r--fs/xfs/libxfs/xfs_fs.h6
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.c5
-rw-r--r--fs/xfs/libxfs/xfs_log_recover.h22
-rw-r--r--fs/xfs/libxfs/xfs_sb.c3
-rw-r--r--fs/xfs/libxfs/xfs_trans_inode.c2
-rw-r--r--fs/xfs/scrub/agheader_repair.c101
-rw-r--r--fs/xfs/scrub/bitmap.c78
-rw-r--r--fs/xfs/scrub/bitmap.h10
-rw-r--r--fs/xfs/scrub/bmap.c42
-rw-r--r--fs/xfs/scrub/common.c215
-rw-r--r--fs/xfs/scrub/common.h39
-rw-r--r--fs/xfs/scrub/fscounters.c188
-rw-r--r--fs/xfs/scrub/health.c10
-rw-r--r--fs/xfs/scrub/ialloc.c3
-rw-r--r--fs/xfs/scrub/inode.c11
-rw-r--r--fs/xfs/scrub/parent.c4
-rw-r--r--fs/xfs/scrub/quota.c15
-rw-r--r--fs/xfs/scrub/reap.c498
-rw-r--r--fs/xfs/scrub/reap.h12
-rw-r--r--fs/xfs/scrub/repair.c377
-rw-r--r--fs/xfs/scrub/repair.h25
-rw-r--r--fs/xfs/scrub/rtbitmap.c48
-rw-r--r--fs/xfs/scrub/rtsummary.c264
-rw-r--r--fs/xfs/scrub/scrub.c52
-rw-r--r--fs/xfs/scrub/scrub.h5
-rw-r--r--fs/xfs/scrub/stats.c408
-rw-r--r--fs/xfs/scrub/stats.h59
-rw-r--r--fs/xfs/scrub/trace.c4
-rw-r--r--fs/xfs/scrub/trace.h417
-rw-r--r--fs/xfs/scrub/xfarray.c1083
-rw-r--r--fs/xfs/scrub/xfarray.h141
-rw-r--r--fs/xfs/scrub/xfile.c419
-rw-r--r--fs/xfs/scrub/xfile.h77
-rw-r--r--fs/xfs/xfs_acl.c2
-rw-r--r--fs/xfs/xfs_aops.c4
-rw-r--r--fs/xfs/xfs_attr_inactive.c1
-rw-r--r--fs/xfs/xfs_attr_item.c7
-rw-r--r--fs/xfs/xfs_bmap_item.c4
-rw-r--r--fs/xfs/xfs_bmap_util.c6
-rw-r--r--fs/xfs/xfs_buf.c16
-rw-r--r--fs/xfs/xfs_buf.h13
-rw-r--r--fs/xfs/xfs_discard.c266
-rw-r--r--fs/xfs/xfs_discard.h6
-rw-r--r--fs/xfs/xfs_dquot.c2
-rw-r--r--fs/xfs/xfs_export.c14
-rw-r--r--fs/xfs/xfs_extent_busy.c35
-rw-r--r--fs/xfs/xfs_extent_busy.h24
-rw-r--r--fs/xfs/xfs_extfree_item.c4
-rw-r--r--fs/xfs/xfs_file.c24
-rw-r--r--fs/xfs/xfs_fsmap.c25
-rw-r--r--fs/xfs/xfs_icache.c118
-rw-r--r--fs/xfs/xfs_icache.h5
-rw-r--r--fs/xfs/xfs_inode.c234
-rw-r--r--fs/xfs/xfs_inode.h34
-rw-r--r--fs/xfs/xfs_inode_item.c2
-rw-r--r--fs/xfs/xfs_iops.c26
-rw-r--r--fs/xfs/xfs_itable.c15
-rw-r--r--fs/xfs/xfs_linux.h1
-rw-r--r--fs/xfs/xfs_log.c17
-rw-r--r--fs/xfs/xfs_log_cil.c145
-rw-r--r--fs/xfs/xfs_log_priv.h19
-rw-r--r--fs/xfs/xfs_log_recover.c4
-rw-r--r--fs/xfs/xfs_mount.c9
-rw-r--r--fs/xfs/xfs_mount.h21
-rw-r--r--fs/xfs/xfs_notify_failure.c6
-rw-r--r--fs/xfs/xfs_qm.c16
-rw-r--r--fs/xfs/xfs_refcount_item.c6
-rw-r--r--fs/xfs/xfs_rmap_item.c6
-rw-r--r--fs/xfs/xfs_super.c275
-rw-r--r--fs/xfs/xfs_super.h2
-rw-r--r--fs/xfs/xfs_trace.h68
-rw-r--r--fs/xfs/xfs_xattr.c11
-rw-r--r--fs/zonefs/file.c2
-rw-r--r--fs/zonefs/super.c8
655 files changed, 20295 insertions, 10030 deletions
diff --git a/fs/9p/cache.c b/fs/9p/cache.c
index cebba4eaa0b5..12c0ae29f185 100644
--- a/fs/9p/cache.c
+++ b/fs/9p/cache.c
@@ -68,6 +68,8 @@ void v9fs_cache_inode_get_cookie(struct inode *inode)
&path, sizeof(path),
&version, sizeof(version),
i_size_read(&v9inode->netfs.inode));
+ if (v9inode->netfs.cache)
+ mapping_set_release_always(inode->i_mapping);
p9_debug(P9_DEBUG_FSC, "inode %p get cookie %p\n",
inode, v9fs_inode_cookie(v9inode));
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 950cf61f118b..0d28ecf668d0 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -260,7 +260,7 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses,
inode_init_owner(&nop_mnt_idmap, inode, NULL, mode);
inode->i_blocks = 0;
inode->i_rdev = rdev;
- inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
inode->i_mapping->a_ops = &v9fs_addr_operations;
inode->i_private = NULL;
@@ -1011,7 +1011,7 @@ v9fs_vfs_getattr(struct mnt_idmap *idmap, const struct path *path,
p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry);
v9ses = v9fs_dentry2v9ses(dentry);
if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) {
- generic_fillattr(&nop_mnt_idmap, inode, stat);
+ generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
return 0;
} else if (v9ses->cache & CACHE_WRITEBACK) {
if (S_ISREG(inode->i_mode)) {
@@ -1032,7 +1032,7 @@ v9fs_vfs_getattr(struct mnt_idmap *idmap, const struct path *path,
return PTR_ERR(st);
v9fs_stat2inode(st, d_inode(dentry), dentry->d_sb, 0);
- generic_fillattr(&nop_mnt_idmap, d_inode(dentry), stat);
+ generic_fillattr(&nop_mnt_idmap, request_mask, d_inode(dentry), stat);
p9stat_free(st);
kfree(st);
@@ -1152,7 +1152,7 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
inode->i_atime.tv_sec = stat->atime;
inode->i_mtime.tv_sec = stat->mtime;
- inode->i_ctime.tv_sec = stat->mtime;
+ inode_set_ctime(inode, stat->mtime, 0);
inode->i_uid = v9ses->dfltuid;
inode->i_gid = v9ses->dfltgid;
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 14510872ecc3..1312f68965ac 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -450,7 +450,7 @@ v9fs_vfs_getattr_dotl(struct mnt_idmap *idmap,
p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry);
v9ses = v9fs_dentry2v9ses(dentry);
if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) {
- generic_fillattr(&nop_mnt_idmap, inode, stat);
+ generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
return 0;
} else if (v9ses->cache) {
if (S_ISREG(inode->i_mode)) {
@@ -475,7 +475,7 @@ v9fs_vfs_getattr_dotl(struct mnt_idmap *idmap,
return PTR_ERR(st);
v9fs_stat2inode_dotl(st, d_inode(dentry), 0);
- generic_fillattr(&nop_mnt_idmap, d_inode(dentry), stat);
+ generic_fillattr(&nop_mnt_idmap, request_mask, d_inode(dentry), stat);
/* Change block size to what the server returned */
stat->blksize = st->st_blksize;
@@ -645,8 +645,8 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode,
inode->i_atime.tv_nsec = stat->st_atime_nsec;
inode->i_mtime.tv_sec = stat->st_mtime_sec;
inode->i_mtime.tv_nsec = stat->st_mtime_nsec;
- inode->i_ctime.tv_sec = stat->st_ctime_sec;
- inode->i_ctime.tv_nsec = stat->st_ctime_nsec;
+ inode_set_ctime(inode, stat->st_ctime_sec,
+ stat->st_ctime_nsec);
inode->i_uid = stat->st_uid;
inode->i_gid = stat->st_gid;
set_nlink(inode, stat->st_nlink);
@@ -668,8 +668,8 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode,
inode->i_mtime.tv_nsec = stat->st_mtime_nsec;
}
if (stat->st_result_mask & P9_STATS_CTIME) {
- inode->i_ctime.tv_sec = stat->st_ctime_sec;
- inode->i_ctime.tv_nsec = stat->st_ctime_nsec;
+ inode_set_ctime(inode, stat->st_ctime_sec,
+ stat->st_ctime_nsec);
}
if (stat->st_result_mask & P9_STATS_UID)
inode->i_uid = stat->st_uid;
diff --git a/fs/Kconfig b/fs/Kconfig
index 18d034ec7953..aa7e03cc1941 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -18,8 +18,12 @@ config VALIDATE_FS_PARSER
config FS_IOMAP
bool
+config BUFFER_HEAD
+ bool
+
# old blockdev_direct_IO implementation. Use iomap for new code instead
config LEGACY_DIRECT_IO
+ depends on BUFFER_HEAD
bool
if BLOCK
@@ -169,6 +173,7 @@ source "fs/sysfs/Kconfig"
config TMPFS
bool "Tmpfs virtual memory file system support (former shm fs)"
depends on SHMEM
+ select MEMFD_CREATE
help
Tmpfs is a file system which keeps all files in virtual memory.
@@ -205,8 +210,8 @@ config TMPFS_XATTR
Extended attributes are name:value pairs associated with inodes by
the kernel or by users (see the attr(5) manual page for details).
- Currently this enables support for the trusted.* and
- security.* namespaces.
+ This enables support for the trusted.*, security.* and user.*
+ namespaces.
You need this for POSIX ACL support on tmpfs.
@@ -233,6 +238,18 @@ config TMPFS_INODE64
If unsure, say N.
+config TMPFS_QUOTA
+ bool "Tmpfs quota support"
+ depends on TMPFS
+ select QUOTA
+ help
+ Quota support allows to set per user and group limits for tmpfs
+ usage. Say Y to enable quota support. Once enabled you can control
+ user and group quota enforcement with quota, usrquota and grpquota
+ mount options.
+
+ If unsure, say N.
+
config ARCH_SUPPORTS_HUGETLBFS
def_bool n
@@ -240,6 +257,7 @@ config HUGETLBFS
bool "HugeTLB file system support"
depends on X86 || IA64 || SPARC64 || ARCH_SUPPORTS_HUGETLBFS || BROKEN
depends on (SYSFS || SYSCTL)
+ select MEMFD_CREATE
help
hugetlbfs is a filesystem backing for HugeTLB pages, based on
ramfs. For architectures that support it, say Y here and read
@@ -252,7 +270,7 @@ config HUGETLB_PAGE
config HUGETLB_PAGE_OPTIMIZE_VMEMMAP
def_bool HUGETLB_PAGE
- depends on ARCH_WANT_OPTIMIZE_VMEMMAP
+ depends on ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP
depends on SPARSEMEM_VMEMMAP
config HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON
@@ -264,9 +282,6 @@ config HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON
enable HVO by default. It can be disabled via hugetlb_free_vmemmap=off
(boot command line) or hugetlb_optimize_vmemmap (sysctl).
-config MEMFD_CREATE
- def_bool TMPFS || HUGETLBFS
-
config ARCH_HAS_GIGANTIC_PAGE
bool
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index 93539aac0e5b..f5693164ca9a 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -58,7 +58,7 @@ config ARCH_USE_GNU_PROPERTY
config BINFMT_ELF_FDPIC
bool "Kernel support for FDPIC ELF binaries"
default y if !BINFMT_ELF
- depends on ARM || ((M68K || SUPERH || XTENSA) && !MMU)
+ depends on ARM || ((M68K || RISCV || SUPERH || XTENSA) && !MMU)
select ELFCORE
help
ELF FDPIC binaries are based on ELF, but allow the individual load
diff --git a/fs/Makefile b/fs/Makefile
index e513aaee0603..f9541f40be4e 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -17,7 +17,7 @@ obj-y := open.o read_write.o file_table.o super.o \
fs_types.o fs_context.o fs_parser.o fsopen.o init.o \
kernel_read_file.o mnt_idmapping.o remap_range.o
-obj-$(CONFIG_BLOCK) += buffer.o mpage.o
+obj-$(CONFIG_BUFFER_HEAD) += buffer.o mpage.o
obj-$(CONFIG_PROC_FS) += proc_namespace.o
obj-$(CONFIG_LEGACY_DIRECT_IO) += direct-io.o
obj-y += notify/
diff --git a/fs/adfs/Kconfig b/fs/adfs/Kconfig
index 44738fed6625..1b97058f0c4a 100644
--- a/fs/adfs/Kconfig
+++ b/fs/adfs/Kconfig
@@ -2,6 +2,7 @@
config ADFS_FS
tristate "ADFS file system support"
depends on BLOCK
+ select BUFFER_HEAD
help
The Acorn Disc Filing System is the standard file system of the
RiscOS operating system which runs on Acorn's ARM-based Risc PC
diff --git a/fs/adfs/dir_f.h b/fs/adfs/dir_f.h
index a5393e6cf9f4..4e6c53d59ebd 100644
--- a/fs/adfs/dir_f.h
+++ b/fs/adfs/dir_f.h
@@ -58,9 +58,4 @@ struct adfs_newdirtail {
__u8 dircheckbyte;
} __attribute__((packed));
-union adfs_dirtail {
- struct adfs_olddirtail old;
- struct adfs_newdirtail new;
-};
-
#endif
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index c3ac613d0975..20963002578a 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -270,7 +270,7 @@ adfs_iget(struct super_block *sb, struct object_info *obj)
inode->i_mode = adfs_atts2mode(sb, inode);
adfs_adfs2unix_time(&inode->i_mtime, inode);
inode->i_atime = inode->i_mtime;
- inode->i_ctime = inode->i_mtime;
+ inode_set_ctime_to_ts(inode, inode->i_mtime);
if (S_ISDIR(inode->i_mode)) {
inode->i_op = &adfs_dir_inode_operations;
@@ -331,7 +331,7 @@ adfs_notify_change(struct mnt_idmap *idmap, struct dentry *dentry,
if (ia_valid & ATTR_ATIME)
inode->i_atime = attr->ia_atime;
if (ia_valid & ATTR_CTIME)
- inode->i_ctime = attr->ia_ctime;
+ inode_set_ctime_to_ts(inode, attr->ia_ctime);
if (ia_valid & ATTR_MODE) {
ADFS_I(inode)->attr = adfs_mode2atts(sb, inode, attr->ia_mode);
inode->i_mode = adfs_atts2mode(sb, inode);
diff --git a/fs/affs/Kconfig b/fs/affs/Kconfig
index 962b86374e1c..1ae432d266c3 100644
--- a/fs/affs/Kconfig
+++ b/fs/affs/Kconfig
@@ -2,6 +2,7 @@
config AFFS_FS
tristate "Amiga FFS file system support"
depends on BLOCK
+ select BUFFER_HEAD
select LEGACY_DIRECT_IO
help
The Fast File System (FFS) is the common file system used on hard
diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c
index 29f11e10a7c7..7ba93efc1143 100644
--- a/fs/affs/amigaffs.c
+++ b/fs/affs/amigaffs.c
@@ -60,7 +60,7 @@ affs_insert_hash(struct inode *dir, struct buffer_head *bh)
mark_buffer_dirty_inode(dir_bh, dir);
affs_brelse(dir_bh);
- dir->i_mtime = dir->i_ctime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
inode_inc_iversion(dir);
mark_inode_dirty(dir);
@@ -114,7 +114,7 @@ affs_remove_hash(struct inode *dir, struct buffer_head *rem_bh)
affs_brelse(bh);
- dir->i_mtime = dir->i_ctime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
inode_inc_iversion(dir);
mark_inode_dirty(dir);
@@ -315,7 +315,7 @@ affs_remove_header(struct dentry *dentry)
else
clear_nlink(inode);
affs_unlock_link(inode);
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
mark_inode_dirty(inode);
done:
diff --git a/fs/affs/file.c b/fs/affs/file.c
index e43f2f007ac1..04c018e19602 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -15,6 +15,7 @@
#include <linux/uio.h>
#include <linux/blkdev.h>
+#include <linux/mpage.h>
#include "affs.h"
static struct buffer_head *affs_get_extblock_slow(struct inode *inode, u32 ext);
@@ -370,9 +371,10 @@ err_alloc:
return -ENOSPC;
}
-static int affs_writepage(struct page *page, struct writeback_control *wbc)
+static int affs_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
{
- return block_write_full_page(page, affs_get_block, wbc);
+ return mpage_writepages(mapping, wbc, affs_get_block);
}
static int affs_read_folio(struct file *file, struct folio *folio)
@@ -456,10 +458,11 @@ const struct address_space_operations affs_aops = {
.dirty_folio = block_dirty_folio,
.invalidate_folio = block_invalidate_folio,
.read_folio = affs_read_folio,
- .writepage = affs_writepage,
+ .writepages = affs_writepages,
.write_begin = affs_write_begin,
.write_end = affs_write_end,
.direct_IO = affs_direct_IO,
+ .migrate_folio = buffer_migrate_folio,
.bmap = _affs_bmap
};
@@ -520,21 +523,20 @@ affs_getemptyblk_ino(struct inode *inode, int block)
return ERR_PTR(err);
}
-static int
-affs_do_readpage_ofs(struct page *page, unsigned to, int create)
+static int affs_do_read_folio_ofs(struct folio *folio, size_t to, int create)
{
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
struct super_block *sb = inode->i_sb;
struct buffer_head *bh;
- unsigned pos = 0;
- u32 bidx, boff, bsize;
+ size_t pos = 0;
+ size_t bidx, boff, bsize;
u32 tmp;
- pr_debug("%s(%lu, %ld, 0, %d)\n", __func__, inode->i_ino,
- page->index, to);
- BUG_ON(to > PAGE_SIZE);
+ pr_debug("%s(%lu, %ld, 0, %zu)\n", __func__, inode->i_ino,
+ folio->index, to);
+ BUG_ON(to > folio_size(folio));
bsize = AFFS_SB(sb)->s_data_blksize;
- tmp = page->index << PAGE_SHIFT;
+ tmp = folio_pos(folio);
bidx = tmp / bsize;
boff = tmp % bsize;
@@ -544,7 +546,7 @@ affs_do_readpage_ofs(struct page *page, unsigned to, int create)
return PTR_ERR(bh);
tmp = min(bsize - boff, to - pos);
BUG_ON(pos + tmp > to || tmp > bsize);
- memcpy_to_page(page, pos, AFFS_DATA(bh) + boff, tmp);
+ memcpy_to_folio(folio, pos, AFFS_DATA(bh) + boff, tmp);
affs_brelse(bh);
bidx++;
pos += tmp;
@@ -624,25 +626,23 @@ out:
return PTR_ERR(bh);
}
-static int
-affs_read_folio_ofs(struct file *file, struct folio *folio)
+static int affs_read_folio_ofs(struct file *file, struct folio *folio)
{
- struct page *page = &folio->page;
- struct inode *inode = page->mapping->host;
- u32 to;
+ struct inode *inode = folio->mapping->host;
+ size_t to;
int err;
- pr_debug("%s(%lu, %ld)\n", __func__, inode->i_ino, page->index);
- to = PAGE_SIZE;
- if (((page->index + 1) << PAGE_SHIFT) > inode->i_size) {
- to = inode->i_size & ~PAGE_MASK;
- memset(page_address(page) + to, 0, PAGE_SIZE - to);
+ pr_debug("%s(%lu, %ld)\n", __func__, inode->i_ino, folio->index);
+ to = folio_size(folio);
+ if (folio_pos(folio) + to > inode->i_size) {
+ to = inode->i_size - folio_pos(folio);
+ folio_zero_segment(folio, to, folio_size(folio));
}
- err = affs_do_readpage_ofs(page, to, 0);
+ err = affs_do_read_folio_ofs(folio, to, 0);
if (!err)
- SetPageUptodate(page);
- unlock_page(page);
+ folio_mark_uptodate(folio);
+ folio_unlock(folio);
return err;
}
@@ -651,7 +651,7 @@ static int affs_write_begin_ofs(struct file *file, struct address_space *mapping
struct page **pagep, void **fsdata)
{
struct inode *inode = mapping->host;
- struct page *page;
+ struct folio *folio;
pgoff_t index;
int err = 0;
@@ -667,19 +667,20 @@ static int affs_write_begin_ofs(struct file *file, struct address_space *mapping
}
index = pos >> PAGE_SHIFT;
- page = grab_cache_page_write_begin(mapping, index);
- if (!page)
- return -ENOMEM;
- *pagep = page;
+ folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
+ mapping_gfp_mask(mapping));
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
+ *pagep = &folio->page;
- if (PageUptodate(page))
+ if (folio_test_uptodate(folio))
return 0;
/* XXX: inefficient but safe in the face of short writes */
- err = affs_do_readpage_ofs(page, PAGE_SIZE, 1);
+ err = affs_do_read_folio_ofs(folio, folio_size(folio), 1);
if (err) {
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
}
return err;
}
@@ -688,6 +689,7 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata)
{
+ struct folio *folio = page_folio(page);
struct inode *inode = mapping->host;
struct super_block *sb = inode->i_sb;
struct buffer_head *bh, *prev_bh;
@@ -701,18 +703,18 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping,
to = from + len;
/*
* XXX: not sure if this can handle short copies (len < copied), but
- * we don't have to, because the page should always be uptodate here,
+ * we don't have to, because the folio should always be uptodate here,
* due to write_begin.
*/
pr_debug("%s(%lu, %llu, %llu)\n", __func__, inode->i_ino, pos,
pos + len);
bsize = AFFS_SB(sb)->s_data_blksize;
- data = page_address(page);
+ data = folio_address(folio);
bh = NULL;
written = 0;
- tmp = (page->index << PAGE_SHIFT) + from;
+ tmp = (folio->index << PAGE_SHIFT) + from;
bidx = tmp / bsize;
boff = tmp % bsize;
if (boff) {
@@ -804,11 +806,11 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping,
from += tmp;
bidx++;
}
- SetPageUptodate(page);
+ folio_mark_uptodate(folio);
done:
affs_brelse(bh);
- tmp = (page->index << PAGE_SHIFT) + from;
+ tmp = (folio->index << PAGE_SHIFT) + from;
if (tmp > inode->i_size)
inode->i_size = AFFS_I(inode)->mmu_private = tmp;
@@ -819,8 +821,8 @@ done:
}
err_first_bh:
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
return written;
@@ -835,9 +837,10 @@ const struct address_space_operations affs_aops_ofs = {
.dirty_folio = block_dirty_folio,
.invalidate_folio = block_invalidate_folio,
.read_folio = affs_read_folio_ofs,
- //.writepage = affs_writepage_ofs,
+ //.writepages = affs_writepages_ofs,
.write_begin = affs_write_begin_ofs,
- .write_end = affs_write_end_ofs
+ .write_end = affs_write_end_ofs,
+ .migrate_folio = filemap_migrate_folio,
};
/* Free any preallocated blocks. */
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index 27f77a52c5c8..060746c63151 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -149,13 +149,13 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino)
break;
}
- inode->i_mtime.tv_sec = inode->i_atime.tv_sec = inode->i_ctime.tv_sec
- = (be32_to_cpu(tail->change.days) * 86400LL +
- be32_to_cpu(tail->change.mins) * 60 +
- be32_to_cpu(tail->change.ticks) / 50 +
- AFFS_EPOCH_DELTA) +
- sys_tz.tz_minuteswest * 60;
- inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_atime.tv_nsec = 0;
+ inode->i_mtime.tv_sec = inode->i_atime.tv_sec =
+ inode_set_ctime(inode,
+ (be32_to_cpu(tail->change.days) * 86400LL +
+ be32_to_cpu(tail->change.mins) * 60 +
+ be32_to_cpu(tail->change.ticks) / 50 + AFFS_EPOCH_DELTA)
+ + sys_tz.tz_minuteswest * 60, 0).tv_sec;
+ inode->i_mtime.tv_nsec = inode->i_atime.tv_nsec = 0;
affs_brelse(bh);
unlock_new_inode(inode);
return inode;
@@ -314,7 +314,7 @@ affs_new_inode(struct inode *dir)
inode->i_gid = current_fsgid();
inode->i_ino = block;
set_nlink(inode, 1);
- inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
atomic_set(&AFFS_I(inode)->i_opencnt, 0);
AFFS_I(inode)->i_blkcnt = 0;
AFFS_I(inode)->i_lc = NULL;
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index d12ccfd2a83d..2fe4a5832fcf 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -43,7 +43,7 @@ affs_get_toupper(struct super_block *sb)
* Note: the dentry argument is the parent dentry.
*/
static inline int
-__affs_hash_dentry(const struct dentry *dentry, struct qstr *qstr, toupper_t toupper, bool notruncate)
+__affs_hash_dentry(const struct dentry *dentry, struct qstr *qstr, toupper_t fn, bool notruncate)
{
const u8 *name = qstr->name;
unsigned long hash;
@@ -57,7 +57,7 @@ __affs_hash_dentry(const struct dentry *dentry, struct qstr *qstr, toupper_t tou
hash = init_name_hash(dentry);
len = min(qstr->len, AFFSNAMEMAX);
for (; len > 0; name++, len--)
- hash = partial_name_hash(toupper(*name), hash);
+ hash = partial_name_hash(fn(*name), hash);
qstr->hash = end_name_hash(hash);
return 0;
@@ -80,7 +80,7 @@ affs_intl_hash_dentry(const struct dentry *dentry, struct qstr *qstr)
}
static inline int __affs_compare_dentry(unsigned int len,
- const char *str, const struct qstr *name, toupper_t toupper,
+ const char *str, const struct qstr *name, toupper_t fn,
bool notruncate)
{
const u8 *aname = str;
@@ -106,7 +106,7 @@ static inline int __affs_compare_dentry(unsigned int len,
return 1;
for (; len > 0; len--)
- if (toupper(*aname++) != toupper(*bname++))
+ if (fn(*aname++) != fn(*bname++))
return 1;
return 0;
@@ -135,7 +135,7 @@ affs_intl_compare_dentry(const struct dentry *dentry,
*/
static inline int
-affs_match(struct dentry *dentry, const u8 *name2, toupper_t toupper)
+affs_match(struct dentry *dentry, const u8 *name2, toupper_t fn)
{
const u8 *name = dentry->d_name.name;
int len = dentry->d_name.len;
@@ -148,7 +148,7 @@ affs_match(struct dentry *dentry, const u8 *name2, toupper_t toupper)
return 0;
for (name2++; len > 0; len--)
- if (toupper(*name++) != toupper(*name2++))
+ if (fn(*name++) != fn(*name2++))
return 0;
return 1;
}
@@ -156,12 +156,12 @@ affs_match(struct dentry *dentry, const u8 *name2, toupper_t toupper)
int
affs_hash_name(struct super_block *sb, const u8 *name, unsigned int len)
{
- toupper_t toupper = affs_get_toupper(sb);
+ toupper_t fn = affs_get_toupper(sb);
u32 hash;
hash = len = min(len, AFFSNAMEMAX);
for (; len > 0; len--)
- hash = (hash * 13 + toupper(*name++)) & 0x7ff;
+ hash = (hash * 13 + fn(*name++)) & 0x7ff;
return hash % AFFS_SB(sb)->s_hashsize;
}
@@ -171,7 +171,7 @@ affs_find_entry(struct inode *dir, struct dentry *dentry)
{
struct super_block *sb = dir->i_sb;
struct buffer_head *bh;
- toupper_t toupper = affs_get_toupper(sb);
+ toupper_t fn = affs_get_toupper(sb);
u32 key;
pr_debug("%s(\"%pd\")\n", __func__, dentry);
@@ -189,7 +189,7 @@ affs_find_entry(struct inode *dir, struct dentry *dentry)
bh = affs_bread(sb, key);
if (!bh)
return ERR_PTR(-EIO);
- if (affs_match(dentry, AFFS_TAIL(sb, bh)->name, toupper))
+ if (affs_match(dentry, AFFS_TAIL(sb, bh)->name, fn))
return bh;
key = be32_to_cpu(AFFS_TAIL(sb, bh)->hash_chain);
}
diff --git a/fs/affs/symlink.c b/fs/affs/symlink.c
index 31d6446dc166..094aec8d17b8 100644
--- a/fs/affs/symlink.c
+++ b/fs/affs/symlink.c
@@ -13,10 +13,9 @@
static int affs_symlink_read_folio(struct file *file, struct folio *folio)
{
- struct page *page = &folio->page;
struct buffer_head *bh;
- struct inode *inode = page->mapping->host;
- char *link = page_address(page);
+ struct inode *inode = folio->mapping->host;
+ char *link = folio_address(folio);
struct slink_front *lf;
int i, j;
char c;
@@ -58,12 +57,11 @@ static int affs_symlink_read_folio(struct file *file, struct folio *folio)
}
link[i] = '\0';
affs_brelse(bh);
- SetPageUptodate(page);
- unlock_page(page);
+ folio_mark_uptodate(folio);
+ folio_unlock(folio);
return 0;
fail:
- SetPageError(page);
- unlock_page(page);
+ folio_unlock(folio);
return -EIO;
}
diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c
index d7d9402ff718..95bcbd7654d1 100644
--- a/fs/afs/dynroot.c
+++ b/fs/afs/dynroot.c
@@ -88,7 +88,7 @@ struct inode *afs_iget_pseudo_dir(struct super_block *sb, bool root)
set_nlink(inode, 2);
inode->i_uid = GLOBAL_ROOT_UID;
inode->i_gid = GLOBAL_ROOT_GID;
- inode->i_ctime = inode->i_atime = inode->i_mtime = current_time(inode);
+ inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
inode->i_blocks = 0;
inode->i_generation = 0;
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 866bab860a88..1c794a1896aa 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -90,7 +90,7 @@ static int afs_inode_init_from_status(struct afs_operation *op,
vnode->status = *status;
t = status->mtime_client;
- inode->i_ctime = t;
+ inode_set_ctime_to_ts(inode, t);
inode->i_mtime = t;
inode->i_atime = t;
inode->i_flags |= S_NOATIME;
@@ -206,7 +206,7 @@ static void afs_apply_status(struct afs_operation *op,
t = status->mtime_client;
inode->i_mtime = t;
if (vp->update_ctime)
- inode->i_ctime = op->ctime;
+ inode_set_ctime_to_ts(inode, op->ctime);
if (vnode->status.data_version != status->data_version)
data_changed = true;
@@ -252,7 +252,7 @@ static void afs_apply_status(struct afs_operation *op,
vnode->netfs.remote_i_size = status->size;
if (change_size) {
afs_set_i_size(vnode, status->size);
- inode->i_ctime = t;
+ inode_set_ctime_to_ts(inode, t);
inode->i_atime = t;
}
}
@@ -773,7 +773,7 @@ int afs_getattr(struct mnt_idmap *idmap, const struct path *path,
do {
read_seqbegin_or_lock(&vnode->cb_lock, &seq);
- generic_fillattr(&nop_mnt_idmap, inode, stat);
+ generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
if (test_bit(AFS_VNODE_SILLY_DELETED, &vnode->flags) &&
stat->nlink > 0)
stat->nlink -= 1;
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 9d3d64921106..da73b97e19a9 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -681,6 +681,8 @@ static inline void afs_vnode_set_cache(struct afs_vnode *vnode,
{
#ifdef CONFIG_AFS_FSCACHE
vnode->netfs.cache = cookie;
+ if (cookie)
+ mapping_set_release_always(vnode->netfs.inode.i_mapping);
#endif
}
diff --git a/fs/aio.c b/fs/aio.c
index 77e33619de40..f8589caef9c1 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -80,7 +80,7 @@ struct aio_ring {
struct kioctx_table {
struct rcu_head rcu;
unsigned nr;
- struct kioctx __rcu *table[];
+ struct kioctx __rcu *table[] __counted_by(nr);
};
struct kioctx_cpu {
@@ -558,7 +558,7 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events)
ctx->mmap_base = do_mmap(ctx->aio_ring_file, 0, ctx->mmap_size,
PROT_READ | PROT_WRITE,
- MAP_SHARED, 0, &unused, NULL);
+ MAP_SHARED, 0, 0, &unused, NULL);
mmap_write_unlock(mm);
if (IS_ERR((void *)ctx->mmap_base)) {
ctx->mmap_size = 0;
@@ -1447,13 +1447,8 @@ static void aio_complete_rw(struct kiocb *kiocb, long res)
if (kiocb->ki_flags & IOCB_WRITE) {
struct inode *inode = file_inode(kiocb->ki_filp);
- /*
- * Tell lockdep we inherited freeze protection from submission
- * thread.
- */
if (S_ISREG(inode->i_mode))
- __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
- file_end_write(kiocb->ki_filp);
+ kiocb_end_write(kiocb);
}
iocb->ki_res.res = res;
@@ -1581,17 +1576,8 @@ static int aio_write(struct kiocb *req, const struct iocb *iocb,
return ret;
ret = rw_verify_area(WRITE, file, &req->ki_pos, iov_iter_count(&iter));
if (!ret) {
- /*
- * Open-code file_start_write here to grab freeze protection,
- * which will be released by another thread in
- * aio_complete_rw(). Fool lockdep by telling it the lock got
- * released so that it doesn't complain about the held lock when
- * we return to userspace.
- */
- if (S_ISREG(file_inode(file)->i_mode)) {
- sb_start_write(file_inode(file)->i_sb);
- __sb_writers_release(file_inode(file)->i_sb, SB_FREEZE_WRITE);
- }
+ if (S_ISREG(file_inode(file)->i_mode))
+ kiocb_start_write(req);
req->ki_flags |= IOCB_WRITE;
aio_rw_done(req, call_write_iter(file, req, &iter));
}
diff --git a/fs/attr.c b/fs/attr.c
index d60dc1edb526..a8ae5f6d9b16 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -312,7 +312,7 @@ void setattr_copy(struct mnt_idmap *idmap, struct inode *inode,
if (ia_valid & ATTR_MTIME)
inode->i_mtime = attr->ia_mtime;
if (ia_valid & ATTR_CTIME)
- inode->i_ctime = attr->ia_ctime;
+ inode_set_ctime_to_ts(inode, attr->ia_ctime);
if (ia_valid & ATTR_MODE) {
umode_t mode = attr->ia_mode;
if (!in_group_or_capable(idmap, inode,
@@ -394,9 +394,25 @@ int notify_change(struct mnt_idmap *idmap, struct dentry *dentry,
return error;
if ((ia_valid & ATTR_MODE)) {
- umode_t amode = attr->ia_mode;
+ /*
+ * Don't allow changing the mode of symlinks:
+ *
+ * (1) The vfs doesn't take the mode of symlinks into account
+ * during permission checking.
+ * (2) This has never worked correctly. Most major filesystems
+ * did return EOPNOTSUPP due to interactions with POSIX ACLs
+ * but did still updated the mode of the symlink.
+ * This inconsistency led system call wrapper providers such
+ * as libc to block changing the mode of symlinks with
+ * EOPNOTSUPP already.
+ * (3) To even do this in the first place one would have to use
+ * specific file descriptors and quite some effort.
+ */
+ if (S_ISLNK(inode->i_mode))
+ return -EOPNOTSUPP;
+
/* Flag setting protected by i_mutex */
- if (is_sxid(amode))
+ if (is_sxid(attr->ia_mode))
inode->i_flags &= ~S_NOSEC;
}
diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c
index affa70360b1f..2b49662ed237 100644
--- a/fs/autofs/inode.c
+++ b/fs/autofs/inode.c
@@ -370,7 +370,7 @@ struct inode *autofs_get_inode(struct super_block *sb, umode_t mode)
inode->i_uid = d_inode(sb->s_root)->i_uid;
inode->i_gid = d_inode(sb->s_root)->i_gid;
}
- inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
inode->i_ino = get_next_ino();
if (S_ISDIR(mode)) {
diff --git a/fs/autofs/root.c b/fs/autofs/root.c
index 93046c9dc461..512b9a26c63d 100644
--- a/fs/autofs/root.c
+++ b/fs/autofs/root.c
@@ -600,7 +600,7 @@ static int autofs_dir_symlink(struct mnt_idmap *idmap,
p_ino = autofs_dentry_ino(dentry->d_parent);
p_ino->count++;
- dir->i_mtime = dir->i_ctime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
return 0;
}
@@ -633,7 +633,7 @@ static int autofs_dir_unlink(struct inode *dir, struct dentry *dentry)
d_inode(dentry)->i_size = 0;
clear_nlink(d_inode(dentry));
- dir->i_mtime = dir->i_ctime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
spin_lock(&sbi->lookup_lock);
__autofs_add_expiring(dentry);
@@ -749,7 +749,7 @@ static int autofs_dir_mkdir(struct mnt_idmap *idmap,
p_ino = autofs_dentry_ino(dentry->d_parent);
p_ino->count++;
inc_nlink(dir);
- dir->i_mtime = dir->i_ctime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
return 0;
}
diff --git a/fs/autofs/waitq.c b/fs/autofs/waitq.c
index 54c1f8b8b075..33dd4660d82f 100644
--- a/fs/autofs/waitq.c
+++ b/fs/autofs/waitq.c
@@ -32,8 +32,9 @@ void autofs_catatonic_mode(struct autofs_sb_info *sbi)
wq->status = -ENOENT; /* Magic is gone - report failure */
kfree(wq->name.name - wq->offset);
wq->name.name = NULL;
- wq->wait_ctr--;
- wake_up_interruptible(&wq->queue);
+ wake_up(&wq->queue);
+ if (!--wq->wait_ctr)
+ kfree(wq);
wq = nwq;
}
fput(sbi->pipe); /* Close the pipe */
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index db649487d58c..83f9566c973b 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -133,8 +133,7 @@ static int bad_inode_fiemap(struct inode *inode,
return -EIO;
}
-static int bad_inode_update_time(struct inode *inode, struct timespec64 *time,
- int flags)
+static int bad_inode_update_time(struct inode *inode, int flags)
{
return -EIO;
}
@@ -209,8 +208,7 @@ void make_bad_inode(struct inode *inode)
remove_inode_hash(inode);
inode->i_mode = S_IFREG;
- inode->i_atime = inode->i_mtime = inode->i_ctime =
- current_time(inode);
+ inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
inode->i_op = &bad_inode_ops;
inode->i_opflags &= ~IOP_XATTR;
inode->i_fop = &bad_file_ops;
diff --git a/fs/befs/Kconfig b/fs/befs/Kconfig
index 9550b6462b81..5fcfc4024ffe 100644
--- a/fs/befs/Kconfig
+++ b/fs/befs/Kconfig
@@ -2,6 +2,7 @@
config BEFS_FS
tristate "BeOS file system (BeFS) support (read only)"
depends on BLOCK
+ select BUFFER_HEAD
select NLS
help
The BeOS File System (BeFS) is the native file system of Be, Inc's
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index eee9237386e2..9a16a51fbb88 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -363,7 +363,7 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino)
inode->i_mtime.tv_sec =
fs64_to_cpu(sb, raw_inode->last_modified_time) >> 16;
inode->i_mtime.tv_nsec = 0; /* lower 16 bits are not a time */
- inode->i_ctime = inode->i_mtime;
+ inode_set_ctime_to_ts(inode, inode->i_mtime);
inode->i_atime = inode->i_mtime;
befs_ino->i_inode_num = fsrun_to_cpu(sb, raw_inode->inode_num);
diff --git a/fs/bfs/Kconfig b/fs/bfs/Kconfig
index 3a757805b585..8e7ef866b62a 100644
--- a/fs/bfs/Kconfig
+++ b/fs/bfs/Kconfig
@@ -2,6 +2,7 @@
config BFS_FS
tristate "BFS file system support"
depends on BLOCK
+ select BUFFER_HEAD
help
Boot File System (BFS) is a file system used under SCO UnixWare to
allow the bootloader access to the kernel image and other important
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index 040d5140e426..12b8af04dcb3 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -97,7 +97,7 @@ static int bfs_create(struct mnt_idmap *idmap, struct inode *dir,
set_bit(ino, info->si_imap);
info->si_freei--;
inode_init_owner(&nop_mnt_idmap, inode, dir, mode);
- inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
inode->i_blocks = 0;
inode->i_op = &bfs_file_inops;
inode->i_fop = &bfs_file_operations;
@@ -158,7 +158,7 @@ static int bfs_link(struct dentry *old, struct inode *dir,
return err;
}
inc_nlink(inode);
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
mark_inode_dirty(inode);
ihold(inode);
d_instantiate(new, inode);
@@ -187,9 +187,9 @@ static int bfs_unlink(struct inode *dir, struct dentry *dentry)
}
de->ino = 0;
mark_buffer_dirty_inode(bh, dir);
- dir->i_ctime = dir->i_mtime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
mark_inode_dirty(dir);
- inode->i_ctime = dir->i_ctime;
+ inode_set_ctime_to_ts(inode, inode_get_ctime(dir));
inode_dec_link_count(inode);
error = 0;
@@ -240,10 +240,10 @@ static int bfs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
goto end_rename;
}
old_de->ino = 0;
- old_dir->i_ctime = old_dir->i_mtime = current_time(old_dir);
+ old_dir->i_mtime = inode_set_ctime_current(old_dir);
mark_inode_dirty(old_dir);
if (new_inode) {
- new_inode->i_ctime = current_time(new_inode);
+ inode_set_ctime_current(new_inode);
inode_dec_link_count(new_inode);
}
mark_buffer_dirty_inode(old_bh, old_dir);
@@ -292,9 +292,9 @@ static int bfs_add_entry(struct inode *dir, const struct qstr *child, int ino)
pos = (block - sblock) * BFS_BSIZE + off;
if (pos >= dir->i_size) {
dir->i_size += BFS_DIRENT_SIZE;
- dir->i_ctime = current_time(dir);
+ inode_set_ctime_current(dir);
}
- dir->i_mtime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
mark_inode_dirty(dir);
de->ino = cpu_to_le16((u16)ino);
for (i = 0; i < BFS_NAMELEN; i++)
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 1926bec2c850..e6a76ae9eb44 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -82,10 +82,9 @@ struct inode *bfs_iget(struct super_block *sb, unsigned long ino)
inode->i_blocks = BFS_FILEBLOCKS(di);
inode->i_atime.tv_sec = le32_to_cpu(di->i_atime);
inode->i_mtime.tv_sec = le32_to_cpu(di->i_mtime);
- inode->i_ctime.tv_sec = le32_to_cpu(di->i_ctime);
+ inode_set_ctime(inode, le32_to_cpu(di->i_ctime), 0);
inode->i_atime.tv_nsec = 0;
inode->i_mtime.tv_nsec = 0;
- inode->i_ctime.tv_nsec = 0;
brelse(bh);
unlock_new_inode(inode);
@@ -143,7 +142,7 @@ static int bfs_write_inode(struct inode *inode, struct writeback_control *wbc)
di->i_nlink = cpu_to_le32(inode->i_nlink);
di->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
di->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
- di->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
+ di->i_ctime = cpu_to_le32(inode_get_ctime(inode).tv_sec);
i_sblock = BFS_I(inode)->i_sblock;
di->i_sblock = cpu_to_le32(i_sblock);
di->i_eblock = cpu_to_le32(BFS_I(inode)->i_eblock);
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 1c6c5832af86..206812ce544a 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -138,7 +138,7 @@ static int is_constdisp(struct elfhdr *hdr)
static int elf_fdpic_fetch_phdrs(struct elf_fdpic_params *params,
struct file *file)
{
- struct elf32_phdr *phdr;
+ struct elf_phdr *phdr;
unsigned long size;
int retval, loop;
loff_t pos = params->hdr.e_phoff;
@@ -345,10 +345,9 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm)
/* there's now no turning back... the old userspace image is dead,
* defunct, deceased, etc.
*/
+ SET_PERSONALITY(exec_params.hdr);
if (elf_check_fdpic(&exec_params.hdr))
- set_personality(PER_LINUX_FDPIC);
- else
- set_personality(PER_LINUX);
+ current->personality |= PER_LINUX_FDPIC;
if (elf_read_implies_exec(&exec_params.hdr, executable_stack))
current->personality |= READ_IMPLIES_EXEC;
@@ -560,8 +559,8 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
sp &= ~7UL;
/* stack the load map(s) */
- len = sizeof(struct elf32_fdpic_loadmap);
- len += sizeof(struct elf32_fdpic_loadseg) * exec_params->loadmap->nsegs;
+ len = sizeof(struct elf_fdpic_loadmap);
+ len += sizeof(struct elf_fdpic_loadseg) * exec_params->loadmap->nsegs;
sp = (sp - len) & ~7UL;
exec_params->map_addr = sp;
@@ -571,8 +570,8 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
current->mm->context.exec_fdpic_loadmap = (unsigned long) sp;
if (interp_params->loadmap) {
- len = sizeof(struct elf32_fdpic_loadmap);
- len += sizeof(struct elf32_fdpic_loadseg) *
+ len = sizeof(struct elf_fdpic_loadmap);
+ len += sizeof(struct elf_fdpic_loadseg) *
interp_params->loadmap->nsegs;
sp = (sp - len) & ~7UL;
interp_params->map_addr = sp;
@@ -740,13 +739,13 @@ static int elf_fdpic_map_file(struct elf_fdpic_params *params,
struct mm_struct *mm,
const char *what)
{
- struct elf32_fdpic_loadmap *loadmap;
+ struct elf_fdpic_loadmap *loadmap;
#ifdef CONFIG_MMU
- struct elf32_fdpic_loadseg *mseg;
+ struct elf_fdpic_loadseg *mseg;
unsigned long load_addr;
#endif
- struct elf32_fdpic_loadseg *seg;
- struct elf32_phdr *phdr;
+ struct elf_fdpic_loadseg *seg;
+ struct elf_phdr *phdr;
unsigned nloads, tmp;
unsigned long stop;
int loop, ret;
@@ -766,7 +765,7 @@ static int elf_fdpic_map_file(struct elf_fdpic_params *params,
params->loadmap = loadmap;
- loadmap->version = ELF32_FDPIC_LOADMAP_VERSION;
+ loadmap->version = ELF_FDPIC_LOADMAP_VERSION;
loadmap->nsegs = nloads;
/* map the requested LOADs into the memory space */
@@ -839,8 +838,8 @@ static int elf_fdpic_map_file(struct elf_fdpic_params *params,
if (phdr->p_vaddr >= seg->p_vaddr &&
phdr->p_vaddr + phdr->p_memsz <=
seg->p_vaddr + seg->p_memsz) {
- Elf32_Dyn __user *dyn;
- Elf32_Sword d_tag;
+ Elf_Dyn __user *dyn;
+ Elf_Sword d_tag;
params->dynamic_addr =
(phdr->p_vaddr - seg->p_vaddr) +
@@ -850,11 +849,11 @@ static int elf_fdpic_map_file(struct elf_fdpic_params *params,
* one item, and that the last item is a NULL
* entry */
if (phdr->p_memsz == 0 ||
- phdr->p_memsz % sizeof(Elf32_Dyn) != 0)
+ phdr->p_memsz % sizeof(Elf_Dyn) != 0)
goto dynamic_error;
- tmp = phdr->p_memsz / sizeof(Elf32_Dyn);
- dyn = (Elf32_Dyn __user *)params->dynamic_addr;
+ tmp = phdr->p_memsz / sizeof(Elf_Dyn);
+ dyn = (Elf_Dyn __user *)params->dynamic_addr;
if (get_user(d_tag, &dyn[tmp - 1].d_tag) ||
d_tag != 0)
goto dynamic_error;
@@ -923,8 +922,8 @@ static int elf_fdpic_map_file_constdisp_on_uclinux(
struct file *file,
struct mm_struct *mm)
{
- struct elf32_fdpic_loadseg *seg;
- struct elf32_phdr *phdr;
+ struct elf_fdpic_loadseg *seg;
+ struct elf_phdr *phdr;
unsigned long load_addr, base = ULONG_MAX, top = 0, maddr = 0;
int loop, ret;
@@ -1007,8 +1006,8 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
struct file *file,
struct mm_struct *mm)
{
- struct elf32_fdpic_loadseg *seg;
- struct elf32_phdr *phdr;
+ struct elf_fdpic_loadseg *seg;
+ struct elf_phdr *phdr;
unsigned long load_addr, delta_vaddr;
int loop, dvset;
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index bb202ad369d5..e0108d17b085 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -547,8 +547,7 @@ static struct inode *bm_get_inode(struct super_block *sb, int mode)
if (inode) {
inode->i_ino = get_next_ino();
inode->i_mode = mode;
- inode->i_atime = inode->i_mtime = inode->i_ctime =
- current_time(inode);
+ inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
}
return inode;
}
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 7c2ee0795dc7..90aaedce1548 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -1844,9 +1844,9 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans,
inode->i_mtime.tv_nsec);
btrfs_set_stack_timespec_sec(&inode_item->ctime,
- inode->i_ctime.tv_sec);
+ inode_get_ctime(inode).tv_sec);
btrfs_set_stack_timespec_nsec(&inode_item->ctime,
- inode->i_ctime.tv_nsec);
+ inode_get_ctime(inode).tv_nsec);
btrfs_set_stack_timespec_sec(&inode_item->otime,
BTRFS_I(inode)->i_otime.tv_sec);
@@ -1897,8 +1897,8 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev)
inode->i_mtime.tv_sec = btrfs_stack_timespec_sec(&inode_item->mtime);
inode->i_mtime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->mtime);
- inode->i_ctime.tv_sec = btrfs_stack_timespec_sec(&inode_item->ctime);
- inode->i_ctime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->ctime);
+ inode_set_ctime(inode, btrfs_stack_timespec_sec(&inode_item->ctime),
+ btrfs_stack_timespec_nsec(&inode_item->ctime));
BTRFS_I(inode)->i_otime.tv_sec =
btrfs_stack_timespec_sec(&inode_item->otime);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index e8726a83b649..361535c71c0f 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -876,9 +876,9 @@ static int prepare_uptodate_page(struct inode *inode,
return 0;
}
-static unsigned int get_prepare_fgp_flags(bool nowait)
+static fgf_t get_prepare_fgp_flags(bool nowait)
{
- unsigned int fgp_flags = FGP_LOCK | FGP_ACCESSED | FGP_CREAT;
+ fgf_t fgp_flags = FGP_LOCK | FGP_ACCESSED | FGP_CREAT;
if (nowait)
fgp_flags |= FGP_NOWAIT;
@@ -910,7 +910,7 @@ static noinline int prepare_pages(struct inode *inode, struct page **pages,
int i;
unsigned long index = pos >> PAGE_SHIFT;
gfp_t mask = get_prepare_gfp_flags(inode, nowait);
- unsigned int fgp_flags = get_prepare_fgp_flags(nowait);
+ fgf_t fgp_flags = get_prepare_fgp_flags(nowait);
int err = 0;
int faili;
@@ -1108,7 +1108,7 @@ void btrfs_check_nocow_unlock(struct btrfs_inode *inode)
static void update_time_for_write(struct inode *inode)
{
- struct timespec64 now;
+ struct timespec64 now, ctime;
if (IS_NOCMTIME(inode))
return;
@@ -1117,8 +1117,9 @@ static void update_time_for_write(struct inode *inode)
if (!timespec64_equal(&inode->i_mtime, &now))
inode->i_mtime = now;
- if (!timespec64_equal(&inode->i_ctime, &now))
- inode->i_ctime = now;
+ ctime = inode_get_ctime(inode);
+ if (!timespec64_equal(&ctime, &now))
+ inode_set_ctime_to_ts(inode, now);
if (IS_I_VERSION(inode))
inode_inc_iversion(inode);
@@ -2471,10 +2472,8 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
*/
inode_inc_iversion(&inode->vfs_inode);
- if (!extent_info || extent_info->update_times) {
- inode->vfs_inode.i_mtime = current_time(&inode->vfs_inode);
- inode->vfs_inode.i_ctime = inode->vfs_inode.i_mtime;
- }
+ if (!extent_info || extent_info->update_times)
+ inode->vfs_inode.i_mtime = inode_set_ctime_current(&inode->vfs_inode);
ret = btrfs_update_inode(trans, root, inode);
if (ret)
@@ -2715,8 +2714,7 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len)
ASSERT(trans != NULL);
inode_inc_iversion(inode);
- inode->i_mtime = current_time(inode);
- inode->i_ctime = inode->i_mtime;
+ inode->i_mtime = inode_set_ctime_current(inode);
ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
updated_inode = true;
btrfs_end_transaction(trans);
@@ -2733,11 +2731,10 @@ out_only_mutex:
* for detecting, at fsync time, if the inode isn't yet in the
* log tree or it's there but not up to date.
*/
- struct timespec64 now = current_time(inode);
+ struct timespec64 now = inode_set_ctime_current(inode);
inode_inc_iversion(inode);
inode->i_mtime = now;
- inode->i_ctime = now;
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
@@ -2808,7 +2805,7 @@ static int btrfs_fallocate_update_isize(struct inode *inode,
if (IS_ERR(trans))
return PTR_ERR(trans);
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
i_size_write(inode, end);
btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 5e1b15e69ed6..7814b9d654ce 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3760,8 +3760,8 @@ static int btrfs_read_locked_inode(struct inode *inode,
inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->mtime);
inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->mtime);
- inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->ctime);
- inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->ctime);
+ inode_set_ctime(inode, btrfs_timespec_sec(leaf, &inode_item->ctime),
+ btrfs_timespec_nsec(leaf, &inode_item->ctime));
BTRFS_I(inode)->i_otime.tv_sec =
btrfs_timespec_sec(leaf, &inode_item->otime);
@@ -3932,9 +3932,9 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
inode->i_mtime.tv_nsec);
btrfs_set_token_timespec_sec(&token, &item->ctime,
- inode->i_ctime.tv_sec);
+ inode_get_ctime(inode).tv_sec);
btrfs_set_token_timespec_nsec(&token, &item->ctime,
- inode->i_ctime.tv_nsec);
+ inode_get_ctime(inode).tv_nsec);
btrfs_set_token_timespec_sec(&token, &item->otime,
BTRFS_I(inode)->i_otime.tv_sec);
@@ -4132,9 +4132,8 @@ err:
btrfs_i_size_write(dir, dir->vfs_inode.i_size - name->len * 2);
inode_inc_iversion(&inode->vfs_inode);
inode_inc_iversion(&dir->vfs_inode);
- inode->vfs_inode.i_ctime = current_time(&inode->vfs_inode);
- dir->vfs_inode.i_mtime = inode->vfs_inode.i_ctime;
- dir->vfs_inode.i_ctime = inode->vfs_inode.i_ctime;
+ inode_set_ctime_current(&inode->vfs_inode);
+ dir->vfs_inode.i_mtime = inode_set_ctime_current(&dir->vfs_inode);
ret = btrfs_update_inode(trans, root, dir);
out:
return ret;
@@ -4307,8 +4306,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
btrfs_i_size_write(dir, dir->vfs_inode.i_size - fname.disk_name.len * 2);
inode_inc_iversion(&dir->vfs_inode);
- dir->vfs_inode.i_mtime = current_time(&dir->vfs_inode);
- dir->vfs_inode.i_ctime = dir->vfs_inode.i_mtime;
+ dir->vfs_inode.i_mtime = inode_set_ctime_current(&dir->vfs_inode);
ret = btrfs_update_inode_fallback(trans, root, dir);
if (ret)
btrfs_abort_transaction(trans, ret);
@@ -4958,8 +4956,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
if (newsize != oldsize) {
inode_inc_iversion(inode);
if (!(mask & (ATTR_CTIME | ATTR_MTIME))) {
- inode->i_mtime = current_time(inode);
- inode->i_ctime = inode->i_mtime;
+ inode->i_mtime = inode_set_ctime_current(inode);
}
}
@@ -5603,9 +5600,8 @@ static struct inode *new_simple_dir(struct inode *dir,
inode->i_opflags &= ~IOP_XATTR;
inode->i_fop = &simple_dir_operations;
inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
- inode->i_mtime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
inode->i_atime = dir->i_atime;
- inode->i_ctime = dir->i_ctime;
BTRFS_I(inode)->i_otime = inode->i_mtime;
inode->i_uid = dir->i_uid;
inode->i_gid = dir->i_gid;
@@ -6025,8 +6021,7 @@ static int btrfs_dirty_inode(struct btrfs_inode *inode)
* This is a copy of file_update_time. We need this so we can return error on
* ENOSPC for updating the inode in the case of file write and mmap writes.
*/
-static int btrfs_update_time(struct inode *inode, struct timespec64 *now,
- int flags)
+static int btrfs_update_time(struct inode *inode, int flags)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
bool dirty = flags & ~S_VERSION;
@@ -6034,14 +6029,7 @@ static int btrfs_update_time(struct inode *inode, struct timespec64 *now,
if (btrfs_root_readonly(root))
return -EROFS;
- if (flags & S_VERSION)
- dirty |= inode_maybe_inc_iversion(inode, dirty);
- if (flags & S_CTIME)
- inode->i_ctime = *now;
- if (flags & S_MTIME)
- inode->i_mtime = *now;
- if (flags & S_ATIME)
- inode->i_atime = *now;
+ dirty = inode_update_timestamps(inode, flags);
return dirty ? btrfs_dirty_inode(BTRFS_I(inode)) : 0;
}
@@ -6289,9 +6277,8 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
goto discard;
}
- inode->i_mtime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
inode->i_atime = inode->i_mtime;
- inode->i_ctime = inode->i_mtime;
BTRFS_I(inode)->i_otime = inode->i_mtime;
/*
@@ -6456,12 +6443,10 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
* log replay procedure is responsible for setting them to their correct
* values (the ones it had when the fsync was done).
*/
- if (!test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags)) {
- struct timespec64 now = current_time(&parent_inode->vfs_inode);
+ if (!test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags))
+ parent_inode->vfs_inode.i_mtime =
+ inode_set_ctime_current(&parent_inode->vfs_inode);
- parent_inode->vfs_inode.i_mtime = now;
- parent_inode->vfs_inode.i_ctime = now;
- }
ret = btrfs_update_inode(trans, root, parent_inode);
if (ret)
btrfs_abort_transaction(trans, ret);
@@ -6601,7 +6586,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
BTRFS_I(inode)->dir_index = 0ULL;
inc_nlink(inode);
inode_inc_iversion(inode);
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
ihold(inode);
set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
@@ -8667,7 +8652,7 @@ static int btrfs_getattr(struct mnt_idmap *idmap,
STATX_ATTR_IMMUTABLE |
STATX_ATTR_NODUMP);
- generic_fillattr(idmap, inode, stat);
+ generic_fillattr(idmap, request_mask, inode, stat);
stat->dev = BTRFS_I(inode)->root->anon_dev;
spin_lock(&BTRFS_I(inode)->lock);
@@ -8691,7 +8676,6 @@ static int btrfs_rename_exchange(struct inode *old_dir,
struct btrfs_root *dest = BTRFS_I(new_dir)->root;
struct inode *new_inode = new_dentry->d_inode;
struct inode *old_inode = old_dentry->d_inode;
- struct timespec64 ctime = current_time(old_inode);
struct btrfs_rename_ctx old_rename_ctx;
struct btrfs_rename_ctx new_rename_ctx;
u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
@@ -8822,12 +8806,7 @@ static int btrfs_rename_exchange(struct inode *old_dir,
inode_inc_iversion(new_dir);
inode_inc_iversion(old_inode);
inode_inc_iversion(new_inode);
- old_dir->i_mtime = ctime;
- old_dir->i_ctime = ctime;
- new_dir->i_mtime = ctime;
- new_dir->i_ctime = ctime;
- old_inode->i_ctime = ctime;
- new_inode->i_ctime = ctime;
+ simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
if (old_dentry->d_parent != new_dentry->d_parent) {
btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
@@ -9091,11 +9070,7 @@ static int btrfs_rename(struct mnt_idmap *idmap,
inode_inc_iversion(old_dir);
inode_inc_iversion(new_dir);
inode_inc_iversion(old_inode);
- old_dir->i_mtime = current_time(old_dir);
- old_dir->i_ctime = old_dir->i_mtime;
- new_dir->i_mtime = old_dir->i_mtime;
- new_dir->i_ctime = old_dir->i_mtime;
- old_inode->i_ctime = old_dir->i_mtime;
+ simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
if (old_dentry->d_parent != new_dentry->d_parent)
btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
@@ -9117,7 +9092,6 @@ static int btrfs_rename(struct mnt_idmap *idmap,
if (new_inode) {
inode_inc_iversion(new_inode);
- new_inode->i_ctime = current_time(new_inode);
if (unlikely(btrfs_ino(BTRFS_I(new_inode)) ==
BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry);
@@ -9652,7 +9626,7 @@ next:
*alloc_hint = ins.objectid + ins.offset;
inode_inc_iversion(inode);
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
if (!(mode & FALLOC_FL_KEEP_SIZE) &&
(actual_len > inode->i_size) &&
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index bf35b6fce8f0..8e7d03bc1b56 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -384,7 +384,7 @@ update_flags:
binode->flags = binode_flags;
btrfs_sync_inode_flags_to_i_flags(inode);
inode_inc_iversion(inode);
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
out_end_trans:
diff --git a/fs/btrfs/misc.h b/fs/btrfs/misc.h
index 005751a12911..40f2d9f1a17a 100644
--- a/fs/btrfs/misc.h
+++ b/fs/btrfs/misc.h
@@ -8,8 +8,6 @@
#include <linux/math64.h>
#include <linux/rbtree.h>
-#define in_range(b, first, len) ((b) >= (first) && (b) < (first) + (len))
-
/*
* Enumerate bits using enum autoincrement. Define the @name as the n-th bit.
*/
diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c
index 0474bbe39da7..65d2bd6910f2 100644
--- a/fs/btrfs/reflink.c
+++ b/fs/btrfs/reflink.c
@@ -30,8 +30,7 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
inode_inc_iversion(inode);
if (!no_time_update) {
- inode->i_mtime = current_time(inode);
- inode->i_ctime = inode->i_mtime;
+ inode->i_mtime = inode_set_ctime_current(inode);
}
/*
* We round up to the block size at eof when determining which
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 3b60e56e5e02..c780d3729463 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1860,8 +1860,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
btrfs_i_size_write(BTRFS_I(parent_inode), parent_inode->i_size +
fname.disk_name.len * 2);
- parent_inode->i_mtime = current_time(parent_inode);
- parent_inode->i_ctime = parent_inode->i_mtime;
+ parent_inode->i_mtime = inode_set_ctime_current(parent_inode);
ret = btrfs_update_inode_fallback(trans, parent_root, BTRFS_I(parent_inode));
if (ret) {
btrfs_abort_transaction(trans, ret);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 537eb3de8809..cbb17b542131 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -4148,9 +4148,9 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
inode->i_mtime.tv_nsec);
btrfs_set_token_timespec_sec(&token, &item->ctime,
- inode->i_ctime.tv_sec);
+ inode_get_ctime(inode).tv_sec);
btrfs_set_token_timespec_nsec(&token, &item->ctime,
- inode->i_ctime.tv_nsec);
+ inode_get_ctime(inode).tv_nsec);
/*
* We do not need to set the nbytes field, in fact during a fast fsync
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 43fee429834a..b9ef6f54635c 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1911,15 +1911,13 @@ out:
static void update_dev_time(const char *device_path)
{
struct path path;
- struct timespec64 now;
int ret;
ret = kern_path(device_path, LOOKUP_FOLLOW, &path);
if (ret)
return;
- now = current_time(d_inode(path.dentry));
- inode_update_time(d_inode(path.dentry), &now, S_MTIME | S_CTIME | S_VERSION);
+ inode_update_time(d_inode(path.dentry), S_MTIME | S_CTIME | S_VERSION);
path_put(&path);
}
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index fc4b20c2688a..96828a13dd43 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -264,7 +264,7 @@ int btrfs_setxattr_trans(struct inode *inode, const char *name,
goto out;
inode_inc_iversion(inode);
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
if (ret)
btrfs_abort_transaction(trans, ret);
@@ -407,7 +407,7 @@ static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler,
ret = btrfs_set_prop(trans, inode, name, value, size, flags);
if (!ret) {
inode_inc_iversion(inode);
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
if (ret)
btrfs_abort_transaction(trans, ret);
diff --git a/fs/buffer.c b/fs/buffer.c
index bd091329026c..12e9a71c693d 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -49,6 +49,7 @@
#include <trace/events/block.h>
#include <linux/fscrypt.h>
#include <linux/fsverity.h>
+#include <linux/sched/isolation.h>
#include "internal.h"
@@ -562,12 +563,6 @@ repeat:
return err;
}
-void emergency_thaw_bdev(struct super_block *sb)
-{
- while (sb->s_bdev && !thaw_bdev(sb->s_bdev))
- printk(KERN_WARNING "Emergency Thaw on %pg\n", sb->s_bdev);
-}
-
/**
* sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers
* @mapping: the mapping which wants those buffers written
@@ -1225,19 +1220,14 @@ EXPORT_SYMBOL(mark_buffer_dirty);
void mark_buffer_write_io_error(struct buffer_head *bh)
{
- struct super_block *sb;
-
set_buffer_write_io_error(bh);
/* FIXME: do we need to set this in both places? */
if (bh->b_folio && bh->b_folio->mapping)
mapping_set_error(bh->b_folio->mapping, -EIO);
- if (bh->b_assoc_map)
+ if (bh->b_assoc_map) {
mapping_set_error(bh->b_assoc_map, -EIO);
- rcu_read_lock();
- sb = READ_ONCE(bh->b_bdev->bd_super);
- if (sb)
- errseq_set(&sb->s_wb_err, -EIO);
- rcu_read_unlock();
+ errseq_set(&bh->b_assoc_map->host->i_sb->s_wb_err, -EIO);
+ }
}
EXPORT_SYMBOL(mark_buffer_write_io_error);
@@ -1352,7 +1342,7 @@ static void bh_lru_install(struct buffer_head *bh)
* failing page migration.
* Skip putting upcoming bh into bh_lru until migration is done.
*/
- if (lru_cache_disabled()) {
+ if (lru_cache_disabled() || cpu_is_isolated(smp_processor_id())) {
bh_lru_unlock();
return;
}
@@ -1382,6 +1372,10 @@ lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
check_irqs_on();
bh_lru_lock();
+ if (cpu_is_isolated(smp_processor_id())) {
+ bh_lru_unlock();
+ return NULL;
+ }
for (i = 0; i < BH_LRU_SIZE; i++) {
struct buffer_head *bh = __this_cpu_read(bh_lrus.bhs[i]);
@@ -1539,21 +1533,6 @@ void invalidate_bh_lrus_cpu(void)
bh_lru_unlock();
}
-void set_bh_page(struct buffer_head *bh,
- struct page *page, unsigned long offset)
-{
- bh->b_page = page;
- BUG_ON(offset >= PAGE_SIZE);
- if (PageHighMem(page))
- /*
- * This catches illegal uses and preserves the offset:
- */
- bh->b_data = (char *)(0 + offset);
- else
- bh->b_data = page_address(page) + offset;
-}
-EXPORT_SYMBOL(set_bh_page);
-
void folio_set_bh(struct buffer_head *bh, struct folio *folio,
unsigned long offset)
{
@@ -2032,7 +2011,7 @@ void folio_zero_new_buffers(struct folio *folio, size_t from, size_t to)
}
EXPORT_SYMBOL(folio_zero_new_buffers);
-static void
+static int
iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh,
const struct iomap *iomap)
{
@@ -2046,7 +2025,8 @@ iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh,
* current block, then do not map the buffer and let the caller
* handle it.
*/
- BUG_ON(offset >= iomap->offset + iomap->length);
+ if (offset >= iomap->offset + iomap->length)
+ return -EIO;
switch (iomap->type) {
case IOMAP_HOLE:
@@ -2058,7 +2038,7 @@ iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh,
if (!buffer_uptodate(bh) ||
(offset >= i_size_read(inode)))
set_buffer_new(bh);
- break;
+ return 0;
case IOMAP_DELALLOC:
if (!buffer_uptodate(bh) ||
(offset >= i_size_read(inode)))
@@ -2066,7 +2046,7 @@ iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh,
set_buffer_uptodate(bh);
set_buffer_mapped(bh);
set_buffer_delay(bh);
- break;
+ return 0;
case IOMAP_UNWRITTEN:
/*
* For unwritten regions, we always need to ensure that regions
@@ -2078,12 +2058,24 @@ iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh,
fallthrough;
case IOMAP_MAPPED:
if ((iomap->flags & IOMAP_F_NEW) ||
- offset >= i_size_read(inode))
+ offset >= i_size_read(inode)) {
+ /*
+ * This can happen if truncating the block device races
+ * with the check in the caller as i_size updates on
+ * block devices aren't synchronized by i_rwsem for
+ * block devices.
+ */
+ if (S_ISBLK(inode->i_mode))
+ return -EIO;
set_buffer_new(bh);
+ }
bh->b_blocknr = (iomap->addr + offset - iomap->offset) >>
inode->i_blkbits;
set_buffer_mapped(bh);
- break;
+ return 0;
+ default:
+ WARN_ON_ONCE(1);
+ return -EIO;
}
}
@@ -2124,13 +2116,12 @@ int __block_write_begin_int(struct folio *folio, loff_t pos, unsigned len,
clear_buffer_new(bh);
if (!buffer_mapped(bh)) {
WARN_ON(bh->b_size != blocksize);
- if (get_block) {
+ if (get_block)
err = get_block(inode, block, bh, 1);
- if (err)
- break;
- } else {
- iomap_to_bh(inode, block, bh, iomap);
- }
+ else
+ err = iomap_to_bh(inode, block, bh, iomap);
+ if (err)
+ break;
if (buffer_new(bh)) {
clean_bdev_bh_alias(bh);
@@ -2180,8 +2171,7 @@ int __block_write_begin(struct page *page, loff_t pos, unsigned len,
}
EXPORT_SYMBOL(__block_write_begin);
-static int __block_commit_write(struct inode *inode, struct folio *folio,
- size_t from, size_t to)
+static void __block_commit_write(struct folio *folio, size_t from, size_t to)
{
size_t block_start, block_end;
bool partial = false;
@@ -2216,7 +2206,6 @@ static int __block_commit_write(struct inode *inode, struct folio *folio,
*/
if (!partial)
folio_mark_uptodate(folio);
- return 0;
}
/*
@@ -2253,7 +2242,6 @@ int block_write_end(struct file *file, struct address_space *mapping,
struct page *page, void *fsdata)
{
struct folio *folio = page_folio(page);
- struct inode *inode = mapping->host;
size_t start = pos - folio_pos(folio);
if (unlikely(copied < len)) {
@@ -2277,7 +2265,7 @@ int block_write_end(struct file *file, struct address_space *mapping,
flush_dcache_folio(folio);
/* This could be a short (even 0-length) commit */
- __block_commit_write(inode, folio, start, start + copied);
+ __block_commit_write(folio, start, start + copied);
return copied;
}
@@ -2598,12 +2586,10 @@ int cont_write_begin(struct file *file, struct address_space *mapping,
}
EXPORT_SYMBOL(cont_write_begin);
-int block_commit_write(struct page *page, unsigned from, unsigned to)
+void block_commit_write(struct page *page, unsigned from, unsigned to)
{
struct folio *folio = page_folio(page);
- struct inode *inode = folio->mapping->host;
- __block_commit_write(inode, folio, from, to);
- return 0;
+ __block_commit_write(folio, from, to);
}
EXPORT_SYMBOL(block_commit_write);
@@ -2649,11 +2635,11 @@ int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
end = size - folio_pos(folio);
ret = __block_write_begin_int(folio, 0, end, get_block, NULL);
- if (!ret)
- ret = __block_commit_write(inode, folio, 0, end);
-
- if (unlikely(ret < 0))
+ if (unlikely(ret))
goto out_unlock;
+
+ __block_commit_write(folio, 0, end);
+
folio_mark_dirty(folio);
folio_wait_stable(folio);
return 0;
diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c
index 175a25fcade8..009d23cd435b 100644
--- a/fs/cachefiles/io.c
+++ b/fs/cachefiles/io.c
@@ -259,9 +259,7 @@ static void cachefiles_write_complete(struct kiocb *iocb, long ret)
_enter("%ld", ret);
- /* Tell lockdep we inherited freeze protection from submission thread */
- __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
- __sb_end_write(inode->i_sb, SB_FREEZE_WRITE);
+ kiocb_end_write(iocb);
if (ret < 0)
trace_cachefiles_io_error(object, inode, ret,
@@ -286,7 +284,6 @@ int __cachefiles_write(struct cachefiles_object *object,
{
struct cachefiles_cache *cache;
struct cachefiles_kiocb *ki;
- struct inode *inode;
unsigned int old_nofs;
ssize_t ret;
size_t len = iov_iter_count(iter);
@@ -322,19 +319,12 @@ int __cachefiles_write(struct cachefiles_object *object,
ki->iocb.ki_complete = cachefiles_write_complete;
atomic_long_add(ki->b_writing, &cache->b_writing);
- /* Open-code file_start_write here to grab freeze protection, which
- * will be released by another thread in aio_complete_rw(). Fool
- * lockdep by telling it the lock got released so that it doesn't
- * complain about the held lock when we return to userspace.
- */
- inode = file_inode(file);
- __sb_start_write(inode->i_sb, SB_FREEZE_WRITE);
- __sb_writers_release(inode->i_sb, SB_FREEZE_WRITE);
+ kiocb_start_write(&ki->iocb);
get_file(ki->iocb.ki_filp);
cachefiles_grab_object(object, cachefiles_obj_get_ioreq);
- trace_cachefiles_write(object, inode, ki->iocb.ki_pos, len);
+ trace_cachefiles_write(object, file_inode(file), ki->iocb.ki_pos, len);
old_nofs = memalloc_nofs_save();
ret = cachefiles_inject_write_error();
if (ret == 0)
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index d9d22d0ec38a..7bf7a5fcc045 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -585,6 +585,8 @@ static bool cachefiles_open_file(struct cachefiles_object *object,
if (ret < 0)
goto check_failed;
+ clear_bit(FSCACHE_COOKIE_NO_DATA_TO_READ, &object->cookie->flags);
+
object->file = file;
/* Always update the atime on an object we've just looked up (this is
diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile
index 50c635dc7f71..1f77ca04c426 100644
--- a/fs/ceph/Makefile
+++ b/fs/ceph/Makefile
@@ -12,3 +12,4 @@ ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \
ceph-$(CONFIG_CEPH_FSCACHE) += cache.o
ceph-$(CONFIG_CEPH_FS_POSIX_ACL) += acl.o
+ceph-$(CONFIG_FS_ENCRYPTION) += crypto.o
diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c
index 6945a938d396..c53a1d220622 100644
--- a/fs/ceph/acl.c
+++ b/fs/ceph/acl.c
@@ -93,7 +93,7 @@ int ceph_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
char *value = NULL;
struct iattr newattrs;
struct inode *inode = d_inode(dentry);
- struct timespec64 old_ctime = inode->i_ctime;
+ struct timespec64 old_ctime = inode_get_ctime(inode);
umode_t new_mode = inode->i_mode, old_mode = inode->i_mode;
if (ceph_snap(inode) != CEPH_NOSNAP) {
@@ -140,7 +140,7 @@ int ceph_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
newattrs.ia_ctime = current_time(inode);
newattrs.ia_mode = new_mode;
newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
- ret = __ceph_setattr(inode, &newattrs);
+ ret = __ceph_setattr(inode, &newattrs, NULL);
if (ret)
goto out_free;
}
@@ -151,7 +151,7 @@ int ceph_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
newattrs.ia_ctime = old_ctime;
newattrs.ia_mode = old_mode;
newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
- __ceph_setattr(inode, &newattrs);
+ __ceph_setattr(inode, &newattrs, NULL);
}
goto out_free;
}
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 59cbfb80edbd..f4863078f7fe 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -18,6 +18,7 @@
#include "mds_client.h"
#include "cache.h"
#include "metric.h"
+#include "crypto.h"
#include <linux/ceph/osd_client.h>
#include <linux/ceph/striper.h>
@@ -242,11 +243,13 @@ static bool ceph_netfs_clamp_length(struct netfs_io_subrequest *subreq)
static void finish_netfs_read(struct ceph_osd_request *req)
{
- struct ceph_fs_client *fsc = ceph_inode_to_client(req->r_inode);
+ struct inode *inode = req->r_inode;
+ struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0);
struct netfs_io_subrequest *subreq = req->r_priv;
- int num_pages;
+ struct ceph_osd_req_op *op = &req->r_ops[0];
int err = req->r_result;
+ bool sparse = (op->op == CEPH_OSD_OP_SPARSE_READ);
ceph_update_read_metrics(&fsc->mdsc->metric, req->r_start_latency,
req->r_end_latency, osd_data->length, err);
@@ -260,14 +263,29 @@ static void finish_netfs_read(struct ceph_osd_request *req)
else if (err == -EBLOCKLISTED)
fsc->blocklisted = true;
- if (err >= 0 && err < subreq->len)
- __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
+ if (err >= 0) {
+ if (sparse && err > 0)
+ err = ceph_sparse_ext_map_end(op);
+ if (err < subreq->len)
+ __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
+ if (IS_ENCRYPTED(inode) && err > 0) {
+ err = ceph_fscrypt_decrypt_extents(inode,
+ osd_data->pages, subreq->start,
+ op->extent.sparse_ext,
+ op->extent.sparse_ext_cnt);
+ if (err > subreq->len)
+ err = subreq->len;
+ }
+ }
+ if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES) {
+ ceph_put_page_vector(osd_data->pages,
+ calc_pages_for(osd_data->alignment,
+ osd_data->length), false);
+ }
netfs_subreq_terminated(subreq, err, false);
-
- num_pages = calc_pages_for(osd_data->alignment, osd_data->length);
- ceph_put_page_vector(osd_data->pages, num_pages, false);
iput(req->r_inode);
+ ceph_dec_osd_stopping_blocker(fsc->mdsc);
}
static bool ceph_netfs_issue_op_inline(struct netfs_io_subrequest *subreq)
@@ -334,10 +352,10 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
struct ceph_osd_request *req = NULL;
struct ceph_vino vino = ceph_vino(inode);
struct iov_iter iter;
- struct page **pages;
- size_t page_off;
int err = 0;
u64 len = subreq->len;
+ bool sparse = IS_ENCRYPTED(inode) || ceph_test_mount_opt(fsc, SPARSEREAD);
+ u64 off = subreq->start;
if (ceph_inode_is_shutdown(inode)) {
err = -EIO;
@@ -347,8 +365,10 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
if (ceph_has_inline_data(ci) && ceph_netfs_issue_op_inline(subreq))
return;
- req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, subreq->start, &len,
- 0, 1, CEPH_OSD_OP_READ,
+ ceph_fscrypt_adjust_off_and_len(inode, &off, &len);
+
+ req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino,
+ off, &len, 0, 1, sparse ? CEPH_OSD_OP_SPARSE_READ : CEPH_OSD_OP_READ,
CEPH_OSD_FLAG_READ | fsc->client->osdc.client->options->read_from_replica,
NULL, ci->i_truncate_seq, ci->i_truncate_size, false);
if (IS_ERR(req)) {
@@ -357,20 +377,48 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
goto out;
}
+ if (sparse) {
+ err = ceph_alloc_sparse_ext_map(&req->r_ops[0]);
+ if (err)
+ goto out;
+ }
+
dout("%s: pos=%llu orig_len=%zu len=%llu\n", __func__, subreq->start, subreq->len, len);
+
iov_iter_xarray(&iter, ITER_DEST, &rreq->mapping->i_pages, subreq->start, len);
- err = iov_iter_get_pages_alloc2(&iter, &pages, len, &page_off);
- if (err < 0) {
- dout("%s: iov_ter_get_pages_alloc returned %d\n", __func__, err);
- goto out;
- }
- /* should always give us a page-aligned read */
- WARN_ON_ONCE(page_off);
- len = err;
- err = 0;
+ /*
+ * FIXME: For now, use CEPH_OSD_DATA_TYPE_PAGES instead of _ITER for
+ * encrypted inodes. We'd need infrastructure that handles an iov_iter
+ * instead of page arrays, and we don't have that as of yet. Once the
+ * dust settles on the write helpers and encrypt/decrypt routines for
+ * netfs, we should be able to rework this.
+ */
+ if (IS_ENCRYPTED(inode)) {
+ struct page **pages;
+ size_t page_off;
+
+ err = iov_iter_get_pages_alloc2(&iter, &pages, len, &page_off);
+ if (err < 0) {
+ dout("%s: iov_ter_get_pages_alloc returned %d\n",
+ __func__, err);
+ goto out;
+ }
+
+ /* should always give us a page-aligned read */
+ WARN_ON_ONCE(page_off);
+ len = err;
+ err = 0;
- osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false, false);
+ osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false,
+ false);
+ } else {
+ osd_req_op_extent_osd_iter(req, 0, &iter);
+ }
+ if (!ceph_inc_osd_stopping_blocker(fsc->mdsc)) {
+ err = -EIO;
+ goto out;
+ }
req->r_callback = finish_netfs_read;
req->r_priv = subreq;
req->r_inode = inode;
@@ -571,10 +619,12 @@ static u64 get_writepages_data_length(struct inode *inode,
struct page *page, u64 start)
{
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_snap_context *snapc = page_snap_context(page);
+ struct ceph_snap_context *snapc;
struct ceph_cap_snap *capsnap = NULL;
u64 end = i_size_read(inode);
+ u64 ret;
+ snapc = page_snap_context(ceph_fscrypt_pagecache_page(page));
if (snapc != ci->i_head_snapc) {
bool found = false;
spin_lock(&ci->i_ceph_lock);
@@ -589,9 +639,12 @@ static u64 get_writepages_data_length(struct inode *inode,
spin_unlock(&ci->i_ceph_lock);
WARN_ON(!found);
}
- if (end > page_offset(page) + thp_size(page))
- end = page_offset(page) + thp_size(page);
- return end > start ? end - start : 0;
+ if (end > ceph_fscrypt_page_offset(page) + thp_size(page))
+ end = ceph_fscrypt_page_offset(page) + thp_size(page);
+ ret = end > start ? end - start : 0;
+ if (ret && fscrypt_is_bounce_page(page))
+ ret = round_up(ret, CEPH_FSCRYPT_BLOCK_SIZE);
+ return ret;
}
/*
@@ -610,10 +663,12 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
loff_t page_off = page_offset(page);
int err;
loff_t len = thp_size(page);
+ loff_t wlen;
struct ceph_writeback_ctl ceph_wbc;
struct ceph_osd_client *osdc = &fsc->client->osdc;
struct ceph_osd_request *req;
bool caching = ceph_is_cache_enabled(inode);
+ struct page *bounce_page = NULL;
dout("writepage %p idx %lu\n", page, page->index);
@@ -649,31 +704,51 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
if (ceph_wbc.i_size < page_off + len)
len = ceph_wbc.i_size - page_off;
+ wlen = IS_ENCRYPTED(inode) ? round_up(len, CEPH_FSCRYPT_BLOCK_SIZE) : len;
dout("writepage %p page %p index %lu on %llu~%llu snapc %p seq %lld\n",
- inode, page, page->index, page_off, len, snapc, snapc->seq);
+ inode, page, page->index, page_off, wlen, snapc, snapc->seq);
if (atomic_long_inc_return(&fsc->writeback_count) >
CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb))
fsc->write_congested = true;
- req = ceph_osdc_new_request(osdc, &ci->i_layout, ceph_vino(inode), page_off, &len, 0, 1,
- CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, snapc,
- ceph_wbc.truncate_seq, ceph_wbc.truncate_size,
- true);
+ req = ceph_osdc_new_request(osdc, &ci->i_layout, ceph_vino(inode),
+ page_off, &wlen, 0, 1, CEPH_OSD_OP_WRITE,
+ CEPH_OSD_FLAG_WRITE, snapc,
+ ceph_wbc.truncate_seq,
+ ceph_wbc.truncate_size, true);
if (IS_ERR(req)) {
redirty_page_for_writepage(wbc, page);
return PTR_ERR(req);
}
+ if (wlen < len)
+ len = wlen;
+
set_page_writeback(page);
if (caching)
ceph_set_page_fscache(page);
ceph_fscache_write_to_cache(inode, page_off, len, caching);
+ if (IS_ENCRYPTED(inode)) {
+ bounce_page = fscrypt_encrypt_pagecache_blocks(page,
+ CEPH_FSCRYPT_BLOCK_SIZE, 0,
+ GFP_NOFS);
+ if (IS_ERR(bounce_page)) {
+ redirty_page_for_writepage(wbc, page);
+ end_page_writeback(page);
+ ceph_osdc_put_request(req);
+ return PTR_ERR(bounce_page);
+ }
+ }
+
/* it may be a short write due to an object boundary */
WARN_ON_ONCE(len > thp_size(page));
- osd_req_op_extent_osd_data_pages(req, 0, &page, len, 0, false, false);
- dout("writepage %llu~%llu (%llu bytes)\n", page_off, len, len);
+ osd_req_op_extent_osd_data_pages(req, 0,
+ bounce_page ? &bounce_page : &page, wlen, 0,
+ false, false);
+ dout("writepage %llu~%llu (%llu bytes, %sencrypted)\n",
+ page_off, len, wlen, IS_ENCRYPTED(inode) ? "" : "not ");
req->r_mtime = inode->i_mtime;
ceph_osdc_start_request(osdc, req);
@@ -681,7 +756,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency,
req->r_end_latency, len, err);
-
+ fscrypt_free_bounce_page(bounce_page);
ceph_osdc_put_request(req);
if (err == 0)
err = len;
@@ -800,6 +875,11 @@ static void writepages_finish(struct ceph_osd_request *req)
total_pages += num_pages;
for (j = 0; j < num_pages; j++) {
page = osd_data->pages[j];
+ if (fscrypt_is_bounce_page(page)) {
+ page = fscrypt_pagecache_page(page);
+ fscrypt_free_bounce_page(osd_data->pages[j]);
+ osd_data->pages[j] = page;
+ }
BUG_ON(!page);
WARN_ON(!PageUptodate(page));
@@ -835,6 +915,7 @@ static void writepages_finish(struct ceph_osd_request *req)
else
kfree(osd_data->pages);
ceph_osdc_put_request(req);
+ ceph_dec_osd_stopping_blocker(fsc->mdsc);
}
/*
@@ -1070,9 +1151,28 @@ get_more_pages:
fsc->mount_options->congestion_kb))
fsc->write_congested = true;
- pages[locked_pages++] = page;
- fbatch.folios[i] = NULL;
+ if (IS_ENCRYPTED(inode)) {
+ pages[locked_pages] =
+ fscrypt_encrypt_pagecache_blocks(page,
+ PAGE_SIZE, 0,
+ locked_pages ? GFP_NOWAIT : GFP_NOFS);
+ if (IS_ERR(pages[locked_pages])) {
+ if (PTR_ERR(pages[locked_pages]) == -EINVAL)
+ pr_err("%s: inode->i_blkbits=%hhu\n",
+ __func__, inode->i_blkbits);
+ /* better not fail on first page! */
+ BUG_ON(locked_pages == 0);
+ pages[locked_pages] = NULL;
+ redirty_page_for_writepage(wbc, page);
+ unlock_page(page);
+ break;
+ }
+ ++locked_pages;
+ } else {
+ pages[locked_pages++] = page;
+ }
+ fbatch.folios[i] = NULL;
len += thp_size(page);
}
@@ -1100,7 +1200,7 @@ get_more_pages:
}
new_request:
- offset = page_offset(pages[0]);
+ offset = ceph_fscrypt_page_offset(pages[0]);
len = wsize;
req = ceph_osdc_new_request(&fsc->client->osdc,
@@ -1121,9 +1221,13 @@ new_request:
ceph_wbc.truncate_size, true);
BUG_ON(IS_ERR(req));
}
- BUG_ON(len < page_offset(pages[locked_pages - 1]) +
- thp_size(page) - offset);
+ BUG_ON(len < ceph_fscrypt_page_offset(pages[locked_pages - 1]) +
+ thp_size(pages[locked_pages - 1]) - offset);
+ if (!ceph_inc_osd_stopping_blocker(fsc->mdsc)) {
+ rc = -EIO;
+ goto release_folios;
+ }
req->r_callback = writepages_finish;
req->r_inode = inode;
@@ -1132,7 +1236,9 @@ new_request:
data_pages = pages;
op_idx = 0;
for (i = 0; i < locked_pages; i++) {
- u64 cur_offset = page_offset(pages[i]);
+ struct page *page = ceph_fscrypt_pagecache_page(pages[i]);
+
+ u64 cur_offset = page_offset(page);
/*
* Discontinuity in page range? Ceph can handle that by just passing
* multiple extents in the write op.
@@ -1161,9 +1267,9 @@ new_request:
op_idx++;
}
- set_page_writeback(pages[i]);
+ set_page_writeback(page);
if (caching)
- ceph_set_page_fscache(pages[i]);
+ ceph_set_page_fscache(page);
len += thp_size(page);
}
ceph_fscache_write_to_cache(inode, offset, len, caching);
@@ -1179,8 +1285,16 @@ new_request:
offset);
len = max(len, min_len);
}
+ if (IS_ENCRYPTED(inode))
+ len = round_up(len, CEPH_FSCRYPT_BLOCK_SIZE);
+
dout("writepages got pages at %llu~%llu\n", offset, len);
+ if (IS_ENCRYPTED(inode) &&
+ ((offset | len) & ~CEPH_FSCRYPT_BLOCK_MASK))
+ pr_warn("%s: bad encrypted write offset=%lld len=%llu\n",
+ __func__, offset, len);
+
osd_req_op_extent_osd_data_pages(req, op_idx, data_pages, len,
0, from_pool, false);
osd_req_op_extent_update(req, op_idx, len);
diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c
index 177d8e8d73fe..de1dee46d3df 100644
--- a/fs/ceph/cache.c
+++ b/fs/ceph/cache.c
@@ -36,6 +36,8 @@ void ceph_fscache_register_inode_cookie(struct inode *inode)
&ci->i_vino, sizeof(ci->i_vino),
&ci->i_version, sizeof(ci->i_version),
i_size_read(inode));
+ if (ci->netfs.cache)
+ mapping_set_release_always(inode->i_mapping);
}
void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info *ci)
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index e2bb0d0072da..14215ec646f7 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -14,6 +14,7 @@
#include "super.h"
#include "mds_client.h"
#include "cache.h"
+#include "crypto.h"
#include <linux/ceph/decode.h>
#include <linux/ceph/messenger.h>
@@ -1216,15 +1217,11 @@ struct cap_msg_args {
umode_t mode;
bool inline_data;
bool wake;
+ bool encrypted;
+ u32 fscrypt_auth_len;
+ u8 fscrypt_auth[sizeof(struct ceph_fscrypt_auth)]; // for context
};
-/*
- * cap struct size + flock buffer size + inline version + inline data size +
- * osd_epoch_barrier + oldest_flush_tid
- */
-#define CAP_MSG_SIZE (sizeof(struct ceph_mds_caps) + \
- 4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 4)
-
/* Marshal up the cap msg to the MDS */
static void encode_cap_msg(struct ceph_msg *msg, struct cap_msg_args *arg)
{
@@ -1240,7 +1237,7 @@ static void encode_cap_msg(struct ceph_msg *msg, struct cap_msg_args *arg)
arg->size, arg->max_size, arg->xattr_version,
arg->xattr_buf ? (int)arg->xattr_buf->vec.iov_len : 0);
- msg->hdr.version = cpu_to_le16(10);
+ msg->hdr.version = cpu_to_le16(12);
msg->hdr.tid = cpu_to_le64(arg->flush_tid);
fc = msg->front.iov_base;
@@ -1257,7 +1254,13 @@ static void encode_cap_msg(struct ceph_msg *msg, struct cap_msg_args *arg)
fc->ino = cpu_to_le64(arg->ino);
fc->snap_follows = cpu_to_le64(arg->follows);
- fc->size = cpu_to_le64(arg->size);
+#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
+ if (arg->encrypted)
+ fc->size = cpu_to_le64(round_up(arg->size,
+ CEPH_FSCRYPT_BLOCK_SIZE));
+ else
+#endif
+ fc->size = cpu_to_le64(arg->size);
fc->max_size = cpu_to_le64(arg->max_size);
ceph_encode_timespec64(&fc->mtime, &arg->mtime);
ceph_encode_timespec64(&fc->atime, &arg->atime);
@@ -1311,6 +1314,27 @@ static void encode_cap_msg(struct ceph_msg *msg, struct cap_msg_args *arg)
/* Advisory flags (version 10) */
ceph_encode_32(&p, arg->flags);
+
+ /* dirstats (version 11) - these are r/o on the client */
+ ceph_encode_64(&p, 0);
+ ceph_encode_64(&p, 0);
+
+#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
+ /*
+ * fscrypt_auth and fscrypt_file (version 12)
+ *
+ * fscrypt_auth holds the crypto context (if any). fscrypt_file
+ * tracks the real i_size as an __le64 field (and we use a rounded-up
+ * i_size in the traditional size field).
+ */
+ ceph_encode_32(&p, arg->fscrypt_auth_len);
+ ceph_encode_copy(&p, arg->fscrypt_auth, arg->fscrypt_auth_len);
+ ceph_encode_32(&p, sizeof(__le64));
+ ceph_encode_64(&p, arg->size);
+#else /* CONFIG_FS_ENCRYPTION */
+ ceph_encode_32(&p, 0);
+ ceph_encode_32(&p, 0);
+#endif /* CONFIG_FS_ENCRYPTION */
}
/*
@@ -1378,7 +1402,6 @@ static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap,
arg->follows = flushing ? ci->i_head_snapc->seq : 0;
arg->flush_tid = flush_tid;
arg->oldest_flush_tid = oldest_flush_tid;
-
arg->size = i_size_read(inode);
ci->i_reported_size = arg->size;
arg->max_size = ci->i_wanted_max_size;
@@ -1400,7 +1423,7 @@ static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap,
arg->mtime = inode->i_mtime;
arg->atime = inode->i_atime;
- arg->ctime = inode->i_ctime;
+ arg->ctime = inode_get_ctime(inode);
arg->btime = ci->i_btime;
arg->change_attr = inode_peek_iversion_raw(inode);
@@ -1432,8 +1455,39 @@ static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap,
}
}
arg->flags = flags;
+ arg->encrypted = IS_ENCRYPTED(inode);
+#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
+ if (ci->fscrypt_auth_len &&
+ WARN_ON_ONCE(ci->fscrypt_auth_len > sizeof(struct ceph_fscrypt_auth))) {
+ /* Don't set this if it's too big */
+ arg->fscrypt_auth_len = 0;
+ } else {
+ arg->fscrypt_auth_len = ci->fscrypt_auth_len;
+ memcpy(arg->fscrypt_auth, ci->fscrypt_auth,
+ min_t(size_t, ci->fscrypt_auth_len,
+ sizeof(arg->fscrypt_auth)));
+ }
+#endif /* CONFIG_FS_ENCRYPTION */
}
+#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
+#define CAP_MSG_FIXED_FIELDS (sizeof(struct ceph_mds_caps) + \
+ 4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 4 + 8 + 8 + 4 + 4 + 8)
+
+static inline int cap_msg_size(struct cap_msg_args *arg)
+{
+ return CAP_MSG_FIXED_FIELDS + arg->fscrypt_auth_len;
+}
+#else
+#define CAP_MSG_FIXED_FIELDS (sizeof(struct ceph_mds_caps) + \
+ 4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 4 + 8 + 8 + 4 + 4)
+
+static inline int cap_msg_size(struct cap_msg_args *arg)
+{
+ return CAP_MSG_FIXED_FIELDS;
+}
+#endif /* CONFIG_FS_ENCRYPTION */
+
/*
* Send a cap msg on the given inode.
*
@@ -1444,7 +1498,8 @@ static void __send_cap(struct cap_msg_args *arg, struct ceph_inode_info *ci)
struct ceph_msg *msg;
struct inode *inode = &ci->netfs.inode;
- msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, CAP_MSG_SIZE, GFP_NOFS, false);
+ msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, cap_msg_size(arg), GFP_NOFS,
+ false);
if (!msg) {
pr_err("error allocating cap msg: ino (%llx.%llx) flushing %s tid %llu, requeuing cap.\n",
ceph_vinop(inode), ceph_cap_string(arg->dirty),
@@ -1470,10 +1525,6 @@ static inline int __send_flush_snap(struct inode *inode,
struct cap_msg_args arg;
struct ceph_msg *msg;
- msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, CAP_MSG_SIZE, GFP_NOFS, false);
- if (!msg)
- return -ENOMEM;
-
arg.session = session;
arg.ino = ceph_vino(inode).ino;
arg.cid = 0;
@@ -1510,6 +1561,15 @@ static inline int __send_flush_snap(struct inode *inode,
arg.inline_data = capsnap->inline_data;
arg.flags = 0;
arg.wake = false;
+ arg.encrypted = IS_ENCRYPTED(inode);
+
+ /* No fscrypt_auth changes from a capsnap.*/
+ arg.fscrypt_auth_len = 0;
+
+ msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, cap_msg_size(&arg),
+ GFP_NOFS, false);
+ if (!msg)
+ return -ENOMEM;
encode_cap_msg(msg, &arg);
ceph_con_send(&arg.session->s_con, msg);
@@ -2900,10 +2960,9 @@ int ceph_try_get_caps(struct inode *inode, int need, int want,
* due to a small max_size, make sure we check_max_size (and possibly
* ask the mds) so we don't get hung up indefinitely.
*/
-int ceph_get_caps(struct file *filp, int need, int want, loff_t endoff, int *got)
+int __ceph_get_caps(struct inode *inode, struct ceph_file_info *fi, int need,
+ int want, loff_t endoff, int *got)
{
- struct ceph_file_info *fi = filp->private_data;
- struct inode *inode = file_inode(filp);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
int ret, _got, flags;
@@ -2912,7 +2971,7 @@ int ceph_get_caps(struct file *filp, int need, int want, loff_t endoff, int *got
if (ret < 0)
return ret;
- if ((fi->fmode & CEPH_FILE_MODE_WR) &&
+ if (fi && (fi->fmode & CEPH_FILE_MODE_WR) &&
fi->filp_gen != READ_ONCE(fsc->filp_gen))
return -EBADF;
@@ -2965,7 +3024,7 @@ int ceph_get_caps(struct file *filp, int need, int want, loff_t endoff, int *got
continue;
}
- if ((fi->fmode & CEPH_FILE_MODE_WR) &&
+ if (fi && (fi->fmode & CEPH_FILE_MODE_WR) &&
fi->filp_gen != READ_ONCE(fsc->filp_gen)) {
if (ret >= 0 && _got)
ceph_put_cap_refs(ci, _got);
@@ -3028,6 +3087,15 @@ int ceph_get_caps(struct file *filp, int need, int want, loff_t endoff, int *got
return 0;
}
+int ceph_get_caps(struct file *filp, int need, int want, loff_t endoff,
+ int *got)
+{
+ struct ceph_file_info *fi = filp->private_data;
+ struct inode *inode = file_inode(filp);
+
+ return __ceph_get_caps(inode, fi, need, want, endoff, got);
+}
+
/*
* Take cap refs. Caller must already know we hold at least one ref
* on the caps in question or we don't know this is safe.
@@ -3323,6 +3391,9 @@ struct cap_extra_info {
/* currently issued */
int issued;
struct timespec64 btime;
+ u8 *fscrypt_auth;
+ u32 fscrypt_auth_len;
+ u64 fscrypt_file_size;
};
/*
@@ -3355,6 +3426,14 @@ static void handle_cap_grant(struct inode *inode,
bool deleted_inode = false;
bool fill_inline = false;
+ /*
+ * If there is at least one crypto block then we'll trust
+ * fscrypt_file_size. If the real length of the file is 0, then
+ * ignore it (it has probably been truncated down to 0 by the MDS).
+ */
+ if (IS_ENCRYPTED(inode) && size)
+ size = extra_info->fscrypt_file_size;
+
dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n",
inode, cap, session->s_mds, seq, ceph_cap_string(newcaps));
dout(" size %llu max_size %llu, i_size %llu\n", size, max_size,
@@ -3421,6 +3500,14 @@ static void handle_cap_grant(struct inode *inode,
dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
from_kuid(&init_user_ns, inode->i_uid),
from_kgid(&init_user_ns, inode->i_gid));
+#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
+ if (ci->fscrypt_auth_len != extra_info->fscrypt_auth_len ||
+ memcmp(ci->fscrypt_auth, extra_info->fscrypt_auth,
+ ci->fscrypt_auth_len))
+ pr_warn_ratelimited("%s: cap grant attempt to change fscrypt_auth on non-I_NEW inode (old len %d new len %d)\n",
+ __func__, ci->fscrypt_auth_len,
+ extra_info->fscrypt_auth_len);
+#endif
}
if ((newcaps & CEPH_CAP_LINK_SHARED) &&
@@ -3837,7 +3924,8 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
*/
static bool handle_cap_trunc(struct inode *inode,
struct ceph_mds_caps *trunc,
- struct ceph_mds_session *session)
+ struct ceph_mds_session *session,
+ struct cap_extra_info *extra_info)
{
struct ceph_inode_info *ci = ceph_inode(inode);
int mds = session->s_mds;
@@ -3854,8 +3942,16 @@ static bool handle_cap_trunc(struct inode *inode,
issued |= implemented | dirty;
- dout("handle_cap_trunc inode %p mds%d seq %d to %lld seq %d\n",
- inode, mds, seq, truncate_size, truncate_seq);
+ /*
+ * If there is at least one crypto block then we'll trust
+ * fscrypt_file_size. If the real length of the file is 0, then
+ * ignore it (it has probably been truncated down to 0 by the MDS).
+ */
+ if (IS_ENCRYPTED(inode) && size)
+ size = extra_info->fscrypt_file_size;
+
+ dout("%s inode %p mds%d seq %d to %lld truncate seq %d\n",
+ __func__, inode, mds, seq, truncate_size, truncate_seq);
queue_trunc = ceph_fill_file_size(inode, issued,
truncate_seq, truncate_size, size);
return queue_trunc;
@@ -4075,6 +4171,52 @@ retry:
*target_cap = cap;
}
+#ifdef CONFIG_FS_ENCRYPTION
+static int parse_fscrypt_fields(void **p, void *end,
+ struct cap_extra_info *extra)
+{
+ u32 len;
+
+ ceph_decode_32_safe(p, end, extra->fscrypt_auth_len, bad);
+ if (extra->fscrypt_auth_len) {
+ ceph_decode_need(p, end, extra->fscrypt_auth_len, bad);
+ extra->fscrypt_auth = kmalloc(extra->fscrypt_auth_len,
+ GFP_KERNEL);
+ if (!extra->fscrypt_auth)
+ return -ENOMEM;
+ ceph_decode_copy_safe(p, end, extra->fscrypt_auth,
+ extra->fscrypt_auth_len, bad);
+ }
+
+ ceph_decode_32_safe(p, end, len, bad);
+ if (len >= sizeof(u64)) {
+ ceph_decode_64_safe(p, end, extra->fscrypt_file_size, bad);
+ len -= sizeof(u64);
+ }
+ ceph_decode_skip_n(p, end, len, bad);
+ return 0;
+bad:
+ return -EIO;
+}
+#else
+static int parse_fscrypt_fields(void **p, void *end,
+ struct cap_extra_info *extra)
+{
+ u32 len;
+
+ /* Don't care about these fields unless we're encryption-capable */
+ ceph_decode_32_safe(p, end, len, bad);
+ if (len)
+ ceph_decode_skip_n(p, end, len, bad);
+ ceph_decode_32_safe(p, end, len, bad);
+ if (len)
+ ceph_decode_skip_n(p, end, len, bad);
+ return 0;
+bad:
+ return -EIO;
+}
+#endif
+
/*
* Handle a caps message from the MDS.
*
@@ -4105,6 +4247,9 @@ void ceph_handle_caps(struct ceph_mds_session *session,
dout("handle_caps from mds%d\n", session->s_mds);
+ if (!ceph_inc_mds_stopping_blocker(mdsc, session))
+ return;
+
/* decode */
end = msg->front.iov_base + msg->front.iov_len;
if (msg->front.iov_len < sizeof(*h))
@@ -4195,13 +4340,17 @@ void ceph_handle_caps(struct ceph_mds_session *session,
ceph_decode_64_safe(&p, end, extra_info.nsubdirs, bad);
}
+ if (msg_version >= 12) {
+ if (parse_fscrypt_fields(&p, end, &extra_info))
+ goto bad;
+ }
+
/* lookup ino */
inode = ceph_find_inode(mdsc->fsc->sb, vino);
dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino,
vino.snap, inode);
mutex_lock(&session->s_mutex);
- inc_session_sequence(session);
dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq,
(unsigned)seq);
@@ -4292,7 +4441,8 @@ void ceph_handle_caps(struct ceph_mds_session *session,
break;
case CEPH_CAP_OP_TRUNC:
- queue_trunc = handle_cap_trunc(inode, h, session);
+ queue_trunc = handle_cap_trunc(inode, h, session,
+ &extra_info);
spin_unlock(&ci->i_ceph_lock);
if (queue_trunc)
ceph_queue_vmtruncate(inode);
@@ -4309,12 +4459,15 @@ done:
done_unlocked:
iput(inode);
out:
+ ceph_dec_mds_stopping_blocker(mdsc);
+
ceph_put_string(extra_info.pool_ns);
/* Defer closing the sessions after s_mutex lock being released */
if (close_sessions)
ceph_mdsc_close_sessions(mdsc);
+ kfree(extra_info.fscrypt_auth);
return;
flush_cap_releases:
@@ -4611,6 +4764,18 @@ int ceph_encode_inode_release(void **p, struct inode *inode,
return ret;
}
+/**
+ * ceph_encode_dentry_release - encode a dentry release into an outgoing request
+ * @p: outgoing request buffer
+ * @dentry: dentry to release
+ * @dir: dir to release it from
+ * @mds: mds that we're speaking to
+ * @drop: caps being dropped
+ * @unless: unless we have these caps
+ *
+ * Encode a dentry release into an outgoing request buffer. Returns 1 if the
+ * thing was released, or a negative error code otherwise.
+ */
int ceph_encode_dentry_release(void **p, struct dentry *dentry,
struct inode *dir,
int mds, int drop, int unless)
@@ -4643,13 +4808,25 @@ int ceph_encode_dentry_release(void **p, struct dentry *dentry,
if (ret && di->lease_session && di->lease_session->s_mds == mds) {
dout("encode_dentry_release %p mds%d seq %d\n",
dentry, mds, (int)di->lease_seq);
- rel->dname_len = cpu_to_le32(dentry->d_name.len);
- memcpy(*p, dentry->d_name.name, dentry->d_name.len);
- *p += dentry->d_name.len;
rel->dname_seq = cpu_to_le32(di->lease_seq);
__ceph_mdsc_drop_dentry_lease(dentry);
+ spin_unlock(&dentry->d_lock);
+ if (IS_ENCRYPTED(dir) && fscrypt_has_encryption_key(dir)) {
+ int ret2 = ceph_encode_encrypted_fname(dir, dentry, *p);
+
+ if (ret2 < 0)
+ return ret2;
+
+ rel->dname_len = cpu_to_le32(ret2);
+ *p += ret2;
+ } else {
+ rel->dname_len = cpu_to_le32(dentry->d_name.len);
+ memcpy(*p, dentry->d_name.name, dentry->d_name.len);
+ *p += dentry->d_name.len;
+ }
+ } else {
+ spin_unlock(&dentry->d_lock);
}
- spin_unlock(&dentry->d_lock);
return ret;
}
diff --git a/fs/ceph/crypto.c b/fs/ceph/crypto.c
new file mode 100644
index 000000000000..5b5112c78462
--- /dev/null
+++ b/fs/ceph/crypto.c
@@ -0,0 +1,671 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * The base64 encode/decode code was copied from fscrypt:
+ * Copyright (C) 2015, Google, Inc.
+ * Copyright (C) 2015, Motorola Mobility
+ * Written by Uday Savagaonkar, 2014.
+ * Modified by Jaegeuk Kim, 2015.
+ */
+#include <linux/ceph/ceph_debug.h>
+#include <linux/xattr.h>
+#include <linux/fscrypt.h>
+#include <linux/ceph/striper.h>
+
+#include "super.h"
+#include "mds_client.h"
+#include "crypto.h"
+
+/*
+ * The base64url encoding used by fscrypt includes the '_' character, which may
+ * cause problems in snapshot names (which can not start with '_'). Thus, we
+ * used the base64 encoding defined for IMAP mailbox names (RFC 3501) instead,
+ * which replaces '-' and '_' by '+' and ','.
+ */
+static const char base64_table[65] =
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,";
+
+int ceph_base64_encode(const u8 *src, int srclen, char *dst)
+{
+ u32 ac = 0;
+ int bits = 0;
+ int i;
+ char *cp = dst;
+
+ for (i = 0; i < srclen; i++) {
+ ac = (ac << 8) | src[i];
+ bits += 8;
+ do {
+ bits -= 6;
+ *cp++ = base64_table[(ac >> bits) & 0x3f];
+ } while (bits >= 6);
+ }
+ if (bits)
+ *cp++ = base64_table[(ac << (6 - bits)) & 0x3f];
+ return cp - dst;
+}
+
+int ceph_base64_decode(const char *src, int srclen, u8 *dst)
+{
+ u32 ac = 0;
+ int bits = 0;
+ int i;
+ u8 *bp = dst;
+
+ for (i = 0; i < srclen; i++) {
+ const char *p = strchr(base64_table, src[i]);
+
+ if (p == NULL || src[i] == 0)
+ return -1;
+ ac = (ac << 6) | (p - base64_table);
+ bits += 6;
+ if (bits >= 8) {
+ bits -= 8;
+ *bp++ = (u8)(ac >> bits);
+ }
+ }
+ if (ac & ((1 << bits) - 1))
+ return -1;
+ return bp - dst;
+}
+
+static int ceph_crypt_get_context(struct inode *inode, void *ctx, size_t len)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_fscrypt_auth *cfa = (struct ceph_fscrypt_auth *)ci->fscrypt_auth;
+ u32 ctxlen;
+
+ /* Non existent or too short? */
+ if (!cfa || (ci->fscrypt_auth_len < (offsetof(struct ceph_fscrypt_auth, cfa_blob) + 1)))
+ return -ENOBUFS;
+
+ /* Some format we don't recognize? */
+ if (le32_to_cpu(cfa->cfa_version) != CEPH_FSCRYPT_AUTH_VERSION)
+ return -ENOBUFS;
+
+ ctxlen = le32_to_cpu(cfa->cfa_blob_len);
+ if (len < ctxlen)
+ return -ERANGE;
+
+ memcpy(ctx, cfa->cfa_blob, ctxlen);
+ return ctxlen;
+}
+
+static int ceph_crypt_set_context(struct inode *inode, const void *ctx,
+ size_t len, void *fs_data)
+{
+ int ret;
+ struct iattr attr = { };
+ struct ceph_iattr cia = { };
+ struct ceph_fscrypt_auth *cfa;
+
+ WARN_ON_ONCE(fs_data);
+
+ if (len > FSCRYPT_SET_CONTEXT_MAX_SIZE)
+ return -EINVAL;
+
+ cfa = kzalloc(sizeof(*cfa), GFP_KERNEL);
+ if (!cfa)
+ return -ENOMEM;
+
+ cfa->cfa_version = cpu_to_le32(CEPH_FSCRYPT_AUTH_VERSION);
+ cfa->cfa_blob_len = cpu_to_le32(len);
+ memcpy(cfa->cfa_blob, ctx, len);
+
+ cia.fscrypt_auth = cfa;
+
+ ret = __ceph_setattr(inode, &attr, &cia);
+ if (ret == 0)
+ inode_set_flags(inode, S_ENCRYPTED, S_ENCRYPTED);
+ kfree(cia.fscrypt_auth);
+ return ret;
+}
+
+static bool ceph_crypt_empty_dir(struct inode *inode)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+
+ return ci->i_rsubdirs + ci->i_rfiles == 1;
+}
+
+static const union fscrypt_policy *ceph_get_dummy_policy(struct super_block *sb)
+{
+ return ceph_sb_to_client(sb)->fsc_dummy_enc_policy.policy;
+}
+
+static struct fscrypt_operations ceph_fscrypt_ops = {
+ .get_context = ceph_crypt_get_context,
+ .set_context = ceph_crypt_set_context,
+ .get_dummy_policy = ceph_get_dummy_policy,
+ .empty_dir = ceph_crypt_empty_dir,
+};
+
+void ceph_fscrypt_set_ops(struct super_block *sb)
+{
+ fscrypt_set_ops(sb, &ceph_fscrypt_ops);
+}
+
+void ceph_fscrypt_free_dummy_policy(struct ceph_fs_client *fsc)
+{
+ fscrypt_free_dummy_policy(&fsc->fsc_dummy_enc_policy);
+}
+
+int ceph_fscrypt_prepare_context(struct inode *dir, struct inode *inode,
+ struct ceph_acl_sec_ctx *as)
+{
+ int ret, ctxsize;
+ bool encrypted = false;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+
+ ret = fscrypt_prepare_new_inode(dir, inode, &encrypted);
+ if (ret)
+ return ret;
+ if (!encrypted)
+ return 0;
+
+ as->fscrypt_auth = kzalloc(sizeof(*as->fscrypt_auth), GFP_KERNEL);
+ if (!as->fscrypt_auth)
+ return -ENOMEM;
+
+ ctxsize = fscrypt_context_for_new_inode(as->fscrypt_auth->cfa_blob,
+ inode);
+ if (ctxsize < 0)
+ return ctxsize;
+
+ as->fscrypt_auth->cfa_version = cpu_to_le32(CEPH_FSCRYPT_AUTH_VERSION);
+ as->fscrypt_auth->cfa_blob_len = cpu_to_le32(ctxsize);
+
+ WARN_ON_ONCE(ci->fscrypt_auth);
+ kfree(ci->fscrypt_auth);
+ ci->fscrypt_auth_len = ceph_fscrypt_auth_len(as->fscrypt_auth);
+ ci->fscrypt_auth = kmemdup(as->fscrypt_auth, ci->fscrypt_auth_len,
+ GFP_KERNEL);
+ if (!ci->fscrypt_auth)
+ return -ENOMEM;
+
+ inode->i_flags |= S_ENCRYPTED;
+
+ return 0;
+}
+
+void ceph_fscrypt_as_ctx_to_req(struct ceph_mds_request *req,
+ struct ceph_acl_sec_ctx *as)
+{
+ swap(req->r_fscrypt_auth, as->fscrypt_auth);
+}
+
+/*
+ * User-created snapshots can't start with '_'. Snapshots that start with this
+ * character are special (hint: there aren't real snapshots) and use the
+ * following format:
+ *
+ * _<SNAPSHOT-NAME>_<INODE-NUMBER>
+ *
+ * where:
+ * - <SNAPSHOT-NAME> - the real snapshot name that may need to be decrypted,
+ * - <INODE-NUMBER> - the inode number (in decimal) for the actual snapshot
+ *
+ * This function parses these snapshot names and returns the inode
+ * <INODE-NUMBER>. 'name_len' will also bet set with the <SNAPSHOT-NAME>
+ * length.
+ */
+static struct inode *parse_longname(const struct inode *parent,
+ const char *name, int *name_len)
+{
+ struct inode *dir = NULL;
+ struct ceph_vino vino = { .snap = CEPH_NOSNAP };
+ char *inode_number;
+ char *name_end;
+ int orig_len = *name_len;
+ int ret = -EIO;
+
+ /* Skip initial '_' */
+ name++;
+ name_end = strrchr(name, '_');
+ if (!name_end) {
+ dout("Failed to parse long snapshot name: %s\n", name);
+ return ERR_PTR(-EIO);
+ }
+ *name_len = (name_end - name);
+ if (*name_len <= 0) {
+ pr_err("Failed to parse long snapshot name\n");
+ return ERR_PTR(-EIO);
+ }
+
+ /* Get the inode number */
+ inode_number = kmemdup_nul(name_end + 1,
+ orig_len - *name_len - 2,
+ GFP_KERNEL);
+ if (!inode_number)
+ return ERR_PTR(-ENOMEM);
+ ret = kstrtou64(inode_number, 10, &vino.ino);
+ if (ret) {
+ dout("Failed to parse inode number: %s\n", name);
+ dir = ERR_PTR(ret);
+ goto out;
+ }
+
+ /* And finally the inode */
+ dir = ceph_find_inode(parent->i_sb, vino);
+ if (!dir) {
+ /* This can happen if we're not mounting cephfs on the root */
+ dir = ceph_get_inode(parent->i_sb, vino, NULL);
+ if (IS_ERR(dir))
+ dout("Can't find inode %s (%s)\n", inode_number, name);
+ }
+
+out:
+ kfree(inode_number);
+ return dir;
+}
+
+int ceph_encode_encrypted_dname(struct inode *parent, struct qstr *d_name,
+ char *buf)
+{
+ struct inode *dir = parent;
+ struct qstr iname;
+ u32 len;
+ int name_len;
+ int elen;
+ int ret;
+ u8 *cryptbuf = NULL;
+
+ iname.name = d_name->name;
+ name_len = d_name->len;
+
+ /* Handle the special case of snapshot names that start with '_' */
+ if ((ceph_snap(dir) == CEPH_SNAPDIR) && (name_len > 0) &&
+ (iname.name[0] == '_')) {
+ dir = parse_longname(parent, iname.name, &name_len);
+ if (IS_ERR(dir))
+ return PTR_ERR(dir);
+ iname.name++; /* skip initial '_' */
+ }
+ iname.len = name_len;
+
+ if (!fscrypt_has_encryption_key(dir)) {
+ memcpy(buf, d_name->name, d_name->len);
+ elen = d_name->len;
+ goto out;
+ }
+
+ /*
+ * Convert cleartext d_name to ciphertext. If result is longer than
+ * CEPH_NOHASH_NAME_MAX, sha256 the remaining bytes
+ *
+ * See: fscrypt_setup_filename
+ */
+ if (!fscrypt_fname_encrypted_size(dir, iname.len, NAME_MAX, &len)) {
+ elen = -ENAMETOOLONG;
+ goto out;
+ }
+
+ /* Allocate a buffer appropriate to hold the result */
+ cryptbuf = kmalloc(len > CEPH_NOHASH_NAME_MAX ? NAME_MAX : len,
+ GFP_KERNEL);
+ if (!cryptbuf) {
+ elen = -ENOMEM;
+ goto out;
+ }
+
+ ret = fscrypt_fname_encrypt(dir, &iname, cryptbuf, len);
+ if (ret) {
+ elen = ret;
+ goto out;
+ }
+
+ /* hash the end if the name is long enough */
+ if (len > CEPH_NOHASH_NAME_MAX) {
+ u8 hash[SHA256_DIGEST_SIZE];
+ u8 *extra = cryptbuf + CEPH_NOHASH_NAME_MAX;
+
+ /*
+ * hash the extra bytes and overwrite crypttext beyond that
+ * point with it
+ */
+ sha256(extra, len - CEPH_NOHASH_NAME_MAX, hash);
+ memcpy(extra, hash, SHA256_DIGEST_SIZE);
+ len = CEPH_NOHASH_NAME_MAX + SHA256_DIGEST_SIZE;
+ }
+
+ /* base64 encode the encrypted name */
+ elen = ceph_base64_encode(cryptbuf, len, buf);
+ dout("base64-encoded ciphertext name = %.*s\n", elen, buf);
+
+ /* To understand the 240 limit, see CEPH_NOHASH_NAME_MAX comments */
+ WARN_ON(elen > 240);
+ if ((elen > 0) && (dir != parent)) {
+ char tmp_buf[NAME_MAX];
+
+ elen = snprintf(tmp_buf, sizeof(tmp_buf), "_%.*s_%ld",
+ elen, buf, dir->i_ino);
+ memcpy(buf, tmp_buf, elen);
+ }
+
+out:
+ kfree(cryptbuf);
+ if (dir != parent) {
+ if ((dir->i_state & I_NEW))
+ discard_new_inode(dir);
+ else
+ iput(dir);
+ }
+ return elen;
+}
+
+int ceph_encode_encrypted_fname(struct inode *parent, struct dentry *dentry,
+ char *buf)
+{
+ WARN_ON_ONCE(!fscrypt_has_encryption_key(parent));
+
+ return ceph_encode_encrypted_dname(parent, &dentry->d_name, buf);
+}
+
+/**
+ * ceph_fname_to_usr - convert a filename for userland presentation
+ * @fname: ceph_fname to be converted
+ * @tname: temporary name buffer to use for conversion (may be NULL)
+ * @oname: where converted name should be placed
+ * @is_nokey: set to true if key wasn't available during conversion (may be NULL)
+ *
+ * Given a filename (usually from the MDS), format it for presentation to
+ * userland. If @parent is not encrypted, just pass it back as-is.
+ *
+ * Otherwise, base64 decode the string, and then ask fscrypt to format it
+ * for userland presentation.
+ *
+ * Returns 0 on success or negative error code on error.
+ */
+int ceph_fname_to_usr(const struct ceph_fname *fname, struct fscrypt_str *tname,
+ struct fscrypt_str *oname, bool *is_nokey)
+{
+ struct inode *dir = fname->dir;
+ struct fscrypt_str _tname = FSTR_INIT(NULL, 0);
+ struct fscrypt_str iname;
+ char *name = fname->name;
+ int name_len = fname->name_len;
+ int ret;
+
+ /* Sanity check that the resulting name will fit in the buffer */
+ if (fname->name_len > NAME_MAX || fname->ctext_len > NAME_MAX)
+ return -EIO;
+
+ /* Handle the special case of snapshot names that start with '_' */
+ if ((ceph_snap(dir) == CEPH_SNAPDIR) && (name_len > 0) &&
+ (name[0] == '_')) {
+ dir = parse_longname(dir, name, &name_len);
+ if (IS_ERR(dir))
+ return PTR_ERR(dir);
+ name++; /* skip initial '_' */
+ }
+
+ if (!IS_ENCRYPTED(dir)) {
+ oname->name = fname->name;
+ oname->len = fname->name_len;
+ ret = 0;
+ goto out_inode;
+ }
+
+ ret = ceph_fscrypt_prepare_readdir(dir);
+ if (ret)
+ goto out_inode;
+
+ /*
+ * Use the raw dentry name as sent by the MDS instead of
+ * generating a nokey name via fscrypt.
+ */
+ if (!fscrypt_has_encryption_key(dir)) {
+ if (fname->no_copy)
+ oname->name = fname->name;
+ else
+ memcpy(oname->name, fname->name, fname->name_len);
+ oname->len = fname->name_len;
+ if (is_nokey)
+ *is_nokey = true;
+ ret = 0;
+ goto out_inode;
+ }
+
+ if (fname->ctext_len == 0) {
+ int declen;
+
+ if (!tname) {
+ ret = fscrypt_fname_alloc_buffer(NAME_MAX, &_tname);
+ if (ret)
+ goto out_inode;
+ tname = &_tname;
+ }
+
+ declen = ceph_base64_decode(name, name_len, tname->name);
+ if (declen <= 0) {
+ ret = -EIO;
+ goto out;
+ }
+ iname.name = tname->name;
+ iname.len = declen;
+ } else {
+ iname.name = fname->ctext;
+ iname.len = fname->ctext_len;
+ }
+
+ ret = fscrypt_fname_disk_to_usr(dir, 0, 0, &iname, oname);
+ if (!ret && (dir != fname->dir)) {
+ char tmp_buf[CEPH_BASE64_CHARS(NAME_MAX)];
+
+ name_len = snprintf(tmp_buf, sizeof(tmp_buf), "_%.*s_%ld",
+ oname->len, oname->name, dir->i_ino);
+ memcpy(oname->name, tmp_buf, name_len);
+ oname->len = name_len;
+ }
+
+out:
+ fscrypt_fname_free_buffer(&_tname);
+out_inode:
+ if (dir != fname->dir) {
+ if ((dir->i_state & I_NEW))
+ discard_new_inode(dir);
+ else
+ iput(dir);
+ }
+ return ret;
+}
+
+/**
+ * ceph_fscrypt_prepare_readdir - simple __fscrypt_prepare_readdir() wrapper
+ * @dir: directory inode for readdir prep
+ *
+ * Simple wrapper around __fscrypt_prepare_readdir() that will mark directory as
+ * non-complete if this call results in having the directory unlocked.
+ *
+ * Returns:
+ * 1 - if directory was locked and key is now loaded (i.e. dir is unlocked)
+ * 0 - if directory is still locked
+ * < 0 - if __fscrypt_prepare_readdir() fails
+ */
+int ceph_fscrypt_prepare_readdir(struct inode *dir)
+{
+ bool had_key = fscrypt_has_encryption_key(dir);
+ int err;
+
+ if (!IS_ENCRYPTED(dir))
+ return 0;
+
+ err = __fscrypt_prepare_readdir(dir);
+ if (err)
+ return err;
+ if (!had_key && fscrypt_has_encryption_key(dir)) {
+ /* directory just got unlocked, mark it as not complete */
+ ceph_dir_clear_complete(dir);
+ return 1;
+ }
+ return 0;
+}
+
+int ceph_fscrypt_decrypt_block_inplace(const struct inode *inode,
+ struct page *page, unsigned int len,
+ unsigned int offs, u64 lblk_num)
+{
+ dout("%s: len %u offs %u blk %llu\n", __func__, len, offs, lblk_num);
+ return fscrypt_decrypt_block_inplace(inode, page, len, offs, lblk_num);
+}
+
+int ceph_fscrypt_encrypt_block_inplace(const struct inode *inode,
+ struct page *page, unsigned int len,
+ unsigned int offs, u64 lblk_num,
+ gfp_t gfp_flags)
+{
+ dout("%s: len %u offs %u blk %llu\n", __func__, len, offs, lblk_num);
+ return fscrypt_encrypt_block_inplace(inode, page, len, offs, lblk_num,
+ gfp_flags);
+}
+
+/**
+ * ceph_fscrypt_decrypt_pages - decrypt an array of pages
+ * @inode: pointer to inode associated with these pages
+ * @page: pointer to page array
+ * @off: offset into the file that the read data starts
+ * @len: max length to decrypt
+ *
+ * Decrypt an array of fscrypt'ed pages and return the amount of
+ * data decrypted. Any data in the page prior to the start of the
+ * first complete block in the read is ignored. Any incomplete
+ * crypto blocks at the end of the array are ignored (and should
+ * probably be zeroed by the caller).
+ *
+ * Returns the length of the decrypted data or a negative errno.
+ */
+int ceph_fscrypt_decrypt_pages(struct inode *inode, struct page **page,
+ u64 off, int len)
+{
+ int i, num_blocks;
+ u64 baseblk = off >> CEPH_FSCRYPT_BLOCK_SHIFT;
+ int ret = 0;
+
+ /*
+ * We can't deal with partial blocks on an encrypted file, so mask off
+ * the last bit.
+ */
+ num_blocks = ceph_fscrypt_blocks(off, len & CEPH_FSCRYPT_BLOCK_MASK);
+
+ /* Decrypt each block */
+ for (i = 0; i < num_blocks; ++i) {
+ int blkoff = i << CEPH_FSCRYPT_BLOCK_SHIFT;
+ int pgidx = blkoff >> PAGE_SHIFT;
+ unsigned int pgoffs = offset_in_page(blkoff);
+ int fret;
+
+ fret = ceph_fscrypt_decrypt_block_inplace(inode, page[pgidx],
+ CEPH_FSCRYPT_BLOCK_SIZE, pgoffs,
+ baseblk + i);
+ if (fret < 0) {
+ if (ret == 0)
+ ret = fret;
+ break;
+ }
+ ret += CEPH_FSCRYPT_BLOCK_SIZE;
+ }
+ return ret;
+}
+
+/**
+ * ceph_fscrypt_decrypt_extents: decrypt received extents in given buffer
+ * @inode: inode associated with pages being decrypted
+ * @page: pointer to page array
+ * @off: offset into the file that the data in page[0] starts
+ * @map: pointer to extent array
+ * @ext_cnt: length of extent array
+ *
+ * Given an extent map and a page array, decrypt the received data in-place,
+ * skipping holes. Returns the offset into buffer of end of last decrypted
+ * block.
+ */
+int ceph_fscrypt_decrypt_extents(struct inode *inode, struct page **page,
+ u64 off, struct ceph_sparse_extent *map,
+ u32 ext_cnt)
+{
+ int i, ret = 0;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ u64 objno, objoff;
+ u32 xlen;
+
+ /* Nothing to do for empty array */
+ if (ext_cnt == 0) {
+ dout("%s: empty array, ret 0\n", __func__);
+ return 0;
+ }
+
+ ceph_calc_file_object_mapping(&ci->i_layout, off, map[0].len,
+ &objno, &objoff, &xlen);
+
+ for (i = 0; i < ext_cnt; ++i) {
+ struct ceph_sparse_extent *ext = &map[i];
+ int pgsoff = ext->off - objoff;
+ int pgidx = pgsoff >> PAGE_SHIFT;
+ int fret;
+
+ if ((ext->off | ext->len) & ~CEPH_FSCRYPT_BLOCK_MASK) {
+ pr_warn("%s: bad encrypted sparse extent idx %d off %llx len %llx\n",
+ __func__, i, ext->off, ext->len);
+ return -EIO;
+ }
+ fret = ceph_fscrypt_decrypt_pages(inode, &page[pgidx],
+ off + pgsoff, ext->len);
+ dout("%s: [%d] 0x%llx~0x%llx fret %d\n", __func__, i,
+ ext->off, ext->len, fret);
+ if (fret < 0) {
+ if (ret == 0)
+ ret = fret;
+ break;
+ }
+ ret = pgsoff + fret;
+ }
+ dout("%s: ret %d\n", __func__, ret);
+ return ret;
+}
+
+/**
+ * ceph_fscrypt_encrypt_pages - encrypt an array of pages
+ * @inode: pointer to inode associated with these pages
+ * @page: pointer to page array
+ * @off: offset into the file that the data starts
+ * @len: max length to encrypt
+ * @gfp: gfp flags to use for allocation
+ *
+ * Decrypt an array of cleartext pages and return the amount of
+ * data encrypted. Any data in the page prior to the start of the
+ * first complete block in the read is ignored. Any incomplete
+ * crypto blocks at the end of the array are ignored.
+ *
+ * Returns the length of the encrypted data or a negative errno.
+ */
+int ceph_fscrypt_encrypt_pages(struct inode *inode, struct page **page, u64 off,
+ int len, gfp_t gfp)
+{
+ int i, num_blocks;
+ u64 baseblk = off >> CEPH_FSCRYPT_BLOCK_SHIFT;
+ int ret = 0;
+
+ /*
+ * We can't deal with partial blocks on an encrypted file, so mask off
+ * the last bit.
+ */
+ num_blocks = ceph_fscrypt_blocks(off, len & CEPH_FSCRYPT_BLOCK_MASK);
+
+ /* Encrypt each block */
+ for (i = 0; i < num_blocks; ++i) {
+ int blkoff = i << CEPH_FSCRYPT_BLOCK_SHIFT;
+ int pgidx = blkoff >> PAGE_SHIFT;
+ unsigned int pgoffs = offset_in_page(blkoff);
+ int fret;
+
+ fret = ceph_fscrypt_encrypt_block_inplace(inode, page[pgidx],
+ CEPH_FSCRYPT_BLOCK_SIZE, pgoffs,
+ baseblk + i, gfp);
+ if (fret < 0) {
+ if (ret == 0)
+ ret = fret;
+ break;
+ }
+ ret += CEPH_FSCRYPT_BLOCK_SIZE;
+ }
+ return ret;
+}
diff --git a/fs/ceph/crypto.h b/fs/ceph/crypto.h
new file mode 100644
index 000000000000..47e0c319fc68
--- /dev/null
+++ b/fs/ceph/crypto.h
@@ -0,0 +1,288 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Ceph fscrypt functionality
+ */
+
+#ifndef _CEPH_CRYPTO_H
+#define _CEPH_CRYPTO_H
+
+#include <crypto/sha2.h>
+#include <linux/fscrypt.h>
+
+#define CEPH_FSCRYPT_BLOCK_SHIFT 12
+#define CEPH_FSCRYPT_BLOCK_SIZE (_AC(1, UL) << CEPH_FSCRYPT_BLOCK_SHIFT)
+#define CEPH_FSCRYPT_BLOCK_MASK (~(CEPH_FSCRYPT_BLOCK_SIZE-1))
+
+struct ceph_fs_client;
+struct ceph_acl_sec_ctx;
+struct ceph_mds_request;
+
+struct ceph_fname {
+ struct inode *dir;
+ char *name; // b64 encoded, possibly hashed
+ unsigned char *ctext; // binary crypttext (if any)
+ u32 name_len; // length of name buffer
+ u32 ctext_len; // length of crypttext
+ bool no_copy;
+};
+
+/*
+ * Header for the crypted file when truncating the size, this
+ * will be sent to MDS, and the MDS will update the encrypted
+ * last block and then truncate the size.
+ */
+struct ceph_fscrypt_truncate_size_header {
+ __u8 ver;
+ __u8 compat;
+
+ /*
+ * It will be sizeof(assert_ver + file_offset + block_size)
+ * if the last block is empty when it's located in a file
+ * hole. Or the data_len will plus CEPH_FSCRYPT_BLOCK_SIZE.
+ */
+ __le32 data_len;
+
+ __le64 change_attr;
+ __le64 file_offset;
+ __le32 block_size;
+} __packed;
+
+struct ceph_fscrypt_auth {
+ __le32 cfa_version;
+ __le32 cfa_blob_len;
+ u8 cfa_blob[FSCRYPT_SET_CONTEXT_MAX_SIZE];
+} __packed;
+
+#define CEPH_FSCRYPT_AUTH_VERSION 1
+static inline u32 ceph_fscrypt_auth_len(struct ceph_fscrypt_auth *fa)
+{
+ u32 ctxsize = le32_to_cpu(fa->cfa_blob_len);
+
+ return offsetof(struct ceph_fscrypt_auth, cfa_blob) + ctxsize;
+}
+
+#ifdef CONFIG_FS_ENCRYPTION
+/*
+ * We want to encrypt filenames when creating them, but the encrypted
+ * versions of those names may have illegal characters in them. To mitigate
+ * that, we base64 encode them, but that gives us a result that can exceed
+ * NAME_MAX.
+ *
+ * Follow a similar scheme to fscrypt itself, and cap the filename to a
+ * smaller size. If the ciphertext name is longer than the value below, then
+ * sha256 hash the remaining bytes.
+ *
+ * For the fscrypt_nokey_name struct the dirhash[2] member is useless in ceph
+ * so the corresponding struct will be:
+ *
+ * struct fscrypt_ceph_nokey_name {
+ * u8 bytes[157];
+ * u8 sha256[SHA256_DIGEST_SIZE];
+ * }; // 180 bytes => 240 bytes base64-encoded, which is <= NAME_MAX (255)
+ *
+ * (240 bytes is the maximum size allowed for snapshot names to take into
+ * account the format: '_<SNAPSHOT-NAME>_<INODE-NUMBER>'.)
+ *
+ * Note that for long names that end up having their tail portion hashed, we
+ * must also store the full encrypted name (in the dentry's alternate_name
+ * field).
+ */
+#define CEPH_NOHASH_NAME_MAX (180 - SHA256_DIGEST_SIZE)
+
+#define CEPH_BASE64_CHARS(nbytes) DIV_ROUND_UP((nbytes) * 4, 3)
+
+int ceph_base64_encode(const u8 *src, int srclen, char *dst);
+int ceph_base64_decode(const char *src, int srclen, u8 *dst);
+
+void ceph_fscrypt_set_ops(struct super_block *sb);
+
+void ceph_fscrypt_free_dummy_policy(struct ceph_fs_client *fsc);
+
+int ceph_fscrypt_prepare_context(struct inode *dir, struct inode *inode,
+ struct ceph_acl_sec_ctx *as);
+void ceph_fscrypt_as_ctx_to_req(struct ceph_mds_request *req,
+ struct ceph_acl_sec_ctx *as);
+int ceph_encode_encrypted_dname(struct inode *parent, struct qstr *d_name,
+ char *buf);
+int ceph_encode_encrypted_fname(struct inode *parent, struct dentry *dentry,
+ char *buf);
+
+static inline int ceph_fname_alloc_buffer(struct inode *parent,
+ struct fscrypt_str *fname)
+{
+ if (!IS_ENCRYPTED(parent))
+ return 0;
+ return fscrypt_fname_alloc_buffer(NAME_MAX, fname);
+}
+
+static inline void ceph_fname_free_buffer(struct inode *parent,
+ struct fscrypt_str *fname)
+{
+ if (IS_ENCRYPTED(parent))
+ fscrypt_fname_free_buffer(fname);
+}
+
+int ceph_fname_to_usr(const struct ceph_fname *fname, struct fscrypt_str *tname,
+ struct fscrypt_str *oname, bool *is_nokey);
+int ceph_fscrypt_prepare_readdir(struct inode *dir);
+
+static inline unsigned int ceph_fscrypt_blocks(u64 off, u64 len)
+{
+ /* crypto blocks cannot span more than one page */
+ BUILD_BUG_ON(CEPH_FSCRYPT_BLOCK_SHIFT > PAGE_SHIFT);
+
+ return ((off+len+CEPH_FSCRYPT_BLOCK_SIZE-1) >> CEPH_FSCRYPT_BLOCK_SHIFT) -
+ (off >> CEPH_FSCRYPT_BLOCK_SHIFT);
+}
+
+/*
+ * If we have an encrypted inode then we must adjust the offset and
+ * range of the on-the-wire read to cover an entire encryption block.
+ * The copy will be done using the original offset and length, after
+ * we've decrypted the result.
+ */
+static inline void ceph_fscrypt_adjust_off_and_len(struct inode *inode,
+ u64 *off, u64 *len)
+{
+ if (IS_ENCRYPTED(inode)) {
+ *len = ceph_fscrypt_blocks(*off, *len) * CEPH_FSCRYPT_BLOCK_SIZE;
+ *off &= CEPH_FSCRYPT_BLOCK_MASK;
+ }
+}
+
+int ceph_fscrypt_decrypt_block_inplace(const struct inode *inode,
+ struct page *page, unsigned int len,
+ unsigned int offs, u64 lblk_num);
+int ceph_fscrypt_encrypt_block_inplace(const struct inode *inode,
+ struct page *page, unsigned int len,
+ unsigned int offs, u64 lblk_num,
+ gfp_t gfp_flags);
+int ceph_fscrypt_decrypt_pages(struct inode *inode, struct page **page,
+ u64 off, int len);
+int ceph_fscrypt_decrypt_extents(struct inode *inode, struct page **page,
+ u64 off, struct ceph_sparse_extent *map,
+ u32 ext_cnt);
+int ceph_fscrypt_encrypt_pages(struct inode *inode, struct page **page, u64 off,
+ int len, gfp_t gfp);
+
+static inline struct page *ceph_fscrypt_pagecache_page(struct page *page)
+{
+ return fscrypt_is_bounce_page(page) ? fscrypt_pagecache_page(page) : page;
+}
+
+#else /* CONFIG_FS_ENCRYPTION */
+
+static inline void ceph_fscrypt_set_ops(struct super_block *sb)
+{
+}
+
+static inline void ceph_fscrypt_free_dummy_policy(struct ceph_fs_client *fsc)
+{
+}
+
+static inline int ceph_fscrypt_prepare_context(struct inode *dir,
+ struct inode *inode,
+ struct ceph_acl_sec_ctx *as)
+{
+ if (IS_ENCRYPTED(dir))
+ return -EOPNOTSUPP;
+ return 0;
+}
+
+static inline void ceph_fscrypt_as_ctx_to_req(struct ceph_mds_request *req,
+ struct ceph_acl_sec_ctx *as_ctx)
+{
+}
+
+static inline int ceph_encode_encrypted_dname(struct inode *parent,
+ struct qstr *d_name, char *buf)
+{
+ memcpy(buf, d_name->name, d_name->len);
+ return d_name->len;
+}
+
+static inline int ceph_encode_encrypted_fname(struct inode *parent,
+ struct dentry *dentry, char *buf)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline int ceph_fname_alloc_buffer(struct inode *parent,
+ struct fscrypt_str *fname)
+{
+ return 0;
+}
+
+static inline void ceph_fname_free_buffer(struct inode *parent,
+ struct fscrypt_str *fname)
+{
+}
+
+static inline int ceph_fname_to_usr(const struct ceph_fname *fname,
+ struct fscrypt_str *tname,
+ struct fscrypt_str *oname, bool *is_nokey)
+{
+ oname->name = fname->name;
+ oname->len = fname->name_len;
+ return 0;
+}
+
+static inline int ceph_fscrypt_prepare_readdir(struct inode *dir)
+{
+ return 0;
+}
+
+static inline void ceph_fscrypt_adjust_off_and_len(struct inode *inode,
+ u64 *off, u64 *len)
+{
+}
+
+static inline int ceph_fscrypt_decrypt_block_inplace(const struct inode *inode,
+ struct page *page, unsigned int len,
+ unsigned int offs, u64 lblk_num)
+{
+ return 0;
+}
+
+static inline int ceph_fscrypt_encrypt_block_inplace(const struct inode *inode,
+ struct page *page, unsigned int len,
+ unsigned int offs, u64 lblk_num,
+ gfp_t gfp_flags)
+{
+ return 0;
+}
+
+static inline int ceph_fscrypt_decrypt_pages(struct inode *inode,
+ struct page **page, u64 off,
+ int len)
+{
+ return 0;
+}
+
+static inline int ceph_fscrypt_decrypt_extents(struct inode *inode,
+ struct page **page, u64 off,
+ struct ceph_sparse_extent *map,
+ u32 ext_cnt)
+{
+ return 0;
+}
+
+static inline int ceph_fscrypt_encrypt_pages(struct inode *inode,
+ struct page **page, u64 off,
+ int len, gfp_t gfp)
+{
+ return 0;
+}
+
+static inline struct page *ceph_fscrypt_pagecache_page(struct page *page)
+{
+ return page;
+}
+#endif /* CONFIG_FS_ENCRYPTION */
+
+static inline loff_t ceph_fscrypt_page_offset(struct page *page)
+{
+ return page_offset(ceph_fscrypt_pagecache_page(page));
+}
+
+#endif /* _CEPH_CRYPTO_H */
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index bdcffb04513f..854cbdd66661 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -9,6 +9,7 @@
#include "super.h"
#include "mds_client.h"
+#include "crypto.h"
/*
* Directory operations: readdir, lookup, create, link, unlink,
@@ -241,7 +242,9 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx,
di = ceph_dentry(dentry);
if (d_unhashed(dentry) ||
d_really_is_negative(dentry) ||
- di->lease_shared_gen != shared_gen) {
+ di->lease_shared_gen != shared_gen ||
+ ((dentry->d_flags & DCACHE_NOKEY_NAME) &&
+ fscrypt_has_encryption_key(dir))) {
spin_unlock(&dentry->d_lock);
dput(dentry);
err = -EAGAIN;
@@ -340,6 +343,10 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
ctx->pos = 2;
}
+ err = ceph_fscrypt_prepare_readdir(inode);
+ if (err < 0)
+ return err;
+
spin_lock(&ci->i_ceph_lock);
/* request Fx cap. if have Fx, we don't need to release Fs cap
* for later create/unlink. */
@@ -389,6 +396,7 @@ more:
req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
if (IS_ERR(req))
return PTR_ERR(req);
+
err = ceph_alloc_readdir_reply_buffer(req, inode);
if (err) {
ceph_mdsc_put_request(req);
@@ -402,11 +410,21 @@ more:
req->r_inode_drop = CEPH_CAP_FILE_EXCL;
}
if (dfi->last_name) {
- req->r_path2 = kstrdup(dfi->last_name, GFP_KERNEL);
+ struct qstr d_name = { .name = dfi->last_name,
+ .len = strlen(dfi->last_name) };
+
+ req->r_path2 = kzalloc(NAME_MAX + 1, GFP_KERNEL);
if (!req->r_path2) {
ceph_mdsc_put_request(req);
return -ENOMEM;
}
+
+ err = ceph_encode_encrypted_dname(inode, &d_name,
+ req->r_path2);
+ if (err < 0) {
+ ceph_mdsc_put_request(req);
+ return err;
+ }
} else if (is_hash_order(ctx->pos)) {
req->r_args.readdir.offset_hash =
cpu_to_le32(fpos_hash(ctx->pos));
@@ -511,15 +529,20 @@ more:
for (; i < rinfo->dir_nr; i++) {
struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
- BUG_ON(rde->offset < ctx->pos);
+ if (rde->offset < ctx->pos) {
+ pr_warn("%s: rde->offset 0x%llx ctx->pos 0x%llx\n",
+ __func__, rde->offset, ctx->pos);
+ return -EIO;
+ }
+
+ if (WARN_ON_ONCE(!rde->inode.in))
+ return -EIO;
ctx->pos = rde->offset;
dout("readdir (%d/%d) -> %llx '%.*s' %p\n",
i, rinfo->dir_nr, ctx->pos,
rde->name_len, rde->name, &rde->inode.in);
- BUG_ON(!rde->inode.in);
-
if (!dir_emit(ctx, rde->name, rde->name_len,
ceph_present_ino(inode->i_sb, le64_to_cpu(rde->inode.in->ino)),
le32_to_cpu(rde->inode.in->mode) >> 12)) {
@@ -532,6 +555,8 @@ more:
dout("filldir stopping us...\n");
return 0;
}
+
+ /* Reset the lengths to their original allocated vals */
ctx->pos++;
}
@@ -586,7 +611,6 @@ more:
dfi->dir_ordered_count);
spin_unlock(&ci->i_ceph_lock);
}
-
dout("readdir %p file %p done.\n", inode, file);
return 0;
}
@@ -760,6 +784,18 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
if (dentry->d_name.len > NAME_MAX)
return ERR_PTR(-ENAMETOOLONG);
+ if (IS_ENCRYPTED(dir)) {
+ bool had_key = fscrypt_has_encryption_key(dir);
+
+ err = fscrypt_prepare_lookup_partial(dir, dentry);
+ if (err < 0)
+ return ERR_PTR(err);
+
+ /* mark directory as incomplete if it has been unlocked */
+ if (!had_key && fscrypt_has_encryption_key(dir))
+ ceph_dir_clear_complete(dir);
+ }
+
/* can we conclude ENOENT locally? */
if (d_really_is_negative(dentry)) {
struct ceph_inode_info *ci = ceph_inode(dir);
@@ -865,13 +901,6 @@ static int ceph_mknod(struct mnt_idmap *idmap, struct inode *dir,
goto out;
}
- err = ceph_pre_init_acls(dir, &mode, &as_ctx);
- if (err < 0)
- goto out;
- err = ceph_security_init_secctx(dentry, mode, &as_ctx);
- if (err < 0)
- goto out;
-
dout("mknod in dir %p dentry %p mode 0%ho rdev %d\n",
dir, dentry, mode, rdev);
req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_MKNOD, USE_AUTH_MDS);
@@ -879,6 +908,17 @@ static int ceph_mknod(struct mnt_idmap *idmap, struct inode *dir,
err = PTR_ERR(req);
goto out;
}
+
+ req->r_new_inode = ceph_new_inode(dir, dentry, &mode, &as_ctx);
+ if (IS_ERR(req->r_new_inode)) {
+ err = PTR_ERR(req->r_new_inode);
+ req->r_new_inode = NULL;
+ goto out_req;
+ }
+
+ if (S_ISREG(mode) && IS_ENCRYPTED(dir))
+ set_bit(CEPH_MDS_R_FSCRYPT_FILE, &req->r_req_flags);
+
req->r_dentry = dget(dentry);
req->r_num_caps = 2;
req->r_parent = dir;
@@ -889,13 +929,13 @@ static int ceph_mknod(struct mnt_idmap *idmap, struct inode *dir,
req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL |
CEPH_CAP_XATTR_EXCL;
req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
- if (as_ctx.pagelist) {
- req->r_pagelist = as_ctx.pagelist;
- as_ctx.pagelist = NULL;
- }
+
+ ceph_as_ctx_to_req(req, &as_ctx);
+
err = ceph_mdsc_do_request(mdsc, dir, req);
if (!err && !req->r_reply_info.head->is_dentry)
err = ceph_handle_notrace_create(dir, dentry);
+out_req:
ceph_mdsc_put_request(req);
out:
if (!err)
@@ -912,12 +952,50 @@ static int ceph_create(struct mnt_idmap *idmap, struct inode *dir,
return ceph_mknod(idmap, dir, dentry, mode, 0);
}
+#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
+static int prep_encrypted_symlink_target(struct ceph_mds_request *req,
+ const char *dest)
+{
+ int err;
+ int len = strlen(dest);
+ struct fscrypt_str osd_link = FSTR_INIT(NULL, 0);
+
+ err = fscrypt_prepare_symlink(req->r_parent, dest, len, PATH_MAX,
+ &osd_link);
+ if (err)
+ goto out;
+
+ err = fscrypt_encrypt_symlink(req->r_new_inode, dest, len, &osd_link);
+ if (err)
+ goto out;
+
+ req->r_path2 = kmalloc(CEPH_BASE64_CHARS(osd_link.len) + 1, GFP_KERNEL);
+ if (!req->r_path2) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ len = ceph_base64_encode(osd_link.name, osd_link.len, req->r_path2);
+ req->r_path2[len] = '\0';
+out:
+ fscrypt_fname_free_buffer(&osd_link);
+ return err;
+}
+#else
+static int prep_encrypted_symlink_target(struct ceph_mds_request *req,
+ const char *dest)
+{
+ return -EOPNOTSUPP;
+}
+#endif
+
static int ceph_symlink(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry, const char *dest)
{
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
struct ceph_mds_request *req;
struct ceph_acl_sec_ctx as_ctx = {};
+ umode_t mode = S_IFLNK | 0777;
int err;
if (ceph_snap(dir) != CEPH_NOSNAP)
@@ -932,38 +1010,48 @@ static int ceph_symlink(struct mnt_idmap *idmap, struct inode *dir,
goto out;
}
- err = ceph_security_init_secctx(dentry, S_IFLNK | 0777, &as_ctx);
- if (err < 0)
- goto out;
-
dout("symlink in dir %p dentry %p to '%s'\n", dir, dentry, dest);
req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SYMLINK, USE_AUTH_MDS);
if (IS_ERR(req)) {
err = PTR_ERR(req);
goto out;
}
- req->r_path2 = kstrdup(dest, GFP_KERNEL);
- if (!req->r_path2) {
- err = -ENOMEM;
- ceph_mdsc_put_request(req);
- goto out;
+
+ req->r_new_inode = ceph_new_inode(dir, dentry, &mode, &as_ctx);
+ if (IS_ERR(req->r_new_inode)) {
+ err = PTR_ERR(req->r_new_inode);
+ req->r_new_inode = NULL;
+ goto out_req;
}
+
req->r_parent = dir;
ihold(dir);
+ if (IS_ENCRYPTED(req->r_new_inode)) {
+ err = prep_encrypted_symlink_target(req, dest);
+ if (err)
+ goto out_req;
+ } else {
+ req->r_path2 = kstrdup(dest, GFP_KERNEL);
+ if (!req->r_path2) {
+ err = -ENOMEM;
+ goto out_req;
+ }
+ }
+
set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
req->r_dentry = dget(dentry);
req->r_num_caps = 2;
req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL |
CEPH_CAP_XATTR_EXCL;
req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
- if (as_ctx.pagelist) {
- req->r_pagelist = as_ctx.pagelist;
- as_ctx.pagelist = NULL;
- }
+
+ ceph_as_ctx_to_req(req, &as_ctx);
+
err = ceph_mdsc_do_request(mdsc, dir, req);
if (!err && !req->r_reply_info.head->is_dentry)
err = ceph_handle_notrace_create(dir, dentry);
+out_req:
ceph_mdsc_put_request(req);
out:
if (err)
@@ -1003,14 +1091,12 @@ static int ceph_mkdir(struct mnt_idmap *idmap, struct inode *dir,
err = -EDQUOT;
goto out;
}
-
- mode |= S_IFDIR;
- err = ceph_pre_init_acls(dir, &mode, &as_ctx);
- if (err < 0)
- goto out;
- err = ceph_security_init_secctx(dentry, mode, &as_ctx);
- if (err < 0)
+ if ((op == CEPH_MDS_OP_MKSNAP) && IS_ENCRYPTED(dir) &&
+ !fscrypt_has_encryption_key(dir)) {
+ err = -ENOKEY;
goto out;
+ }
+
req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
if (IS_ERR(req)) {
@@ -1018,6 +1104,14 @@ static int ceph_mkdir(struct mnt_idmap *idmap, struct inode *dir,
goto out;
}
+ mode |= S_IFDIR;
+ req->r_new_inode = ceph_new_inode(dir, dentry, &mode, &as_ctx);
+ if (IS_ERR(req->r_new_inode)) {
+ err = PTR_ERR(req->r_new_inode);
+ req->r_new_inode = NULL;
+ goto out_req;
+ }
+
req->r_dentry = dget(dentry);
req->r_num_caps = 2;
req->r_parent = dir;
@@ -1027,15 +1121,15 @@ static int ceph_mkdir(struct mnt_idmap *idmap, struct inode *dir,
req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL |
CEPH_CAP_XATTR_EXCL;
req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
- if (as_ctx.pagelist) {
- req->r_pagelist = as_ctx.pagelist;
- as_ctx.pagelist = NULL;
- }
+
+ ceph_as_ctx_to_req(req, &as_ctx);
+
err = ceph_mdsc_do_request(mdsc, dir, req);
if (!err &&
!req->r_reply_info.head->is_target &&
!req->r_reply_info.head->is_dentry)
err = ceph_handle_notrace_create(dir, dentry);
+out_req:
ceph_mdsc_put_request(req);
out:
if (!err)
@@ -1063,6 +1157,10 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir,
if (ceph_snap(dir) != CEPH_NOSNAP)
return -EROFS;
+ err = fscrypt_prepare_link(old_dentry, dir, dentry);
+ if (err)
+ return err;
+
dout("link in dir %p %llx.%llx old_dentry %p:'%pd' dentry %p:'%pd'\n",
dir, ceph_vinop(dir), old_dentry, old_dentry, dentry, dentry);
req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LINK, USE_AUTH_MDS);
@@ -1310,6 +1408,11 @@ static int ceph_rename(struct mnt_idmap *idmap, struct inode *old_dir,
if (err)
return err;
+ err = fscrypt_prepare_rename(old_dir, old_dentry, new_dir, new_dentry,
+ flags);
+ if (err)
+ return err;
+
dout("rename dir %p dentry %p to dir %p dentry %p\n",
old_dir, old_dentry, new_dir, new_dentry);
req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
@@ -1765,6 +1868,10 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
struct inode *dir, *inode;
struct ceph_mds_client *mdsc;
+ valid = fscrypt_d_revalidate(dentry, flags);
+ if (valid <= 0)
+ return valid;
+
if (flags & LOOKUP_RCU) {
parent = READ_ONCE(dentry->d_parent);
dir = d_inode_rcu(parent);
@@ -1777,8 +1884,9 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
inode = d_inode(dentry);
}
- dout("d_revalidate %p '%pd' inode %p offset 0x%llx\n", dentry,
- dentry, inode, ceph_dentry(dentry)->offset);
+ dout("d_revalidate %p '%pd' inode %p offset 0x%llx nokey %d\n", dentry,
+ dentry, inode, ceph_dentry(dentry)->offset,
+ !!(dentry->d_flags & DCACHE_NOKEY_NAME));
mdsc = ceph_sb_to_client(dir->i_sb)->mdsc;
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index f780e4e0d062..8559990a59a5 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -7,6 +7,7 @@
#include "super.h"
#include "mds_client.h"
+#include "crypto.h"
/*
* Basic fh
@@ -535,7 +536,9 @@ static int ceph_get_name(struct dentry *parent, char *name,
{
struct ceph_mds_client *mdsc;
struct ceph_mds_request *req;
+ struct inode *dir = d_inode(parent);
struct inode *inode = d_inode(child);
+ struct ceph_mds_reply_info_parsed *rinfo;
int err;
if (ceph_snap(inode) != CEPH_NOSNAP)
@@ -547,30 +550,47 @@ static int ceph_get_name(struct dentry *parent, char *name,
if (IS_ERR(req))
return PTR_ERR(req);
- inode_lock(d_inode(parent));
-
+ inode_lock(dir);
req->r_inode = inode;
ihold(inode);
req->r_ino2 = ceph_vino(d_inode(parent));
- req->r_parent = d_inode(parent);
- ihold(req->r_parent);
+ req->r_parent = dir;
+ ihold(dir);
set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
req->r_num_caps = 2;
err = ceph_mdsc_do_request(mdsc, NULL, req);
+ inode_unlock(dir);
- inode_unlock(d_inode(parent));
+ if (err)
+ goto out;
- if (!err) {
- struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
+ rinfo = &req->r_reply_info;
+ if (!IS_ENCRYPTED(dir)) {
memcpy(name, rinfo->dname, rinfo->dname_len);
name[rinfo->dname_len] = 0;
- dout("get_name %p ino %llx.%llx name %s\n",
- child, ceph_vinop(inode), name);
} else {
- dout("get_name %p ino %llx.%llx err %d\n",
- child, ceph_vinop(inode), err);
- }
+ struct fscrypt_str oname = FSTR_INIT(NULL, 0);
+ struct ceph_fname fname = { .dir = dir,
+ .name = rinfo->dname,
+ .ctext = rinfo->altname,
+ .name_len = rinfo->dname_len,
+ .ctext_len = rinfo->altname_len };
+
+ err = ceph_fname_alloc_buffer(dir, &oname);
+ if (err < 0)
+ goto out;
+ err = ceph_fname_to_usr(&fname, NULL, &oname, NULL);
+ if (!err) {
+ memcpy(name, oname.name, oname.len);
+ name[oname.len] = 0;
+ }
+ ceph_fname_free_buffer(dir, &oname);
+ }
+out:
+ dout("get_name %p ino %llx.%llx err %d %s%s\n",
+ child, ceph_vinop(inode), err,
+ err ? "" : "name ", err ? "" : name);
ceph_mdsc_put_request(req);
return err;
}
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 63efe5389783..b5f8038065d7 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -366,8 +366,13 @@ int ceph_open(struct inode *inode, struct file *file)
/* filter out O_CREAT|O_EXCL; vfs did that already. yuck. */
flags = file->f_flags & ~(O_CREAT|O_EXCL);
- if (S_ISDIR(inode->i_mode))
+ if (S_ISDIR(inode->i_mode)) {
flags = O_DIRECTORY; /* mds likes to know */
+ } else if (S_ISREG(inode->i_mode)) {
+ err = fscrypt_file_open(inode, file);
+ if (err)
+ return err;
+ }
dout("open inode %p ino %llx.%llx file %p flags %d (%d)\n", inode,
ceph_vinop(inode), file, flags, file->f_flags);
@@ -604,7 +609,8 @@ out:
ceph_mdsc_release_dir_caps(req);
}
-static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry,
+static int ceph_finish_async_create(struct inode *dir, struct inode *inode,
+ struct dentry *dentry,
struct file *file, umode_t mode,
struct ceph_mds_request *req,
struct ceph_acl_sec_ctx *as_ctx,
@@ -616,7 +622,6 @@ static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry,
struct ceph_mds_reply_info_in iinfo = { .in = &in };
struct ceph_inode_info *ci = ceph_inode(dir);
struct ceph_dentry_info *di = ceph_dentry(dentry);
- struct inode *inode;
struct timespec64 now;
struct ceph_string *pool_ns;
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
@@ -625,10 +630,6 @@ static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry,
ktime_get_real_ts64(&now);
- inode = ceph_get_inode(dentry->d_sb, vino);
- if (IS_ERR(inode))
- return PTR_ERR(inode);
-
iinfo.inline_version = CEPH_INLINE_NONE;
iinfo.change_attr = 1;
ceph_encode_timespec64(&iinfo.btime, &now);
@@ -686,8 +687,7 @@ static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry,
ceph_dir_clear_complete(dir);
if (!d_unhashed(dentry))
d_drop(dentry);
- if (inode->i_state & I_NEW)
- discard_new_inode(inode);
+ discard_new_inode(inode);
} else {
struct dentry *dn;
@@ -733,6 +733,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_mds_request *req;
+ struct inode *new_inode = NULL;
struct dentry *dn;
struct ceph_acl_sec_ctx as_ctx = {};
bool try_async = ceph_test_mount_opt(fsc, ASYNC_DIROPS);
@@ -755,15 +756,16 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
*/
flags &= ~O_TRUNC;
+retry:
if (flags & O_CREAT) {
if (ceph_quota_is_max_files_exceeded(dir))
return -EDQUOT;
- err = ceph_pre_init_acls(dir, &mode, &as_ctx);
- if (err < 0)
- return err;
- err = ceph_security_init_secctx(dentry, mode, &as_ctx);
- if (err < 0)
+
+ new_inode = ceph_new_inode(dir, dentry, &mode, &as_ctx);
+ if (IS_ERR(new_inode)) {
+ err = PTR_ERR(new_inode);
goto out_ctx;
+ }
/* Async create can't handle more than a page of xattrs */
if (as_ctx.pagelist &&
!list_is_singular(&as_ctx.pagelist->head))
@@ -772,7 +774,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
/* If it's not being looked up, it's negative */
return -ENOENT;
}
-retry:
+
/* do the open */
req = prepare_open_request(dir->i_sb, flags, mode);
if (IS_ERR(req)) {
@@ -787,6 +789,12 @@ retry:
req->r_args.open.mask = cpu_to_le32(mask);
req->r_parent = dir;
ihold(dir);
+ if (IS_ENCRYPTED(dir)) {
+ set_bit(CEPH_MDS_R_FSCRYPT_FILE, &req->r_req_flags);
+ err = fscrypt_prepare_lookup_partial(dir, dentry);
+ if (err < 0)
+ goto out_req;
+ }
if (flags & O_CREAT) {
struct ceph_file_layout lo;
@@ -794,32 +802,47 @@ retry:
req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL |
CEPH_CAP_XATTR_EXCL;
req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
- if (as_ctx.pagelist) {
- req->r_pagelist = as_ctx.pagelist;
- as_ctx.pagelist = NULL;
- }
- if (try_async &&
- (req->r_dir_caps =
- try_prep_async_create(dir, dentry, &lo,
- &req->r_deleg_ino))) {
+
+ ceph_as_ctx_to_req(req, &as_ctx);
+
+ if (try_async && (req->r_dir_caps =
+ try_prep_async_create(dir, dentry, &lo,
+ &req->r_deleg_ino))) {
+ struct ceph_vino vino = { .ino = req->r_deleg_ino,
+ .snap = CEPH_NOSNAP };
struct ceph_dentry_info *di = ceph_dentry(dentry);
set_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags);
req->r_args.open.flags |= cpu_to_le32(CEPH_O_EXCL);
req->r_callback = ceph_async_create_cb;
+ /* Hash inode before RPC */
+ new_inode = ceph_get_inode(dir->i_sb, vino, new_inode);
+ if (IS_ERR(new_inode)) {
+ err = PTR_ERR(new_inode);
+ new_inode = NULL;
+ goto out_req;
+ }
+ WARN_ON_ONCE(!(new_inode->i_state & I_NEW));
+
spin_lock(&dentry->d_lock);
di->flags |= CEPH_DENTRY_ASYNC_CREATE;
spin_unlock(&dentry->d_lock);
err = ceph_mdsc_submit_request(mdsc, dir, req);
if (!err) {
- err = ceph_finish_async_create(dir, dentry,
- file, mode, req,
- &as_ctx, &lo);
+ err = ceph_finish_async_create(dir, new_inode,
+ dentry, file,
+ mode, req,
+ &as_ctx, &lo);
+ new_inode = NULL;
} else if (err == -EJUKEBOX) {
restore_deleg_ino(dir, req->r_deleg_ino);
ceph_mdsc_put_request(req);
+ discard_new_inode(new_inode);
+ ceph_release_acl_sec_ctx(&as_ctx);
+ memset(&as_ctx, 0, sizeof(as_ctx));
+ new_inode = NULL;
try_async = false;
ceph_put_string(rcu_dereference_raw(lo.pool_ns));
goto retry;
@@ -830,6 +853,8 @@ retry:
}
set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
+ req->r_new_inode = new_inode;
+ new_inode = NULL;
err = ceph_mdsc_do_request(mdsc, (flags & O_CREAT) ? dir : NULL, req);
if (err == -ENOENT) {
dentry = ceph_handle_snapdir(req, dentry);
@@ -858,6 +883,13 @@ retry:
dout("atomic_open finish_no_open on dn %p\n", dn);
err = finish_no_open(file, dn);
} else {
+ if (IS_ENCRYPTED(dir) &&
+ !fscrypt_has_permitted_context(dir, d_inode(dentry))) {
+ pr_warn("Inconsistent encryption context (parent %llx:%llx child %llx:%llx)\n",
+ ceph_vinop(dir), ceph_vinop(d_inode(dentry)));
+ goto out_req;
+ }
+
dout("atomic_open finish_open on dn %p\n", dn);
if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) {
struct inode *newino = d_inode(dentry);
@@ -870,6 +902,7 @@ retry:
}
out_req:
ceph_mdsc_put_request(req);
+ iput(new_inode);
out_ctx:
ceph_release_acl_sec_ctx(&as_ctx);
dout("atomic_open result=%d\n", err);
@@ -924,21 +957,24 @@ enum {
* If we get a short result from the OSD, check against i_size; we need to
* only return a short read to the caller if we hit EOF.
*/
-static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
- int *retry_op)
+ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos,
+ struct iov_iter *to, int *retry_op,
+ u64 *last_objver)
{
- struct file *file = iocb->ki_filp;
- struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
struct ceph_osd_client *osdc = &fsc->client->osdc;
ssize_t ret;
- u64 off = iocb->ki_pos;
+ u64 off = *ki_pos;
u64 len = iov_iter_count(to);
u64 i_size = i_size_read(inode);
+ bool sparse = IS_ENCRYPTED(inode) || ceph_test_mount_opt(fsc, SPARSEREAD);
+ u64 objver = 0;
- dout("sync_read on file %p %llu~%u %s\n", file, off, (unsigned)len,
- (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
+ dout("sync_read on inode %p %llx~%llx\n", inode, *ki_pos, len);
+
+ if (ceph_inode_is_shutdown(inode))
+ return -EIO;
if (!len)
return 0;
@@ -962,10 +998,21 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
bool more;
int idx;
size_t left;
+ struct ceph_osd_req_op *op;
+ u64 read_off = off;
+ u64 read_len = len;
+
+ /* determine new offset/length if encrypted */
+ ceph_fscrypt_adjust_off_and_len(inode, &read_off, &read_len);
+
+ dout("sync_read orig %llu~%llu reading %llu~%llu",
+ off, len, read_off, read_len);
req = ceph_osdc_new_request(osdc, &ci->i_layout,
- ci->i_vino, off, &len, 0, 1,
- CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
+ ci->i_vino, read_off, &read_len, 0, 1,
+ sparse ? CEPH_OSD_OP_SPARSE_READ :
+ CEPH_OSD_OP_READ,
+ CEPH_OSD_FLAG_READ,
NULL, ci->i_truncate_seq,
ci->i_truncate_size, false);
if (IS_ERR(req)) {
@@ -973,10 +1020,13 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
break;
}
+ /* adjust len downward if the request truncated the len */
+ if (off + len > read_off + read_len)
+ len = read_off + read_len - off;
more = len < iov_iter_count(to);
- num_pages = calc_pages_for(off, len);
- page_off = off & ~PAGE_MASK;
+ num_pages = calc_pages_for(read_off, read_len);
+ page_off = offset_in_page(off);
pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
if (IS_ERR(pages)) {
ceph_osdc_put_request(req);
@@ -984,29 +1034,75 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
break;
}
- osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_off,
+ osd_req_op_extent_osd_data_pages(req, 0, pages, read_len,
+ offset_in_page(read_off),
false, false);
+
+ op = &req->r_ops[0];
+ if (sparse) {
+ ret = ceph_alloc_sparse_ext_map(op);
+ if (ret) {
+ ceph_osdc_put_request(req);
+ break;
+ }
+ }
+
ceph_osdc_start_request(osdc, req);
ret = ceph_osdc_wait_request(osdc, req);
ceph_update_read_metrics(&fsc->mdsc->metric,
req->r_start_latency,
req->r_end_latency,
- len, ret);
+ read_len, ret);
- ceph_osdc_put_request(req);
+ if (ret > 0)
+ objver = req->r_version;
i_size = i_size_read(inode);
dout("sync_read %llu~%llu got %zd i_size %llu%s\n",
off, len, ret, i_size, (more ? " MORE" : ""));
- if (ret == -ENOENT)
+ /* Fix it to go to end of extent map */
+ if (sparse && ret >= 0)
+ ret = ceph_sparse_ext_map_end(op);
+ else if (ret == -ENOENT)
ret = 0;
+
+ if (ret > 0 && IS_ENCRYPTED(inode)) {
+ int fret;
+
+ fret = ceph_fscrypt_decrypt_extents(inode, pages,
+ read_off, op->extent.sparse_ext,
+ op->extent.sparse_ext_cnt);
+ if (fret < 0) {
+ ret = fret;
+ ceph_osdc_put_request(req);
+ break;
+ }
+
+ /* account for any partial block at the beginning */
+ fret -= (off - read_off);
+
+ /*
+ * Short read after big offset adjustment?
+ * Nothing is usable, just call it a zero
+ * len read.
+ */
+ fret = max(fret, 0);
+
+ /* account for partial block at the end */
+ ret = min_t(ssize_t, fret, len);
+ }
+
+ ceph_osdc_put_request(req);
+
+ /* Short read but not EOF? Zero out the remainder. */
if (ret >= 0 && ret < len && (off + ret < i_size)) {
int zlen = min(len - ret, i_size - off - ret);
int zoff = page_off + ret;
+
dout("sync_read zero gap %llu~%llu\n",
- off + ret, off + ret + zlen);
+ off + ret, off + ret + zlen);
ceph_zero_page_vector_range(zoff, zlen, pages);
ret += zlen;
}
@@ -1014,15 +1110,16 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
idx = 0;
left = ret > 0 ? ret : 0;
while (left > 0) {
- size_t len, copied;
- page_off = off & ~PAGE_MASK;
- len = min_t(size_t, left, PAGE_SIZE - page_off);
+ size_t plen, copied;
+
+ plen = min_t(size_t, left, PAGE_SIZE - page_off);
SetPageUptodate(pages[idx]);
copied = copy_page_to_iter(pages[idx++],
- page_off, len, to);
+ page_off, plen, to);
off += copied;
left -= copied;
- if (copied < len) {
+ page_off = 0;
+ if (copied < plen) {
ret = -EFAULT;
break;
}
@@ -1039,21 +1136,37 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
break;
}
- if (off > iocb->ki_pos) {
- if (off >= i_size) {
- *retry_op = CHECK_EOF;
- ret = i_size - iocb->ki_pos;
- iocb->ki_pos = i_size;
- } else {
- ret = off - iocb->ki_pos;
- iocb->ki_pos = off;
+ if (ret > 0) {
+ if (off > *ki_pos) {
+ if (off >= i_size) {
+ *retry_op = CHECK_EOF;
+ ret = i_size - *ki_pos;
+ *ki_pos = i_size;
+ } else {
+ ret = off - *ki_pos;
+ *ki_pos = off;
+ }
}
- }
+ if (last_objver)
+ *last_objver = objver;
+ }
dout("sync_read result %zd retry_op %d\n", ret, *retry_op);
return ret;
}
+static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
+ int *retry_op)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file_inode(file);
+
+ dout("sync_read on file %p %llx~%zx %s\n", file, iocb->ki_pos,
+ iov_iter_count(to), (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
+
+ return __ceph_sync_read(inode, &iocb->ki_pos, to, retry_op, NULL);
+}
+
struct ceph_aio_request {
struct kiocb *iocb;
size_t total_len;
@@ -1125,8 +1238,10 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req)
struct inode *inode = req->r_inode;
struct ceph_aio_request *aio_req = req->r_priv;
struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0);
+ struct ceph_osd_req_op *op = &req->r_ops[0];
struct ceph_client_metric *metric = &ceph_sb_to_mdsc(inode->i_sb)->metric;
unsigned int len = osd_data->bvec_pos.iter.bi_size;
+ bool sparse = (op->op == CEPH_OSD_OP_SPARSE_READ);
BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_BVECS);
BUG_ON(!osd_data->num_bvecs);
@@ -1147,6 +1262,8 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req)
}
rc = -ENOMEM;
} else if (!aio_req->write) {
+ if (sparse && rc >= 0)
+ rc = ceph_sparse_ext_map_end(op);
if (rc == -ENOENT)
rc = 0;
if (rc >= 0 && len > rc) {
@@ -1283,6 +1400,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
loff_t pos = iocb->ki_pos;
bool write = iov_iter_rw(iter) == WRITE;
bool should_dirty = !write && user_backed_iter(iter);
+ bool sparse = ceph_test_mount_opt(fsc, SPARSEREAD);
if (write && ceph_snap(file_inode(file)) != CEPH_NOSNAP)
return -EROFS;
@@ -1310,6 +1428,8 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
while (iov_iter_count(iter) > 0) {
u64 size = iov_iter_count(iter);
ssize_t len;
+ struct ceph_osd_req_op *op;
+ int readop = sparse ? CEPH_OSD_OP_SPARSE_READ : CEPH_OSD_OP_READ;
if (write)
size = min_t(u64, size, fsc->mount_options->wsize);
@@ -1320,8 +1440,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
vino, pos, &size, 0,
1,
- write ? CEPH_OSD_OP_WRITE :
- CEPH_OSD_OP_READ,
+ write ? CEPH_OSD_OP_WRITE : readop,
flags, snapc,
ci->i_truncate_seq,
ci->i_truncate_size,
@@ -1372,6 +1491,14 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
}
osd_req_op_extent_osd_data_bvecs(req, 0, bvecs, num_pages, len);
+ op = &req->r_ops[0];
+ if (sparse) {
+ ret = ceph_alloc_sparse_ext_map(op);
+ if (ret) {
+ ceph_osdc_put_request(req);
+ break;
+ }
+ }
if (aio_req) {
aio_req->total_len += len;
@@ -1399,8 +1526,11 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
size = i_size_read(inode);
if (!write) {
- if (ret == -ENOENT)
+ if (sparse && ret >= 0)
+ ret = ceph_sparse_ext_map_end(op);
+ else if (ret == -ENOENT)
ret = 0;
+
if (ret >= 0 && ret < len && pos + ret < size) {
struct iov_iter i;
int zlen = min_t(size_t, len - ret,
@@ -1481,13 +1611,12 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
- struct ceph_vino vino;
+ struct ceph_osd_client *osdc = &fsc->client->osdc;
struct ceph_osd_request *req;
struct page **pages;
u64 len;
int num_pages;
int written = 0;
- int flags;
int ret;
bool check_caps = false;
struct timespec64 mtime = current_time(inode);
@@ -1505,79 +1634,350 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
return ret;
ceph_fscache_invalidate(inode, false);
- ret = invalidate_inode_pages2_range(inode->i_mapping,
- pos >> PAGE_SHIFT,
- (pos + count - 1) >> PAGE_SHIFT);
- if (ret < 0)
- dout("invalidate_inode_pages2_range returned %d\n", ret);
-
- flags = /* CEPH_OSD_FLAG_ORDERSNAP | */ CEPH_OSD_FLAG_WRITE;
while ((len = iov_iter_count(from)) > 0) {
size_t left;
int n;
-
- vino = ceph_vino(inode);
- req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
- vino, pos, &len, 0, 1,
- CEPH_OSD_OP_WRITE, flags, snapc,
- ci->i_truncate_seq,
- ci->i_truncate_size,
- false);
- if (IS_ERR(req)) {
- ret = PTR_ERR(req);
- break;
- }
+ u64 write_pos = pos;
+ u64 write_len = len;
+ u64 objnum, objoff;
+ u32 xlen;
+ u64 assert_ver = 0;
+ bool rmw;
+ bool first, last;
+ struct iov_iter saved_iter = *from;
+ size_t off;
+
+ ceph_fscrypt_adjust_off_and_len(inode, &write_pos, &write_len);
+
+ /* clamp the length to the end of first object */
+ ceph_calc_file_object_mapping(&ci->i_layout, write_pos,
+ write_len, &objnum, &objoff,
+ &xlen);
+ write_len = xlen;
+
+ /* adjust len downward if it goes beyond current object */
+ if (pos + len > write_pos + write_len)
+ len = write_pos + write_len - pos;
/*
- * write from beginning of first page,
- * regardless of io alignment
+ * If we had to adjust the length or position to align with a
+ * crypto block, then we must do a read/modify/write cycle. We
+ * use a version assertion to redrive the thing if something
+ * changes in between.
*/
- num_pages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ first = pos != write_pos;
+ last = (pos + len) != (write_pos + write_len);
+ rmw = first || last;
+
+ dout("sync_write ino %llx %lld~%llu adjusted %lld~%llu -- %srmw\n",
+ ci->i_vino.ino, pos, len, write_pos, write_len,
+ rmw ? "" : "no ");
+ /*
+ * The data is emplaced into the page as it would be if it were
+ * in an array of pagecache pages.
+ */
+ num_pages = calc_pages_for(write_pos, write_len);
pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
if (IS_ERR(pages)) {
ret = PTR_ERR(pages);
- goto out;
+ break;
+ }
+
+ /* Do we need to preload the pages? */
+ if (rmw) {
+ u64 first_pos = write_pos;
+ u64 last_pos = (write_pos + write_len) - CEPH_FSCRYPT_BLOCK_SIZE;
+ u64 read_len = CEPH_FSCRYPT_BLOCK_SIZE;
+ struct ceph_osd_req_op *op;
+
+ /* We should only need to do this for encrypted inodes */
+ WARN_ON_ONCE(!IS_ENCRYPTED(inode));
+
+ /* No need to do two reads if first and last blocks are same */
+ if (first && last_pos == first_pos)
+ last = false;
+
+ /*
+ * Allocate a read request for one or two extents,
+ * depending on how the request was aligned.
+ */
+ req = ceph_osdc_new_request(osdc, &ci->i_layout,
+ ci->i_vino, first ? first_pos : last_pos,
+ &read_len, 0, (first && last) ? 2 : 1,
+ CEPH_OSD_OP_SPARSE_READ, CEPH_OSD_FLAG_READ,
+ NULL, ci->i_truncate_seq,
+ ci->i_truncate_size, false);
+ if (IS_ERR(req)) {
+ ceph_release_page_vector(pages, num_pages);
+ ret = PTR_ERR(req);
+ break;
+ }
+
+ /* Something is misaligned! */
+ if (read_len != CEPH_FSCRYPT_BLOCK_SIZE) {
+ ceph_osdc_put_request(req);
+ ceph_release_page_vector(pages, num_pages);
+ ret = -EIO;
+ break;
+ }
+
+ /* Add extent for first block? */
+ op = &req->r_ops[0];
+
+ if (first) {
+ osd_req_op_extent_osd_data_pages(req, 0, pages,
+ CEPH_FSCRYPT_BLOCK_SIZE,
+ offset_in_page(first_pos),
+ false, false);
+ /* We only expect a single extent here */
+ ret = __ceph_alloc_sparse_ext_map(op, 1);
+ if (ret) {
+ ceph_osdc_put_request(req);
+ ceph_release_page_vector(pages, num_pages);
+ break;
+ }
+ }
+
+ /* Add extent for last block */
+ if (last) {
+ /* Init the other extent if first extent has been used */
+ if (first) {
+ op = &req->r_ops[1];
+ osd_req_op_extent_init(req, 1,
+ CEPH_OSD_OP_SPARSE_READ,
+ last_pos, CEPH_FSCRYPT_BLOCK_SIZE,
+ ci->i_truncate_size,
+ ci->i_truncate_seq);
+ }
+
+ ret = __ceph_alloc_sparse_ext_map(op, 1);
+ if (ret) {
+ ceph_osdc_put_request(req);
+ ceph_release_page_vector(pages, num_pages);
+ break;
+ }
+
+ osd_req_op_extent_osd_data_pages(req, first ? 1 : 0,
+ &pages[num_pages - 1],
+ CEPH_FSCRYPT_BLOCK_SIZE,
+ offset_in_page(last_pos),
+ false, false);
+ }
+
+ ceph_osdc_start_request(osdc, req);
+ ret = ceph_osdc_wait_request(osdc, req);
+
+ /* FIXME: length field is wrong if there are 2 extents */
+ ceph_update_read_metrics(&fsc->mdsc->metric,
+ req->r_start_latency,
+ req->r_end_latency,
+ read_len, ret);
+
+ /* Ok if object is not already present */
+ if (ret == -ENOENT) {
+ /*
+ * If there is no object, then we can't assert
+ * on its version. Set it to 0, and we'll use an
+ * exclusive create instead.
+ */
+ ceph_osdc_put_request(req);
+ ret = 0;
+
+ /*
+ * zero out the soon-to-be uncopied parts of the
+ * first and last pages.
+ */
+ if (first)
+ zero_user_segment(pages[0], 0,
+ offset_in_page(first_pos));
+ if (last)
+ zero_user_segment(pages[num_pages - 1],
+ offset_in_page(last_pos),
+ PAGE_SIZE);
+ } else {
+ if (ret < 0) {
+ ceph_osdc_put_request(req);
+ ceph_release_page_vector(pages, num_pages);
+ break;
+ }
+
+ op = &req->r_ops[0];
+ if (op->extent.sparse_ext_cnt == 0) {
+ if (first)
+ zero_user_segment(pages[0], 0,
+ offset_in_page(first_pos));
+ else
+ zero_user_segment(pages[num_pages - 1],
+ offset_in_page(last_pos),
+ PAGE_SIZE);
+ } else if (op->extent.sparse_ext_cnt != 1 ||
+ ceph_sparse_ext_map_end(op) !=
+ CEPH_FSCRYPT_BLOCK_SIZE) {
+ ret = -EIO;
+ ceph_osdc_put_request(req);
+ ceph_release_page_vector(pages, num_pages);
+ break;
+ }
+
+ if (first && last) {
+ op = &req->r_ops[1];
+ if (op->extent.sparse_ext_cnt == 0) {
+ zero_user_segment(pages[num_pages - 1],
+ offset_in_page(last_pos),
+ PAGE_SIZE);
+ } else if (op->extent.sparse_ext_cnt != 1 ||
+ ceph_sparse_ext_map_end(op) !=
+ CEPH_FSCRYPT_BLOCK_SIZE) {
+ ret = -EIO;
+ ceph_osdc_put_request(req);
+ ceph_release_page_vector(pages, num_pages);
+ break;
+ }
+ }
+
+ /* Grab assert version. It must be non-zero. */
+ assert_ver = req->r_version;
+ WARN_ON_ONCE(ret > 0 && assert_ver == 0);
+
+ ceph_osdc_put_request(req);
+ if (first) {
+ ret = ceph_fscrypt_decrypt_block_inplace(inode,
+ pages[0], CEPH_FSCRYPT_BLOCK_SIZE,
+ offset_in_page(first_pos),
+ first_pos >> CEPH_FSCRYPT_BLOCK_SHIFT);
+ if (ret < 0) {
+ ceph_release_page_vector(pages, num_pages);
+ break;
+ }
+ }
+ if (last) {
+ ret = ceph_fscrypt_decrypt_block_inplace(inode,
+ pages[num_pages - 1],
+ CEPH_FSCRYPT_BLOCK_SIZE,
+ offset_in_page(last_pos),
+ last_pos >> CEPH_FSCRYPT_BLOCK_SHIFT);
+ if (ret < 0) {
+ ceph_release_page_vector(pages, num_pages);
+ break;
+ }
+ }
+ }
}
left = len;
+ off = offset_in_page(pos);
for (n = 0; n < num_pages; n++) {
- size_t plen = min_t(size_t, left, PAGE_SIZE);
- ret = copy_page_from_iter(pages[n], 0, plen, from);
+ size_t plen = min_t(size_t, left, PAGE_SIZE - off);
+
+ /* copy the data */
+ ret = copy_page_from_iter(pages[n], off, plen, from);
if (ret != plen) {
ret = -EFAULT;
break;
}
+ off = 0;
left -= ret;
}
-
if (ret < 0) {
+ dout("sync_write write failed with %d\n", ret);
ceph_release_page_vector(pages, num_pages);
- goto out;
+ break;
}
- req->r_inode = inode;
+ if (IS_ENCRYPTED(inode)) {
+ ret = ceph_fscrypt_encrypt_pages(inode, pages,
+ write_pos, write_len,
+ GFP_KERNEL);
+ if (ret < 0) {
+ dout("encryption failed with %d\n", ret);
+ ceph_release_page_vector(pages, num_pages);
+ break;
+ }
+ }
- osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0,
- false, true);
+ req = ceph_osdc_new_request(osdc, &ci->i_layout,
+ ci->i_vino, write_pos, &write_len,
+ rmw ? 1 : 0, rmw ? 2 : 1,
+ CEPH_OSD_OP_WRITE,
+ CEPH_OSD_FLAG_WRITE,
+ snapc, ci->i_truncate_seq,
+ ci->i_truncate_size, false);
+ if (IS_ERR(req)) {
+ ret = PTR_ERR(req);
+ ceph_release_page_vector(pages, num_pages);
+ break;
+ }
+ dout("sync_write write op %lld~%llu\n", write_pos, write_len);
+ osd_req_op_extent_osd_data_pages(req, rmw ? 1 : 0, pages, write_len,
+ offset_in_page(write_pos), false,
+ true);
+ req->r_inode = inode;
req->r_mtime = mtime;
- ceph_osdc_start_request(&fsc->client->osdc, req);
- ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
+
+ /* Set up the assertion */
+ if (rmw) {
+ /*
+ * Set up the assertion. If we don't have a version
+ * number, then the object doesn't exist yet. Use an
+ * exclusive create instead of a version assertion in
+ * that case.
+ */
+ if (assert_ver) {
+ osd_req_op_init(req, 0, CEPH_OSD_OP_ASSERT_VER, 0);
+ req->r_ops[0].assert_ver.ver = assert_ver;
+ } else {
+ osd_req_op_init(req, 0, CEPH_OSD_OP_CREATE,
+ CEPH_OSD_OP_FLAG_EXCL);
+ }
+ }
+
+ ceph_osdc_start_request(osdc, req);
+ ret = ceph_osdc_wait_request(osdc, req);
ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency,
req->r_end_latency, len, ret);
-out:
ceph_osdc_put_request(req);
if (ret != 0) {
+ dout("sync_write osd write returned %d\n", ret);
+ /* Version changed! Must re-do the rmw cycle */
+ if ((assert_ver && (ret == -ERANGE || ret == -EOVERFLOW)) ||
+ (!assert_ver && ret == -EEXIST)) {
+ /* We should only ever see this on a rmw */
+ WARN_ON_ONCE(!rmw);
+
+ /* The version should never go backward */
+ WARN_ON_ONCE(ret == -EOVERFLOW);
+
+ *from = saved_iter;
+
+ /* FIXME: limit number of times we loop? */
+ continue;
+ }
ceph_set_error_write(ci);
break;
}
ceph_clear_error_write(ci);
+
+ /*
+ * We successfully wrote to a range of the file. Declare
+ * that region of the pagecache invalid.
+ */
+ ret = invalidate_inode_pages2_range(
+ inode->i_mapping,
+ pos >> PAGE_SHIFT,
+ (pos + len - 1) >> PAGE_SHIFT);
+ if (ret < 0) {
+ dout("invalidate_inode_pages2_range returned %d\n",
+ ret);
+ ret = 0;
+ }
pos += len;
written += len;
+ dout("sync_write written %d\n", written);
if (pos > i_size_read(inode)) {
check_caps = ceph_inode_set_size(inode, pos);
if (check_caps)
@@ -1591,6 +1991,7 @@ out:
ret = written;
iocb->ki_pos = pos;
}
+ dout("sync_write returning %d\n", ret);
return ret;
}
@@ -1648,7 +2049,9 @@ again:
ceph_cap_string(got));
if (!ceph_has_inline_data(ci)) {
- if (!retry_op && (iocb->ki_flags & IOCB_DIRECT)) {
+ if (!retry_op &&
+ (iocb->ki_flags & IOCB_DIRECT) &&
+ !IS_ENCRYPTED(inode)) {
ret = ceph_direct_read_write(iocb, to,
NULL, NULL);
if (ret >= 0 && ret < len)
@@ -1934,7 +2337,7 @@ retry_snap:
/* we might need to revert back to that point */
data = *from;
- if (iocb->ki_flags & IOCB_DIRECT)
+ if ((iocb->ki_flags & IOCB_DIRECT) && !IS_ENCRYPTED(inode))
written = ceph_direct_read_write(iocb, &data, snapc,
&prealloc_cf);
else
@@ -2165,6 +2568,9 @@ static long ceph_fallocate(struct file *file, int mode,
if (!S_ISREG(inode->i_mode))
return -EOPNOTSUPP;
+ if (IS_ENCRYPTED(inode))
+ return -EOPNOTSUPP;
+
prealloc_cf = ceph_alloc_cap_flush();
if (!prealloc_cf)
return -ENOMEM;
@@ -2486,6 +2892,10 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
return -EOPNOTSUPP;
}
+ /* Every encrypted inode gets its own key, so we can't offload them */
+ if (IS_ENCRYPTED(src_inode) || IS_ENCRYPTED(dst_inode))
+ return -EOPNOTSUPP;
+
if (len < src_ci->i_layout.object_size)
return -EOPNOTSUPP; /* no remote copy will be done */
@@ -2559,7 +2969,7 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
ret = do_splice_direct(src_file, &src_off, dst_file,
&dst_off, src_objlen, flags);
/* Abort on short copies or on error */
- if (ret < src_objlen) {
+ if (ret < (long)src_objlen) {
dout("Failed partial copy (%zd)\n", ret);
goto out;
}
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 8e5f41d45283..b79100f720b3 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -14,10 +14,12 @@
#include <linux/random.h>
#include <linux/sort.h>
#include <linux/iversion.h>
+#include <linux/fscrypt.h>
#include "super.h"
#include "mds_client.h"
#include "cache.h"
+#include "crypto.h"
#include <linux/ceph/decode.h>
/*
@@ -33,6 +35,7 @@
*/
static const struct inode_operations ceph_symlink_iops;
+static const struct inode_operations ceph_encrypted_symlink_iops;
static void ceph_inode_work(struct work_struct *work);
@@ -52,17 +55,99 @@ static int ceph_set_ino_cb(struct inode *inode, void *data)
return 0;
}
-struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino)
+/**
+ * ceph_new_inode - allocate a new inode in advance of an expected create
+ * @dir: parent directory for new inode
+ * @dentry: dentry that may eventually point to new inode
+ * @mode: mode of new inode
+ * @as_ctx: pointer to inherited security context
+ *
+ * Allocate a new inode in advance of an operation to create a new inode.
+ * This allocates the inode and sets up the acl_sec_ctx with appropriate
+ * info for the new inode.
+ *
+ * Returns a pointer to the new inode or an ERR_PTR.
+ */
+struct inode *ceph_new_inode(struct inode *dir, struct dentry *dentry,
+ umode_t *mode, struct ceph_acl_sec_ctx *as_ctx)
+{
+ int err;
+ struct inode *inode;
+
+ inode = new_inode(dir->i_sb);
+ if (!inode)
+ return ERR_PTR(-ENOMEM);
+
+ if (!S_ISLNK(*mode)) {
+ err = ceph_pre_init_acls(dir, mode, as_ctx);
+ if (err < 0)
+ goto out_err;
+ }
+
+ inode->i_state = 0;
+ inode->i_mode = *mode;
+
+ err = ceph_security_init_secctx(dentry, *mode, as_ctx);
+ if (err < 0)
+ goto out_err;
+
+ /*
+ * We'll skip setting fscrypt context for snapshots, leaving that for
+ * the handle_reply().
+ */
+ if (ceph_snap(dir) != CEPH_SNAPDIR) {
+ err = ceph_fscrypt_prepare_context(dir, inode, as_ctx);
+ if (err)
+ goto out_err;
+ }
+
+ return inode;
+out_err:
+ iput(inode);
+ return ERR_PTR(err);
+}
+
+void ceph_as_ctx_to_req(struct ceph_mds_request *req,
+ struct ceph_acl_sec_ctx *as_ctx)
+{
+ if (as_ctx->pagelist) {
+ req->r_pagelist = as_ctx->pagelist;
+ as_ctx->pagelist = NULL;
+ }
+ ceph_fscrypt_as_ctx_to_req(req, as_ctx);
+}
+
+/**
+ * ceph_get_inode - find or create/hash a new inode
+ * @sb: superblock to search and allocate in
+ * @vino: vino to search for
+ * @newino: optional new inode to insert if one isn't found (may be NULL)
+ *
+ * Search for or insert a new inode into the hash for the given vino, and
+ * return a reference to it. If new is non-NULL, its reference is consumed.
+ */
+struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino,
+ struct inode *newino)
{
struct inode *inode;
if (ceph_vino_is_reserved(vino))
return ERR_PTR(-EREMOTEIO);
- inode = iget5_locked(sb, (unsigned long)vino.ino, ceph_ino_compare,
- ceph_set_ino_cb, &vino);
- if (!inode)
+ if (newino) {
+ inode = inode_insert5(newino, (unsigned long)vino.ino,
+ ceph_ino_compare, ceph_set_ino_cb, &vino);
+ if (inode != newino)
+ iput(newino);
+ } else {
+ inode = iget5_locked(sb, (unsigned long)vino.ino,
+ ceph_ino_compare, ceph_set_ino_cb, &vino);
+ }
+
+ if (!inode) {
+ dout("No inode found for %llx.%llx\n", vino.ino, vino.snap);
return ERR_PTR(-ENOMEM);
+ }
dout("get_inode on %llu=%llx.%llx got %p new %d\n", ceph_present_inode(inode),
ceph_vinop(inode), inode, !!(inode->i_state & I_NEW));
@@ -78,8 +163,9 @@ struct inode *ceph_get_snapdir(struct inode *parent)
.ino = ceph_ino(parent),
.snap = CEPH_SNAPDIR,
};
- struct inode *inode = ceph_get_inode(parent->i_sb, vino);
+ struct inode *inode = ceph_get_inode(parent->i_sb, vino, NULL);
struct ceph_inode_info *ci = ceph_inode(inode);
+ int ret = -ENOTDIR;
if (IS_ERR(inode))
return inode;
@@ -100,11 +186,29 @@ struct inode *ceph_get_snapdir(struct inode *parent)
inode->i_uid = parent->i_uid;
inode->i_gid = parent->i_gid;
inode->i_mtime = parent->i_mtime;
- inode->i_ctime = parent->i_ctime;
+ inode_set_ctime_to_ts(inode, inode_get_ctime(parent));
inode->i_atime = parent->i_atime;
ci->i_rbytes = 0;
ci->i_btime = ceph_inode(parent)->i_btime;
+#ifdef CONFIG_FS_ENCRYPTION
+ /* if encrypted, just borrow fscrypt_auth from parent */
+ if (IS_ENCRYPTED(parent)) {
+ struct ceph_inode_info *pci = ceph_inode(parent);
+
+ ci->fscrypt_auth = kmemdup(pci->fscrypt_auth,
+ pci->fscrypt_auth_len,
+ GFP_KERNEL);
+ if (ci->fscrypt_auth) {
+ inode->i_flags |= S_ENCRYPTED;
+ ci->fscrypt_auth_len = pci->fscrypt_auth_len;
+ } else {
+ dout("Failed to alloc snapdir fscrypt_auth\n");
+ ret = -ENOMEM;
+ goto err;
+ }
+ }
+#endif
if (inode->i_state & I_NEW) {
inode->i_op = &ceph_snapdir_iops;
inode->i_fop = &ceph_snapdir_fops;
@@ -118,7 +222,7 @@ err:
discard_new_inode(inode);
else
iput(inode);
- return ERR_PTR(-ENOTDIR);
+ return ERR_PTR(ret);
}
const struct inode_operations ceph_file_iops = {
@@ -517,6 +621,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
ci->i_truncate_seq = 0;
ci->i_truncate_size = 0;
ci->i_truncate_pending = 0;
+ ci->i_truncate_pagecache_size = 0;
ci->i_max_size = 0;
ci->i_reported_size = 0;
@@ -547,6 +652,10 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
INIT_WORK(&ci->i_work, ceph_inode_work);
ci->i_work_mask = 0;
memset(&ci->i_btime, '\0', sizeof(ci->i_btime));
+#ifdef CONFIG_FS_ENCRYPTION
+ ci->fscrypt_auth = NULL;
+ ci->fscrypt_auth_len = 0;
+#endif
return &ci->netfs.inode;
}
@@ -555,6 +664,10 @@ void ceph_free_inode(struct inode *inode)
struct ceph_inode_info *ci = ceph_inode(inode);
kfree(ci->i_symlink);
+#ifdef CONFIG_FS_ENCRYPTION
+ kfree(ci->fscrypt_auth);
+#endif
+ fscrypt_free_inode(inode);
kmem_cache_free(ceph_inode_cachep, ci);
}
@@ -575,6 +688,7 @@ void ceph_evict_inode(struct inode *inode)
clear_inode(inode);
ceph_fscache_unregister_inode_cookie(ci);
+ fscrypt_put_encryption_info(inode);
__ceph_remove_caps(ci);
@@ -650,14 +764,12 @@ int ceph_fill_file_size(struct inode *inode, int issued,
ceph_fscache_update(inode);
ci->i_reported_size = size;
if (truncate_seq != ci->i_truncate_seq) {
- dout("truncate_seq %u -> %u\n",
+ dout("%s truncate_seq %u -> %u\n", __func__,
ci->i_truncate_seq, truncate_seq);
ci->i_truncate_seq = truncate_seq;
/* the MDS should have revoked these caps */
- WARN_ON_ONCE(issued & (CEPH_CAP_FILE_EXCL |
- CEPH_CAP_FILE_RD |
- CEPH_CAP_FILE_WR |
+ WARN_ON_ONCE(issued & (CEPH_CAP_FILE_RD |
CEPH_CAP_FILE_LAZYIO));
/*
* If we hold relevant caps, or in the case where we're
@@ -674,11 +786,26 @@ int ceph_fill_file_size(struct inode *inode, int issued,
}
}
}
- if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) >= 0 &&
- ci->i_truncate_size != truncate_size) {
- dout("truncate_size %lld -> %llu\n", ci->i_truncate_size,
- truncate_size);
+
+ /*
+ * It's possible that the new sizes of the two consecutive
+ * size truncations will be in the same fscrypt last block,
+ * and we need to truncate the corresponding page caches
+ * anyway.
+ */
+ if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) >= 0) {
+ dout("%s truncate_size %lld -> %llu, encrypted %d\n", __func__,
+ ci->i_truncate_size, truncate_size, !!IS_ENCRYPTED(inode));
+
ci->i_truncate_size = truncate_size;
+
+ if (IS_ENCRYPTED(inode)) {
+ dout("%s truncate_pagecache_size %lld -> %llu\n",
+ __func__, ci->i_truncate_pagecache_size, size);
+ ci->i_truncate_pagecache_size = size;
+ } else {
+ ci->i_truncate_pagecache_size = truncate_size;
+ }
}
return queue_trunc;
}
@@ -688,6 +815,7 @@ void ceph_fill_file_time(struct inode *inode, int issued,
struct timespec64 *mtime, struct timespec64 *atime)
{
struct ceph_inode_info *ci = ceph_inode(inode);
+ struct timespec64 ictime = inode_get_ctime(inode);
int warn = 0;
if (issued & (CEPH_CAP_FILE_EXCL|
@@ -696,11 +824,11 @@ void ceph_fill_file_time(struct inode *inode, int issued,
CEPH_CAP_AUTH_EXCL|
CEPH_CAP_XATTR_EXCL)) {
if (ci->i_version == 0 ||
- timespec64_compare(ctime, &inode->i_ctime) > 0) {
+ timespec64_compare(ctime, &ictime) > 0) {
dout("ctime %lld.%09ld -> %lld.%09ld inc w/ cap\n",
- inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec,
+ ictime.tv_sec, ictime.tv_nsec,
ctime->tv_sec, ctime->tv_nsec);
- inode->i_ctime = *ctime;
+ inode_set_ctime_to_ts(inode, *ctime);
}
if (ci->i_version == 0 ||
ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) > 0) {
@@ -738,7 +866,7 @@ void ceph_fill_file_time(struct inode *inode, int issued,
} else {
/* we have no write|excl caps; whatever the MDS says is true */
if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) >= 0) {
- inode->i_ctime = *ctime;
+ inode_set_ctime_to_ts(inode, *ctime);
inode->i_mtime = *mtime;
inode->i_atime = *atime;
ci->i_time_warp_seq = time_warp_seq;
@@ -751,6 +879,34 @@ void ceph_fill_file_time(struct inode *inode, int issued,
inode, time_warp_seq, ci->i_time_warp_seq);
}
+#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
+static int decode_encrypted_symlink(const char *encsym, int enclen, u8 **decsym)
+{
+ int declen;
+ u8 *sym;
+
+ sym = kmalloc(enclen + 1, GFP_NOFS);
+ if (!sym)
+ return -ENOMEM;
+
+ declen = ceph_base64_decode(encsym, enclen, sym);
+ if (declen < 0) {
+ pr_err("%s: can't decode symlink (%d). Content: %.*s\n",
+ __func__, declen, enclen, encsym);
+ kfree(sym);
+ return -EIO;
+ }
+ sym[declen + 1] = '\0';
+ *decsym = sym;
+ return declen;
+}
+#else
+static int decode_encrypted_symlink(const char *encsym, int symlen, u8 **decsym)
+{
+ return -EOPNOTSUPP;
+}
+#endif
+
/*
* Populate an inode based on info from mds. May be called on new or
* existing inodes.
@@ -856,15 +1012,20 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
issued |= __ceph_caps_dirty(ci);
new_issued = ~issued & info_caps;
- /* directories have fl_stripe_unit set to zero */
- if (le32_to_cpu(info->layout.fl_stripe_unit))
- inode->i_blkbits =
- fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1;
- else
- inode->i_blkbits = CEPH_BLOCK_SHIFT;
-
__ceph_update_quota(ci, iinfo->max_bytes, iinfo->max_files);
+#ifdef CONFIG_FS_ENCRYPTION
+ if (iinfo->fscrypt_auth_len &&
+ ((inode->i_state & I_NEW) || (ci->fscrypt_auth_len == 0))) {
+ kfree(ci->fscrypt_auth);
+ ci->fscrypt_auth_len = iinfo->fscrypt_auth_len;
+ ci->fscrypt_auth = iinfo->fscrypt_auth;
+ iinfo->fscrypt_auth = NULL;
+ iinfo->fscrypt_auth_len = 0;
+ inode_set_flags(inode, S_ENCRYPTED, S_ENCRYPTED);
+ }
+#endif
+
if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) &&
(issued & CEPH_CAP_AUTH_EXCL) == 0) {
inode->i_mode = mode;
@@ -877,6 +1038,15 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
ceph_decode_timespec64(&ci->i_snap_btime, &iinfo->snap_btime);
}
+ /* directories have fl_stripe_unit set to zero */
+ if (IS_ENCRYPTED(inode))
+ inode->i_blkbits = CEPH_FSCRYPT_BLOCK_SHIFT;
+ else if (le32_to_cpu(info->layout.fl_stripe_unit))
+ inode->i_blkbits =
+ fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1;
+ else
+ inode->i_blkbits = CEPH_BLOCK_SHIFT;
+
if ((new_version || (new_issued & CEPH_CAP_LINK_SHARED)) &&
(issued & CEPH_CAP_LINK_EXCL) == 0)
set_nlink(inode, le32_to_cpu(info->nlink));
@@ -898,6 +1068,7 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
if (new_version ||
(new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
+ u64 size = le64_to_cpu(info->size);
s64 old_pool = ci->i_layout.pool_id;
struct ceph_string *old_ns;
@@ -911,10 +1082,22 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
pool_ns = old_ns;
+ if (IS_ENCRYPTED(inode) && size &&
+ iinfo->fscrypt_file_len == sizeof(__le64)) {
+ u64 fsize = __le64_to_cpu(*(__le64 *)iinfo->fscrypt_file);
+
+ if (size == round_up(fsize, CEPH_FSCRYPT_BLOCK_SIZE)) {
+ size = fsize;
+ } else {
+ pr_warn("fscrypt size mismatch: size=%llu fscrypt_file=%llu, discarding fscrypt_file size.\n",
+ info->size, size);
+ }
+ }
+
queue_trunc = ceph_fill_file_size(inode, issued,
le32_to_cpu(info->truncate_seq),
le64_to_cpu(info->truncate_size),
- le64_to_cpu(info->size));
+ size);
/* only update max_size on auth cap */
if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) &&
ci->i_max_size != le64_to_cpu(info->max_size)) {
@@ -974,26 +1157,42 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
inode->i_fop = &ceph_file_fops;
break;
case S_IFLNK:
- inode->i_op = &ceph_symlink_iops;
if (!ci->i_symlink) {
u32 symlen = iinfo->symlink_len;
char *sym;
spin_unlock(&ci->i_ceph_lock);
- if (symlen != i_size_read(inode)) {
- pr_err("%s %llx.%llx BAD symlink "
- "size %lld\n", __func__,
- ceph_vinop(inode),
- i_size_read(inode));
+ if (IS_ENCRYPTED(inode)) {
+ if (symlen != i_size_read(inode))
+ pr_err("%s %llx.%llx BAD symlink size %lld\n",
+ __func__, ceph_vinop(inode),
+ i_size_read(inode));
+
+ err = decode_encrypted_symlink(iinfo->symlink,
+ symlen, (u8 **)&sym);
+ if (err < 0) {
+ pr_err("%s decoding encrypted symlink failed: %d\n",
+ __func__, err);
+ goto out;
+ }
+ symlen = err;
i_size_write(inode, symlen);
inode->i_blocks = calc_inode_blocks(symlen);
- }
+ } else {
+ if (symlen != i_size_read(inode)) {
+ pr_err("%s %llx.%llx BAD symlink size %lld\n",
+ __func__, ceph_vinop(inode),
+ i_size_read(inode));
+ i_size_write(inode, symlen);
+ inode->i_blocks = calc_inode_blocks(symlen);
+ }
- err = -ENOMEM;
- sym = kstrndup(iinfo->symlink, symlen, GFP_NOFS);
- if (!sym)
- goto out;
+ err = -ENOMEM;
+ sym = kstrndup(iinfo->symlink, symlen, GFP_NOFS);
+ if (!sym)
+ goto out;
+ }
spin_lock(&ci->i_ceph_lock);
if (!ci->i_symlink)
@@ -1001,7 +1200,17 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page,
else
kfree(sym); /* lost a race */
}
- inode->i_link = ci->i_symlink;
+
+ if (IS_ENCRYPTED(inode)) {
+ /*
+ * Encrypted symlinks need to be decrypted before we can
+ * cache their targets in i_link. Don't touch it here.
+ */
+ inode->i_op = &ceph_encrypted_symlink_iops;
+ } else {
+ inode->i_link = ci->i_symlink;
+ inode->i_op = &ceph_symlink_iops;
+ }
break;
case S_IFDIR:
inode->i_op = &ceph_dir_iops;
@@ -1309,8 +1518,15 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
if (dir && req->r_op == CEPH_MDS_OP_LOOKUPNAME &&
test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) &&
!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
+ bool is_nokey = false;
struct qstr dname;
struct dentry *dn, *parent;
+ struct fscrypt_str oname = FSTR_INIT(NULL, 0);
+ struct ceph_fname fname = { .dir = dir,
+ .name = rinfo->dname,
+ .ctext = rinfo->altname,
+ .name_len = rinfo->dname_len,
+ .ctext_len = rinfo->altname_len };
BUG_ON(!rinfo->head->is_target);
BUG_ON(req->r_dentry);
@@ -1318,8 +1534,20 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
parent = d_find_any_alias(dir);
BUG_ON(!parent);
- dname.name = rinfo->dname;
- dname.len = rinfo->dname_len;
+ err = ceph_fname_alloc_buffer(dir, &oname);
+ if (err < 0) {
+ dput(parent);
+ goto done;
+ }
+
+ err = ceph_fname_to_usr(&fname, NULL, &oname, &is_nokey);
+ if (err < 0) {
+ dput(parent);
+ ceph_fname_free_buffer(dir, &oname);
+ goto done;
+ }
+ dname.name = oname.name;
+ dname.len = oname.len;
dname.hash = full_name_hash(parent, dname.name, dname.len);
tvino.ino = le64_to_cpu(rinfo->targeti.in->ino);
tvino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
@@ -1334,9 +1562,15 @@ retry_lookup:
dname.len, dname.name, dn);
if (!dn) {
dput(parent);
+ ceph_fname_free_buffer(dir, &oname);
err = -ENOMEM;
goto done;
}
+ if (is_nokey) {
+ spin_lock(&dn->d_lock);
+ dn->d_flags |= DCACHE_NOKEY_NAME;
+ spin_unlock(&dn->d_lock);
+ }
err = 0;
} else if (d_really_is_positive(dn) &&
(ceph_ino(d_inode(dn)) != tvino.ino ||
@@ -1348,6 +1582,7 @@ retry_lookup:
dput(dn);
goto retry_lookup;
}
+ ceph_fname_free_buffer(dir, &oname);
req->r_dentry = dn;
dput(parent);
@@ -1551,7 +1786,7 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req,
vino.ino = le64_to_cpu(rde->inode.in->ino);
vino.snap = le64_to_cpu(rde->inode.in->snapid);
- in = ceph_get_inode(req->r_dentry->d_sb, vino);
+ in = ceph_get_inode(req->r_dentry->d_sb, vino, NULL);
if (IS_ERR(in)) {
err = PTR_ERR(in);
dout("new_inode badness got %d\n", err);
@@ -1629,7 +1864,8 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
struct ceph_mds_session *session)
{
struct dentry *parent = req->r_dentry;
- struct ceph_inode_info *ci = ceph_inode(d_inode(parent));
+ struct inode *inode = d_inode(parent);
+ struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
struct qstr dname;
struct dentry *dn;
@@ -1703,9 +1939,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
tvino.snap = le64_to_cpu(rde->inode.in->snapid);
if (rinfo->hash_order) {
- u32 hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash,
- rde->name, rde->name_len);
- hash = ceph_frag_value(hash);
+ u32 hash = ceph_frag_value(rde->raw_hash);
if (hash != last_hash)
fpos_offset = 2;
last_hash = hash;
@@ -1728,6 +1962,11 @@ retry_lookup:
err = -ENOMEM;
goto out;
}
+ if (rde->is_nokey) {
+ spin_lock(&dn->d_lock);
+ dn->d_flags |= DCACHE_NOKEY_NAME;
+ spin_unlock(&dn->d_lock);
+ }
} else if (d_really_is_positive(dn) &&
(ceph_ino(d_inode(dn)) != tvino.ino ||
ceph_snap(d_inode(dn)) != tvino.snap)) {
@@ -1753,7 +1992,7 @@ retry_lookup:
if (d_really_is_positive(dn)) {
in = d_inode(dn);
} else {
- in = ceph_get_inode(parent->d_sb, tvino);
+ in = ceph_get_inode(parent->d_sb, tvino, NULL);
if (IS_ERR(in)) {
dout("new_inode badness\n");
d_drop(dn);
@@ -1926,7 +2165,7 @@ void __ceph_do_pending_vmtruncate(struct inode *inode)
retry:
spin_lock(&ci->i_ceph_lock);
if (ci->i_truncate_pending == 0) {
- dout("__do_pending_vmtruncate %p none pending\n", inode);
+ dout("%s %p none pending\n", __func__, inode);
spin_unlock(&ci->i_ceph_lock);
mutex_unlock(&ci->i_truncate_mutex);
return;
@@ -1938,8 +2177,7 @@ retry:
*/
if (ci->i_wrbuffer_ref_head < ci->i_wrbuffer_ref) {
spin_unlock(&ci->i_ceph_lock);
- dout("__do_pending_vmtruncate %p flushing snaps first\n",
- inode);
+ dout("%s %p flushing snaps first\n", __func__, inode);
filemap_write_and_wait_range(&inode->i_data, 0,
inode->i_sb->s_maxbytes);
goto retry;
@@ -1948,9 +2186,9 @@ retry:
/* there should be no reader or writer */
WARN_ON_ONCE(ci->i_rd_ref || ci->i_wr_ref);
- to = ci->i_truncate_size;
+ to = ci->i_truncate_pagecache_size;
wrbuffer_refs = ci->i_wrbuffer_ref;
- dout("__do_pending_vmtruncate %p (%d) to %lld\n", inode,
+ dout("%s %p (%d) to %lld\n", __func__, inode,
ci->i_truncate_pending, to);
spin_unlock(&ci->i_ceph_lock);
@@ -1958,7 +2196,7 @@ retry:
truncate_pagecache(inode, to);
spin_lock(&ci->i_ceph_lock);
- if (to == ci->i_truncate_size) {
+ if (to == ci->i_truncate_pagecache_size) {
ci->i_truncate_pending = 0;
finish = 1;
}
@@ -1999,6 +2237,32 @@ static void ceph_inode_work(struct work_struct *work)
iput(inode);
}
+static const char *ceph_encrypted_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+
+ if (!dentry)
+ return ERR_PTR(-ECHILD);
+
+ return fscrypt_get_symlink(inode, ci->i_symlink, i_size_read(inode),
+ done);
+}
+
+static int ceph_encrypted_symlink_getattr(struct mnt_idmap *idmap,
+ const struct path *path,
+ struct kstat *stat, u32 request_mask,
+ unsigned int query_flags)
+{
+ int ret;
+
+ ret = ceph_getattr(idmap, path, stat, request_mask, query_flags);
+ if (ret)
+ return ret;
+ return fscrypt_symlink_getattr(path, stat);
+}
+
/*
* symlinks
*/
@@ -2009,20 +2273,173 @@ static const struct inode_operations ceph_symlink_iops = {
.listxattr = ceph_listxattr,
};
-int __ceph_setattr(struct inode *inode, struct iattr *attr)
+static const struct inode_operations ceph_encrypted_symlink_iops = {
+ .get_link = ceph_encrypted_get_link,
+ .setattr = ceph_setattr,
+ .getattr = ceph_encrypted_symlink_getattr,
+ .listxattr = ceph_listxattr,
+};
+
+/*
+ * Transfer the encrypted last block to the MDS and the MDS
+ * will help update it when truncating a smaller size.
+ *
+ * We don't support a PAGE_SIZE that is smaller than the
+ * CEPH_FSCRYPT_BLOCK_SIZE.
+ */
+static int fill_fscrypt_truncate(struct inode *inode,
+ struct ceph_mds_request *req,
+ struct iattr *attr)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ int boff = attr->ia_size % CEPH_FSCRYPT_BLOCK_SIZE;
+ loff_t pos, orig_pos = round_down(attr->ia_size,
+ CEPH_FSCRYPT_BLOCK_SIZE);
+ u64 block = orig_pos >> CEPH_FSCRYPT_BLOCK_SHIFT;
+ struct ceph_pagelist *pagelist = NULL;
+ struct kvec iov = {0};
+ struct iov_iter iter;
+ struct page *page = NULL;
+ struct ceph_fscrypt_truncate_size_header header;
+ int retry_op = 0;
+ int len = CEPH_FSCRYPT_BLOCK_SIZE;
+ loff_t i_size = i_size_read(inode);
+ int got, ret, issued;
+ u64 objver;
+
+ ret = __ceph_get_caps(inode, NULL, CEPH_CAP_FILE_RD, 0, -1, &got);
+ if (ret < 0)
+ return ret;
+
+ issued = __ceph_caps_issued(ci, NULL);
+
+ dout("%s size %lld -> %lld got cap refs on %s, issued %s\n", __func__,
+ i_size, attr->ia_size, ceph_cap_string(got),
+ ceph_cap_string(issued));
+
+ /* Try to writeback the dirty pagecaches */
+ if (issued & (CEPH_CAP_FILE_BUFFER)) {
+ loff_t lend = orig_pos + CEPH_FSCRYPT_BLOCK_SHIFT - 1;
+
+ ret = filemap_write_and_wait_range(inode->i_mapping,
+ orig_pos, lend);
+ if (ret < 0)
+ goto out;
+ }
+
+ page = __page_cache_alloc(GFP_KERNEL);
+ if (page == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ pagelist = ceph_pagelist_alloc(GFP_KERNEL);
+ if (!pagelist) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ iov.iov_base = kmap_local_page(page);
+ iov.iov_len = len;
+ iov_iter_kvec(&iter, READ, &iov, 1, len);
+
+ pos = orig_pos;
+ ret = __ceph_sync_read(inode, &pos, &iter, &retry_op, &objver);
+ if (ret < 0)
+ goto out;
+
+ /* Insert the header first */
+ header.ver = 1;
+ header.compat = 1;
+ header.change_attr = cpu_to_le64(inode_peek_iversion_raw(inode));
+
+ /*
+ * Always set the block_size to CEPH_FSCRYPT_BLOCK_SIZE,
+ * because in MDS it may need this to do the truncate.
+ */
+ header.block_size = cpu_to_le32(CEPH_FSCRYPT_BLOCK_SIZE);
+
+ /*
+ * If we hit a hole here, we should just skip filling
+ * the fscrypt for the request, because once the fscrypt
+ * is enabled, the file will be split into many blocks
+ * with the size of CEPH_FSCRYPT_BLOCK_SIZE, if there
+ * has a hole, the hole size should be multiple of block
+ * size.
+ *
+ * If the Rados object doesn't exist, it will be set to 0.
+ */
+ if (!objver) {
+ dout("%s hit hole, ppos %lld < size %lld\n", __func__,
+ pos, i_size);
+
+ header.data_len = cpu_to_le32(8 + 8 + 4);
+ header.file_offset = 0;
+ ret = 0;
+ } else {
+ header.data_len = cpu_to_le32(8 + 8 + 4 + CEPH_FSCRYPT_BLOCK_SIZE);
+ header.file_offset = cpu_to_le64(orig_pos);
+
+ dout("%s encrypt block boff/bsize %d/%lu\n", __func__,
+ boff, CEPH_FSCRYPT_BLOCK_SIZE);
+
+ /* truncate and zero out the extra contents for the last block */
+ memset(iov.iov_base + boff, 0, PAGE_SIZE - boff);
+
+ /* encrypt the last block */
+ ret = ceph_fscrypt_encrypt_block_inplace(inode, page,
+ CEPH_FSCRYPT_BLOCK_SIZE,
+ 0, block,
+ GFP_KERNEL);
+ if (ret)
+ goto out;
+ }
+
+ /* Insert the header */
+ ret = ceph_pagelist_append(pagelist, &header, sizeof(header));
+ if (ret)
+ goto out;
+
+ if (header.block_size) {
+ /* Append the last block contents to pagelist */
+ ret = ceph_pagelist_append(pagelist, iov.iov_base,
+ CEPH_FSCRYPT_BLOCK_SIZE);
+ if (ret)
+ goto out;
+ }
+ req->r_pagelist = pagelist;
+out:
+ dout("%s %p size dropping cap refs on %s\n", __func__,
+ inode, ceph_cap_string(got));
+ ceph_put_cap_refs(ci, got);
+ if (iov.iov_base)
+ kunmap_local(iov.iov_base);
+ if (page)
+ __free_pages(page, 0);
+ if (ret && pagelist)
+ ceph_pagelist_release(pagelist);
+ return ret;
+}
+
+int __ceph_setattr(struct inode *inode, struct iattr *attr,
+ struct ceph_iattr *cia)
{
struct ceph_inode_info *ci = ceph_inode(inode);
unsigned int ia_valid = attr->ia_valid;
struct ceph_mds_request *req;
struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
struct ceph_cap_flush *prealloc_cf;
+ loff_t isize = i_size_read(inode);
int issued;
int release = 0, dirtied = 0;
int mask = 0;
int err = 0;
int inode_dirty_flags = 0;
bool lock_snap_rwsem = false;
+ bool fill_fscrypt;
+ int truncate_retry = 20; /* The RMW will take around 50ms */
+retry:
prealloc_cf = ceph_alloc_cap_flush();
if (!prealloc_cf)
return -ENOMEM;
@@ -2034,6 +2451,7 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr)
return PTR_ERR(req);
}
+ fill_fscrypt = false;
spin_lock(&ci->i_ceph_lock);
issued = __ceph_caps_issued(ci, NULL);
@@ -2049,6 +2467,43 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr)
}
dout("setattr %p issued %s\n", inode, ceph_cap_string(issued));
+#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
+ if (cia && cia->fscrypt_auth) {
+ u32 len = ceph_fscrypt_auth_len(cia->fscrypt_auth);
+
+ if (len > sizeof(*cia->fscrypt_auth)) {
+ err = -EINVAL;
+ spin_unlock(&ci->i_ceph_lock);
+ goto out;
+ }
+
+ dout("setattr %llx:%llx fscrypt_auth len %u to %u)\n",
+ ceph_vinop(inode), ci->fscrypt_auth_len, len);
+
+ /* It should never be re-set once set */
+ WARN_ON_ONCE(ci->fscrypt_auth);
+
+ if (issued & CEPH_CAP_AUTH_EXCL) {
+ dirtied |= CEPH_CAP_AUTH_EXCL;
+ kfree(ci->fscrypt_auth);
+ ci->fscrypt_auth = (u8 *)cia->fscrypt_auth;
+ ci->fscrypt_auth_len = len;
+ } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
+ ci->fscrypt_auth_len != len ||
+ memcmp(ci->fscrypt_auth, cia->fscrypt_auth, len)) {
+ req->r_fscrypt_auth = cia->fscrypt_auth;
+ mask |= CEPH_SETATTR_FSCRYPT_AUTH;
+ release |= CEPH_CAP_AUTH_SHARED;
+ }
+ cia->fscrypt_auth = NULL;
+ }
+#else
+ if (cia && cia->fscrypt_auth) {
+ err = -EINVAL;
+ spin_unlock(&ci->i_ceph_lock);
+ goto out;
+ }
+#endif /* CONFIG_FS_ENCRYPTION */
if (ia_valid & ATTR_UID) {
dout("setattr %p uid %d -> %d\n", inode,
@@ -2118,10 +2573,27 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr)
}
}
if (ia_valid & ATTR_SIZE) {
- loff_t isize = i_size_read(inode);
-
dout("setattr %p size %lld -> %lld\n", inode, isize, attr->ia_size);
- if ((issued & CEPH_CAP_FILE_EXCL) && attr->ia_size >= isize) {
+ /*
+ * Only when the new size is smaller and not aligned to
+ * CEPH_FSCRYPT_BLOCK_SIZE will the RMW is needed.
+ */
+ if (IS_ENCRYPTED(inode) && attr->ia_size < isize &&
+ (attr->ia_size % CEPH_FSCRYPT_BLOCK_SIZE)) {
+ mask |= CEPH_SETATTR_SIZE;
+ release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL |
+ CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR;
+ set_bit(CEPH_MDS_R_FSCRYPT_FILE, &req->r_req_flags);
+ mask |= CEPH_SETATTR_FSCRYPT_FILE;
+ req->r_args.setattr.size =
+ cpu_to_le64(round_up(attr->ia_size,
+ CEPH_FSCRYPT_BLOCK_SIZE));
+ req->r_args.setattr.old_size =
+ cpu_to_le64(round_up(isize,
+ CEPH_FSCRYPT_BLOCK_SIZE));
+ req->r_fscrypt_file = attr->ia_size;
+ fill_fscrypt = true;
+ } else if ((issued & CEPH_CAP_FILE_EXCL) && attr->ia_size >= isize) {
if (attr->ia_size > isize) {
i_size_write(inode, attr->ia_size);
inode->i_blocks = calc_inode_blocks(attr->ia_size);
@@ -2131,11 +2603,24 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr)
}
} else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
attr->ia_size != isize) {
- req->r_args.setattr.size = cpu_to_le64(attr->ia_size);
- req->r_args.setattr.old_size = cpu_to_le64(isize);
mask |= CEPH_SETATTR_SIZE;
release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL |
CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR;
+ if (IS_ENCRYPTED(inode) && attr->ia_size) {
+ set_bit(CEPH_MDS_R_FSCRYPT_FILE, &req->r_req_flags);
+ mask |= CEPH_SETATTR_FSCRYPT_FILE;
+ req->r_args.setattr.size =
+ cpu_to_le64(round_up(attr->ia_size,
+ CEPH_FSCRYPT_BLOCK_SIZE));
+ req->r_args.setattr.old_size =
+ cpu_to_le64(round_up(isize,
+ CEPH_FSCRYPT_BLOCK_SIZE));
+ req->r_fscrypt_file = attr->ia_size;
+ } else {
+ req->r_args.setattr.size = cpu_to_le64(attr->ia_size);
+ req->r_args.setattr.old_size = cpu_to_le64(isize);
+ req->r_fscrypt_file = 0;
+ }
}
}
if (ia_valid & ATTR_MTIME) {
@@ -2166,7 +2651,8 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr)
bool only = (ia_valid & (ATTR_SIZE|ATTR_MTIME|ATTR_ATIME|
ATTR_MODE|ATTR_UID|ATTR_GID)) == 0;
dout("setattr %p ctime %lld.%ld -> %lld.%ld (%s)\n", inode,
- inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec,
+ inode_get_ctime(inode).tv_sec,
+ inode_get_ctime(inode).tv_nsec,
attr->ia_ctime.tv_sec, attr->ia_ctime.tv_nsec,
only ? "ctime only" : "ignored");
if (only) {
@@ -2191,14 +2677,16 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr)
if (dirtied) {
inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied,
&prealloc_cf);
- inode->i_ctime = attr->ia_ctime;
+ inode_set_ctime_to_ts(inode, attr->ia_ctime);
inode_inc_iversion_raw(inode);
}
release &= issued;
spin_unlock(&ci->i_ceph_lock);
- if (lock_snap_rwsem)
+ if (lock_snap_rwsem) {
up_read(&mdsc->snap_rwsem);
+ lock_snap_rwsem = false;
+ }
if (inode_dirty_flags)
__mark_inode_dirty(inode, inode_dirty_flags);
@@ -2210,8 +2698,29 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr)
req->r_args.setattr.mask = cpu_to_le32(mask);
req->r_num_caps = 1;
req->r_stamp = attr->ia_ctime;
+ if (fill_fscrypt) {
+ err = fill_fscrypt_truncate(inode, req, attr);
+ if (err)
+ goto out;
+ }
+
+ /*
+ * The truncate request will return -EAGAIN when the
+ * last block has been updated just before the MDS
+ * successfully gets the xlock for the FILE lock. To
+ * avoid corrupting the file contents we need to retry
+ * it.
+ */
err = ceph_mdsc_do_request(mdsc, NULL, req);
+ if (err == -EAGAIN && truncate_retry--) {
+ dout("setattr %p result=%d (%s locally, %d remote), retry it!\n",
+ inode, err, ceph_cap_string(dirtied), mask);
+ ceph_mdsc_put_request(req);
+ ceph_free_cap_flush(prealloc_cf);
+ goto retry;
+ }
}
+out:
dout("setattr %p result=%d (%s locally, %d remote)\n", inode, err,
ceph_cap_string(dirtied), mask);
@@ -2240,6 +2749,10 @@ int ceph_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
if (ceph_inode_is_shutdown(inode))
return -ESTALE;
+ err = fscrypt_prepare_setattr(dentry, attr);
+ if (err)
+ return err;
+
err = setattr_prepare(&nop_mnt_idmap, dentry, attr);
if (err != 0)
return err;
@@ -2252,7 +2765,7 @@ int ceph_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
ceph_quota_is_max_bytes_exceeded(inode, attr->ia_size))
return -EDQUOT;
- err = __ceph_setattr(inode, attr);
+ err = __ceph_setattr(inode, attr, NULL);
if (err >= 0 && (attr->ia_valid & ATTR_MODE))
err = posix_acl_chmod(&nop_mnt_idmap, dentry, attr->ia_mode);
@@ -2465,7 +2978,7 @@ int ceph_getattr(struct mnt_idmap *idmap, const struct path *path,
return err;
}
- generic_fillattr(&nop_mnt_idmap, inode, stat);
+ generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
stat->ino = ceph_present_inode(inode);
/*
@@ -2523,8 +3036,12 @@ int ceph_getattr(struct mnt_idmap *idmap, const struct path *path,
stat->nlink = 1 + 1 + ci->i_subdirs;
}
- stat->attributes_mask |= STATX_ATTR_CHANGE_MONOTONIC;
stat->attributes |= STATX_ATTR_CHANGE_MONOTONIC;
+ if (IS_ENCRYPTED(inode))
+ stat->attributes |= STATX_ATTR_ENCRYPTED;
+ stat->attributes_mask |= (STATX_ATTR_CHANGE_MONOTONIC |
+ STATX_ATTR_ENCRYPTED);
+
stat->result_mask = request_mask & valid_mask;
return err;
}
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index deac817647eb..91a84917d203 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -6,6 +6,7 @@
#include "mds_client.h"
#include "ioctl.h"
#include <linux/ceph/striper.h>
+#include <linux/fscrypt.h>
/*
* ioctls
@@ -268,9 +269,96 @@ static long ceph_ioctl_syncio(struct file *file)
return 0;
}
+static int vet_mds_for_fscrypt(struct file *file)
+{
+ int i, ret = -EOPNOTSUPP;
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(file_inode(file)->i_sb);
+
+ mutex_lock(&mdsc->mutex);
+ for (i = 0; i < mdsc->max_sessions; i++) {
+ struct ceph_mds_session *s = mdsc->sessions[i];
+
+ if (!s)
+ continue;
+ if (test_bit(CEPHFS_FEATURE_ALTERNATE_NAME, &s->s_features))
+ ret = 0;
+ break;
+ }
+ mutex_unlock(&mdsc->mutex);
+ return ret;
+}
+
+static long ceph_set_encryption_policy(struct file *file, unsigned long arg)
+{
+ int ret, got = 0;
+ struct inode *inode = file_inode(file);
+ struct ceph_inode_info *ci = ceph_inode(inode);
+
+ /* encrypted directories can't have striped layout */
+ if (ci->i_layout.stripe_count > 1)
+ return -EINVAL;
+
+ ret = vet_mds_for_fscrypt(file);
+ if (ret)
+ return ret;
+
+ /*
+ * Ensure we hold these caps so that we _know_ that the rstats check
+ * in the empty_dir check is reliable.
+ */
+ ret = ceph_get_caps(file, CEPH_CAP_FILE_SHARED, 0, -1, &got);
+ if (ret)
+ return ret;
+
+ ret = fscrypt_ioctl_set_policy(file, (const void __user *)arg);
+ if (got)
+ ceph_put_cap_refs(ci, got);
+
+ return ret;
+}
+
+static const char *ceph_ioctl_cmd_name(const unsigned int cmd)
+{
+ switch (cmd) {
+ case CEPH_IOC_GET_LAYOUT:
+ return "get_layout";
+ case CEPH_IOC_SET_LAYOUT:
+ return "set_layout";
+ case CEPH_IOC_SET_LAYOUT_POLICY:
+ return "set_layout_policy";
+ case CEPH_IOC_GET_DATALOC:
+ return "get_dataloc";
+ case CEPH_IOC_LAZYIO:
+ return "lazyio";
+ case CEPH_IOC_SYNCIO:
+ return "syncio";
+ case FS_IOC_SET_ENCRYPTION_POLICY:
+ return "set_encryption_policy";
+ case FS_IOC_GET_ENCRYPTION_POLICY:
+ return "get_encryption_policy";
+ case FS_IOC_GET_ENCRYPTION_POLICY_EX:
+ return "get_encryption_policy_ex";
+ case FS_IOC_ADD_ENCRYPTION_KEY:
+ return "add_encryption_key";
+ case FS_IOC_REMOVE_ENCRYPTION_KEY:
+ return "remove_encryption_key";
+ case FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS:
+ return "remove_encryption_key_all_users";
+ case FS_IOC_GET_ENCRYPTION_KEY_STATUS:
+ return "get_encryption_key_status";
+ case FS_IOC_GET_ENCRYPTION_NONCE:
+ return "get_encryption_nonce";
+ default:
+ return "unknown";
+ }
+}
+
long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
- dout("ioctl file %p cmd %u arg %lu\n", file, cmd, arg);
+ int ret;
+
+ dout("ioctl file %p cmd %s arg %lu\n", file,
+ ceph_ioctl_cmd_name(cmd), arg);
switch (cmd) {
case CEPH_IOC_GET_LAYOUT:
return ceph_ioctl_get_layout(file, (void __user *)arg);
@@ -289,6 +377,43 @@ long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case CEPH_IOC_SYNCIO:
return ceph_ioctl_syncio(file);
+
+ case FS_IOC_SET_ENCRYPTION_POLICY:
+ return ceph_set_encryption_policy(file, arg);
+
+ case FS_IOC_GET_ENCRYPTION_POLICY:
+ ret = vet_mds_for_fscrypt(file);
+ if (ret)
+ return ret;
+ return fscrypt_ioctl_get_policy(file, (void __user *)arg);
+
+ case FS_IOC_GET_ENCRYPTION_POLICY_EX:
+ ret = vet_mds_for_fscrypt(file);
+ if (ret)
+ return ret;
+ return fscrypt_ioctl_get_policy_ex(file, (void __user *)arg);
+
+ case FS_IOC_ADD_ENCRYPTION_KEY:
+ ret = vet_mds_for_fscrypt(file);
+ if (ret)
+ return ret;
+ return fscrypt_ioctl_add_key(file, (void __user *)arg);
+
+ case FS_IOC_REMOVE_ENCRYPTION_KEY:
+ return fscrypt_ioctl_remove_key(file, (void __user *)arg);
+
+ case FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS:
+ return fscrypt_ioctl_remove_key_all_users(file,
+ (void __user *)arg);
+
+ case FS_IOC_GET_ENCRYPTION_KEY_STATUS:
+ return fscrypt_ioctl_get_key_status(file, (void __user *)arg);
+
+ case FS_IOC_GET_ENCRYPTION_NONCE:
+ ret = vet_mds_for_fscrypt(file);
+ if (ret)
+ return ret;
+ return fscrypt_ioctl_get_nonce(file, (void __user *)arg);
}
return -ENOTTY;
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 5fb367b1d4b0..615db141b6c4 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -15,6 +15,7 @@
#include "super.h"
#include "mds_client.h"
+#include "crypto.h"
#include <linux/ceph/ceph_features.h>
#include <linux/ceph/messenger.h>
@@ -184,8 +185,54 @@ static int parse_reply_info_in(void **p, void *end,
info->rsnaps = 0;
}
+ if (struct_v >= 5) {
+ u32 alen;
+
+ ceph_decode_32_safe(p, end, alen, bad);
+
+ while (alen--) {
+ u32 len;
+
+ /* key */
+ ceph_decode_32_safe(p, end, len, bad);
+ ceph_decode_skip_n(p, end, len, bad);
+ /* value */
+ ceph_decode_32_safe(p, end, len, bad);
+ ceph_decode_skip_n(p, end, len, bad);
+ }
+ }
+
+ /* fscrypt flag -- ignore */
+ if (struct_v >= 6)
+ ceph_decode_skip_8(p, end, bad);
+
+ info->fscrypt_auth = NULL;
+ info->fscrypt_auth_len = 0;
+ info->fscrypt_file = NULL;
+ info->fscrypt_file_len = 0;
+ if (struct_v >= 7) {
+ ceph_decode_32_safe(p, end, info->fscrypt_auth_len, bad);
+ if (info->fscrypt_auth_len) {
+ info->fscrypt_auth = kmalloc(info->fscrypt_auth_len,
+ GFP_KERNEL);
+ if (!info->fscrypt_auth)
+ return -ENOMEM;
+ ceph_decode_copy_safe(p, end, info->fscrypt_auth,
+ info->fscrypt_auth_len, bad);
+ }
+ ceph_decode_32_safe(p, end, info->fscrypt_file_len, bad);
+ if (info->fscrypt_file_len) {
+ info->fscrypt_file = kmalloc(info->fscrypt_file_len,
+ GFP_KERNEL);
+ if (!info->fscrypt_file)
+ return -ENOMEM;
+ ceph_decode_copy_safe(p, end, info->fscrypt_file,
+ info->fscrypt_file_len, bad);
+ }
+ }
*p = end;
} else {
+ /* legacy (unversioned) struct */
if (features & CEPH_FEATURE_MDS_INLINE_DATA) {
ceph_decode_64_safe(p, end, info->inline_version, bad);
ceph_decode_32_safe(p, end, info->inline_len, bad);
@@ -263,27 +310,47 @@ bad:
static int parse_reply_info_lease(void **p, void *end,
struct ceph_mds_reply_lease **lease,
- u64 features)
+ u64 features, u32 *altname_len, u8 **altname)
{
+ u8 struct_v;
+ u32 struct_len;
+ void *lend;
+
if (features == (u64)-1) {
- u8 struct_v, struct_compat;
- u32 struct_len;
+ u8 struct_compat;
+
ceph_decode_8_safe(p, end, struct_v, bad);
ceph_decode_8_safe(p, end, struct_compat, bad);
+
/* struct_v is expected to be >= 1. we only understand
* encoding whose struct_compat == 1. */
if (!struct_v || struct_compat != 1)
goto bad;
+
ceph_decode_32_safe(p, end, struct_len, bad);
- ceph_decode_need(p, end, struct_len, bad);
- end = *p + struct_len;
+ } else {
+ struct_len = sizeof(**lease);
+ *altname_len = 0;
+ *altname = NULL;
}
- ceph_decode_need(p, end, sizeof(**lease), bad);
+ lend = *p + struct_len;
+ ceph_decode_need(p, end, struct_len, bad);
*lease = *p;
*p += sizeof(**lease);
- if (features == (u64)-1)
- *p = end;
+
+ if (features == (u64)-1) {
+ if (struct_v >= 2) {
+ ceph_decode_32_safe(p, end, *altname_len, bad);
+ ceph_decode_need(p, end, *altname_len, bad);
+ *altname = *p;
+ *p += *altname_len;
+ } else {
+ *altname = NULL;
+ *altname_len = 0;
+ }
+ }
+ *p = lend;
return 0;
bad:
return -EIO;
@@ -313,7 +380,8 @@ static int parse_reply_info_trace(void **p, void *end,
info->dname = *p;
*p += info->dname_len;
- err = parse_reply_info_lease(p, end, &info->dlease, features);
+ err = parse_reply_info_lease(p, end, &info->dlease, features,
+ &info->altname_len, &info->altname);
if (err < 0)
goto out_bad;
}
@@ -339,9 +407,10 @@ out_bad:
* parse readdir results
*/
static int parse_reply_info_readdir(void **p, void *end,
- struct ceph_mds_reply_info_parsed *info,
- u64 features)
+ struct ceph_mds_request *req,
+ u64 features)
{
+ struct ceph_mds_reply_info_parsed *info = &req->r_reply_info;
u32 num, i = 0;
int err;
@@ -371,18 +440,87 @@ static int parse_reply_info_readdir(void **p, void *end,
info->dir_nr = num;
while (num) {
+ struct inode *inode = d_inode(req->r_dentry);
+ struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_mds_reply_dir_entry *rde = info->dir_entries + i;
+ struct fscrypt_str tname = FSTR_INIT(NULL, 0);
+ struct fscrypt_str oname = FSTR_INIT(NULL, 0);
+ struct ceph_fname fname;
+ u32 altname_len, _name_len;
+ u8 *altname, *_name;
+
/* dentry */
- ceph_decode_32_safe(p, end, rde->name_len, bad);
- ceph_decode_need(p, end, rde->name_len, bad);
- rde->name = *p;
- *p += rde->name_len;
- dout("parsed dir dname '%.*s'\n", rde->name_len, rde->name);
+ ceph_decode_32_safe(p, end, _name_len, bad);
+ ceph_decode_need(p, end, _name_len, bad);
+ _name = *p;
+ *p += _name_len;
+ dout("parsed dir dname '%.*s'\n", _name_len, _name);
+
+ if (info->hash_order)
+ rde->raw_hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash,
+ _name, _name_len);
/* dentry lease */
- err = parse_reply_info_lease(p, end, &rde->lease, features);
+ err = parse_reply_info_lease(p, end, &rde->lease, features,
+ &altname_len, &altname);
if (err)
goto out_bad;
+
+ /*
+ * Try to dencrypt the dentry names and update them
+ * in the ceph_mds_reply_dir_entry struct.
+ */
+ fname.dir = inode;
+ fname.name = _name;
+ fname.name_len = _name_len;
+ fname.ctext = altname;
+ fname.ctext_len = altname_len;
+ /*
+ * The _name_len maybe larger than altname_len, such as
+ * when the human readable name length is in range of
+ * (CEPH_NOHASH_NAME_MAX, CEPH_NOHASH_NAME_MAX + SHA256_DIGEST_SIZE),
+ * then the copy in ceph_fname_to_usr will corrupt the
+ * data if there has no encryption key.
+ *
+ * Just set the no_copy flag and then if there has no
+ * encryption key the oname.name will be assigned to
+ * _name always.
+ */
+ fname.no_copy = true;
+ if (altname_len == 0) {
+ /*
+ * Set tname to _name, and this will be used
+ * to do the base64_decode in-place. It's
+ * safe because the decoded string should
+ * always be shorter, which is 3/4 of origin
+ * string.
+ */
+ tname.name = _name;
+
+ /*
+ * Set oname to _name too, and this will be
+ * used to do the dencryption in-place.
+ */
+ oname.name = _name;
+ oname.len = _name_len;
+ } else {
+ /*
+ * This will do the decryption only in-place
+ * from altname cryptext directly.
+ */
+ oname.name = altname;
+ oname.len = altname_len;
+ }
+ rde->is_nokey = false;
+ err = ceph_fname_to_usr(&fname, &tname, &oname, &rde->is_nokey);
+ if (err) {
+ pr_err("%s unable to decode %.*s, got %d\n", __func__,
+ _name_len, _name, err);
+ goto out_bad;
+ }
+ rde->name = oname.name;
+ rde->name_len = oname.len;
+
/* inode */
err = parse_reply_info_in(p, end, &rde->inode, features);
if (err < 0)
@@ -581,15 +719,16 @@ bad:
* parse extra results
*/
static int parse_reply_info_extra(void **p, void *end,
- struct ceph_mds_reply_info_parsed *info,
+ struct ceph_mds_request *req,
u64 features, struct ceph_mds_session *s)
{
+ struct ceph_mds_reply_info_parsed *info = &req->r_reply_info;
u32 op = le32_to_cpu(info->head->op);
if (op == CEPH_MDS_OP_GETFILELOCK)
return parse_reply_info_filelock(p, end, info, features);
else if (op == CEPH_MDS_OP_READDIR || op == CEPH_MDS_OP_LSSNAP)
- return parse_reply_info_readdir(p, end, info, features);
+ return parse_reply_info_readdir(p, end, req, features);
else if (op == CEPH_MDS_OP_CREATE)
return parse_reply_info_create(p, end, info, features, s);
else if (op == CEPH_MDS_OP_GETVXATTR)
@@ -602,9 +741,9 @@ static int parse_reply_info_extra(void **p, void *end,
* parse entire mds reply
*/
static int parse_reply_info(struct ceph_mds_session *s, struct ceph_msg *msg,
- struct ceph_mds_reply_info_parsed *info,
- u64 features)
+ struct ceph_mds_request *req, u64 features)
{
+ struct ceph_mds_reply_info_parsed *info = &req->r_reply_info;
void *p, *end;
u32 len;
int err;
@@ -626,7 +765,7 @@ static int parse_reply_info(struct ceph_mds_session *s, struct ceph_msg *msg,
ceph_decode_32_safe(&p, end, len, bad);
if (len > 0) {
ceph_decode_need(&p, end, len, bad);
- err = parse_reply_info_extra(&p, p+len, info, features, s);
+ err = parse_reply_info_extra(&p, p+len, req, features, s);
if (err < 0)
goto out_bad;
}
@@ -651,8 +790,21 @@ out_bad:
static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info)
{
+ int i;
+
+ kfree(info->diri.fscrypt_auth);
+ kfree(info->diri.fscrypt_file);
+ kfree(info->targeti.fscrypt_auth);
+ kfree(info->targeti.fscrypt_file);
if (!info->dir_entries)
return;
+
+ for (i = 0; i < info->dir_nr; i++) {
+ struct ceph_mds_reply_dir_entry *rde = info->dir_entries + i;
+
+ kfree(rde->inode.fscrypt_auth);
+ kfree(rde->inode.fscrypt_file);
+ }
free_pages((unsigned long)info->dir_entries, get_order(info->dir_buf_size));
}
@@ -945,6 +1097,7 @@ void ceph_mdsc_release_request(struct kref *kref)
iput(req->r_parent);
}
iput(req->r_target_inode);
+ iput(req->r_new_inode);
if (req->r_dentry)
dput(req->r_dentry);
if (req->r_old_dentry)
@@ -965,6 +1118,8 @@ void ceph_mdsc_release_request(struct kref *kref)
put_cred(req->r_cred);
if (req->r_pagelist)
ceph_pagelist_release(req->r_pagelist);
+ kfree(req->r_fscrypt_auth);
+ kfree(req->r_altname);
put_request_session(req);
ceph_unreserve_caps(req->r_mdsc, &req->r_caps_reservation);
WARN_ON_ONCE(!list_empty(&req->r_wait));
@@ -2373,20 +2528,90 @@ static inline u64 __get_oldest_tid(struct ceph_mds_client *mdsc)
return mdsc->oldest_tid;
}
-/*
- * Build a dentry's path. Allocate on heap; caller must kfree. Based
- * on build_path_from_dentry in fs/cifs/dir.c.
+#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
+static u8 *get_fscrypt_altname(const struct ceph_mds_request *req, u32 *plen)
+{
+ struct inode *dir = req->r_parent;
+ struct dentry *dentry = req->r_dentry;
+ u8 *cryptbuf = NULL;
+ u32 len = 0;
+ int ret = 0;
+
+ /* only encode if we have parent and dentry */
+ if (!dir || !dentry)
+ goto success;
+
+ /* No-op unless this is encrypted */
+ if (!IS_ENCRYPTED(dir))
+ goto success;
+
+ ret = ceph_fscrypt_prepare_readdir(dir);
+ if (ret < 0)
+ return ERR_PTR(ret);
+
+ /* No key? Just ignore it. */
+ if (!fscrypt_has_encryption_key(dir))
+ goto success;
+
+ if (!fscrypt_fname_encrypted_size(dir, dentry->d_name.len, NAME_MAX,
+ &len)) {
+ WARN_ON_ONCE(1);
+ return ERR_PTR(-ENAMETOOLONG);
+ }
+
+ /* No need to append altname if name is short enough */
+ if (len <= CEPH_NOHASH_NAME_MAX) {
+ len = 0;
+ goto success;
+ }
+
+ cryptbuf = kmalloc(len, GFP_KERNEL);
+ if (!cryptbuf)
+ return ERR_PTR(-ENOMEM);
+
+ ret = fscrypt_fname_encrypt(dir, &dentry->d_name, cryptbuf, len);
+ if (ret) {
+ kfree(cryptbuf);
+ return ERR_PTR(ret);
+ }
+success:
+ *plen = len;
+ return cryptbuf;
+}
+#else
+static u8 *get_fscrypt_altname(const struct ceph_mds_request *req, u32 *plen)
+{
+ *plen = 0;
+ return NULL;
+}
+#endif
+
+/**
+ * ceph_mdsc_build_path - build a path string to a given dentry
+ * @dentry: dentry to which path should be built
+ * @plen: returned length of string
+ * @pbase: returned base inode number
+ * @for_wire: is this path going to be sent to the MDS?
+ *
+ * Build a string that represents the path to the dentry. This is mostly called
+ * for two different purposes:
+ *
+ * 1) we need to build a path string to send to the MDS (for_wire == true)
+ * 2) we need a path string for local presentation (e.g. debugfs)
+ * (for_wire == false)
*
- * If @stop_on_nosnap, generate path relative to the first non-snapped
- * inode.
+ * The path is built in reverse, starting with the dentry. Walk back up toward
+ * the root, building the path until the first non-snapped inode is reached
+ * (for_wire) or the root inode is reached (!for_wire).
*
* Encode hidden .snap dirs as a double /, i.e.
* foo/.snap/bar -> foo//bar
*/
char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *pbase,
- int stop_on_nosnap)
+ int for_wire)
{
- struct dentry *temp;
+ struct dentry *cur;
+ struct inode *inode;
char *path;
int pos;
unsigned seq;
@@ -2403,34 +2628,72 @@ retry:
path[pos] = '\0';
seq = read_seqbegin(&rename_lock);
- rcu_read_lock();
- temp = dentry;
+ cur = dget(dentry);
for (;;) {
- struct inode *inode;
+ struct dentry *parent;
- spin_lock(&temp->d_lock);
- inode = d_inode(temp);
+ spin_lock(&cur->d_lock);
+ inode = d_inode(cur);
if (inode && ceph_snap(inode) == CEPH_SNAPDIR) {
dout("build_path path+%d: %p SNAPDIR\n",
- pos, temp);
- } else if (stop_on_nosnap && inode && dentry != temp &&
+ pos, cur);
+ spin_unlock(&cur->d_lock);
+ parent = dget_parent(cur);
+ } else if (for_wire && inode && dentry != cur &&
ceph_snap(inode) == CEPH_NOSNAP) {
- spin_unlock(&temp->d_lock);
+ spin_unlock(&cur->d_lock);
pos++; /* get rid of any prepended '/' */
break;
+ } else if (!for_wire || !IS_ENCRYPTED(d_inode(cur->d_parent))) {
+ pos -= cur->d_name.len;
+ if (pos < 0) {
+ spin_unlock(&cur->d_lock);
+ break;
+ }
+ memcpy(path + pos, cur->d_name.name, cur->d_name.len);
+ spin_unlock(&cur->d_lock);
+ parent = dget_parent(cur);
} else {
- pos -= temp->d_name.len;
+ int len, ret;
+ char buf[NAME_MAX];
+
+ /*
+ * Proactively copy name into buf, in case we need to
+ * present it as-is.
+ */
+ memcpy(buf, cur->d_name.name, cur->d_name.len);
+ len = cur->d_name.len;
+ spin_unlock(&cur->d_lock);
+ parent = dget_parent(cur);
+
+ ret = ceph_fscrypt_prepare_readdir(d_inode(parent));
+ if (ret < 0) {
+ dput(parent);
+ dput(cur);
+ return ERR_PTR(ret);
+ }
+
+ if (fscrypt_has_encryption_key(d_inode(parent))) {
+ len = ceph_encode_encrypted_fname(d_inode(parent),
+ cur, buf);
+ if (len < 0) {
+ dput(parent);
+ dput(cur);
+ return ERR_PTR(len);
+ }
+ }
+ pos -= len;
if (pos < 0) {
- spin_unlock(&temp->d_lock);
+ dput(parent);
break;
}
- memcpy(path + pos, temp->d_name.name, temp->d_name.len);
+ memcpy(path + pos, buf, len);
}
- spin_unlock(&temp->d_lock);
- temp = READ_ONCE(temp->d_parent);
+ dput(cur);
+ cur = parent;
/* Are we at the root? */
- if (IS_ROOT(temp))
+ if (IS_ROOT(cur))
break;
/* Are we out of buffer? */
@@ -2439,8 +2702,9 @@ retry:
path[pos] = '/';
}
- base = ceph_ino(d_inode(temp));
- rcu_read_unlock();
+ inode = d_inode(cur);
+ base = inode ? ceph_ino(inode) : 0;
+ dput(cur);
if (read_seqretry(&rename_lock, seq))
goto retry;
@@ -2450,8 +2714,8 @@ retry:
* A rename didn't occur, but somehow we didn't end up where
* we thought we would. Throw a warning and try again.
*/
- pr_warn("build_path did not end path lookup where "
- "expected, pos is %d\n", pos);
+ pr_warn("build_path did not end path lookup where expected (pos = %d)\n",
+ pos);
goto retry;
}
@@ -2471,7 +2735,8 @@ static int build_dentry_path(struct dentry *dentry, struct inode *dir,
rcu_read_lock();
if (!dir)
dir = d_inode_rcu(dentry->d_parent);
- if (dir && parent_locked && ceph_snap(dir) == CEPH_NOSNAP) {
+ if (dir && parent_locked && ceph_snap(dir) == CEPH_NOSNAP &&
+ !IS_ENCRYPTED(dir)) {
*pino = ceph_ino(dir);
rcu_read_unlock();
*ppath = dentry->d_name.name;
@@ -2539,8 +2804,8 @@ static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry,
return r;
}
-static void encode_timestamp_and_gids(void **p,
- const struct ceph_mds_request *req)
+static void encode_mclientrequest_tail(void **p,
+ const struct ceph_mds_request *req)
{
struct ceph_timespec ts;
int i;
@@ -2548,11 +2813,43 @@ static void encode_timestamp_and_gids(void **p,
ceph_encode_timespec64(&ts, &req->r_stamp);
ceph_encode_copy(p, &ts, sizeof(ts));
- /* gid_list */
+ /* v4: gid_list */
ceph_encode_32(p, req->r_cred->group_info->ngroups);
for (i = 0; i < req->r_cred->group_info->ngroups; i++)
ceph_encode_64(p, from_kgid(&init_user_ns,
req->r_cred->group_info->gid[i]));
+
+ /* v5: altname */
+ ceph_encode_32(p, req->r_altname_len);
+ ceph_encode_copy(p, req->r_altname, req->r_altname_len);
+
+ /* v6: fscrypt_auth and fscrypt_file */
+ if (req->r_fscrypt_auth) {
+ u32 authlen = ceph_fscrypt_auth_len(req->r_fscrypt_auth);
+
+ ceph_encode_32(p, authlen);
+ ceph_encode_copy(p, req->r_fscrypt_auth, authlen);
+ } else {
+ ceph_encode_32(p, 0);
+ }
+ if (test_bit(CEPH_MDS_R_FSCRYPT_FILE, &req->r_req_flags)) {
+ ceph_encode_32(p, sizeof(__le64));
+ ceph_encode_64(p, req->r_fscrypt_file);
+ } else {
+ ceph_encode_32(p, 0);
+ }
+}
+
+static struct ceph_mds_request_head_legacy *
+find_legacy_request_head(void *p, u64 features)
+{
+ bool legacy = !(features & CEPH_FEATURE_FS_BTIME);
+ struct ceph_mds_request_head_old *ohead;
+
+ if (legacy)
+ return (struct ceph_mds_request_head_legacy *)p;
+ ohead = (struct ceph_mds_request_head_old *)p;
+ return (struct ceph_mds_request_head_legacy *)&ohead->oldest_client_tid;
}
/*
@@ -2565,7 +2862,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
int mds = session->s_mds;
struct ceph_mds_client *mdsc = session->s_mdsc;
struct ceph_msg *msg;
- struct ceph_mds_request_head_old *head;
+ struct ceph_mds_request_head_legacy *lhead;
const char *path1 = NULL;
const char *path2 = NULL;
u64 ino1 = 0, ino2 = 0;
@@ -2577,6 +2874,8 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
void *p, *end;
int ret;
bool legacy = !(session->s_con.peer_features & CEPH_FEATURE_FS_BTIME);
+ bool old_version = !test_bit(CEPHFS_FEATURE_32BITS_RETRY_FWD,
+ &session->s_features);
ret = set_request_path_attr(req->r_inode, req->r_dentry,
req->r_parent, req->r_path1, req->r_ino1.ino,
@@ -2601,12 +2900,32 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
goto out_free1;
}
- len = legacy ? sizeof(*head) : sizeof(struct ceph_mds_request_head);
- len += pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64)) +
- sizeof(struct ceph_timespec);
- len += sizeof(u32) + (sizeof(u64) * req->r_cred->group_info->ngroups);
+ req->r_altname = get_fscrypt_altname(req, &req->r_altname_len);
+ if (IS_ERR(req->r_altname)) {
+ msg = ERR_CAST(req->r_altname);
+ req->r_altname = NULL;
+ goto out_free2;
+ }
+
+ /*
+ * For old cephs without supporting the 32bit retry/fwd feature
+ * it will copy the raw memories directly when decoding the
+ * requests. While new cephs will decode the head depending the
+ * version member, so we need to make sure it will be compatible
+ * with them both.
+ */
+ if (legacy)
+ len = sizeof(struct ceph_mds_request_head_legacy);
+ else if (old_version)
+ len = sizeof(struct ceph_mds_request_head_old);
+ else
+ len = sizeof(struct ceph_mds_request_head);
- /* calculate (max) length for cap releases */
+ /* filepaths */
+ len += 2 * (1 + sizeof(u32) + sizeof(u64));
+ len += pathlen1 + pathlen2;
+
+ /* cap releases */
len += sizeof(struct ceph_mds_request_release) *
(!!req->r_inode_drop + !!req->r_dentry_drop +
!!req->r_old_inode_drop + !!req->r_old_dentry_drop);
@@ -2616,6 +2935,27 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
if (req->r_old_dentry_drop)
len += pathlen2;
+ /* MClientRequest tail */
+
+ /* req->r_stamp */
+ len += sizeof(struct ceph_timespec);
+
+ /* gid list */
+ len += sizeof(u32) + (sizeof(u64) * req->r_cred->group_info->ngroups);
+
+ /* alternate name */
+ len += sizeof(u32) + req->r_altname_len;
+
+ /* fscrypt_auth */
+ len += sizeof(u32); // fscrypt_auth
+ if (req->r_fscrypt_auth)
+ len += ceph_fscrypt_auth_len(req->r_fscrypt_auth);
+
+ /* fscrypt_file */
+ len += sizeof(u32);
+ if (test_bit(CEPH_MDS_R_FSCRYPT_FILE, &req->r_req_flags))
+ len += sizeof(__le64);
+
msg = ceph_msg_new2(CEPH_MSG_CLIENT_REQUEST, len, 1, GFP_NOFS, false);
if (!msg) {
msg = ERR_PTR(-ENOMEM);
@@ -2624,33 +2964,40 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
msg->hdr.tid = cpu_to_le64(req->r_tid);
+ lhead = find_legacy_request_head(msg->front.iov_base,
+ session->s_con.peer_features);
+
/*
- * The old ceph_mds_request_head didn't contain a version field, and
+ * The ceph_mds_request_head_legacy didn't contain a version field, and
* one was added when we moved the message version from 3->4.
*/
if (legacy) {
msg->hdr.version = cpu_to_le16(3);
- head = msg->front.iov_base;
- p = msg->front.iov_base + sizeof(*head);
- } else {
- struct ceph_mds_request_head *new_head = msg->front.iov_base;
+ p = msg->front.iov_base + sizeof(*lhead);
+ } else if (old_version) {
+ struct ceph_mds_request_head_old *ohead = msg->front.iov_base;
msg->hdr.version = cpu_to_le16(4);
- new_head->version = cpu_to_le16(CEPH_MDS_REQUEST_HEAD_VERSION);
- head = (struct ceph_mds_request_head_old *)&new_head->oldest_client_tid;
- p = msg->front.iov_base + sizeof(*new_head);
+ ohead->version = cpu_to_le16(1);
+ p = msg->front.iov_base + sizeof(*ohead);
+ } else {
+ struct ceph_mds_request_head *nhead = msg->front.iov_base;
+
+ msg->hdr.version = cpu_to_le16(6);
+ nhead->version = cpu_to_le16(CEPH_MDS_REQUEST_HEAD_VERSION);
+ p = msg->front.iov_base + sizeof(*nhead);
}
end = msg->front.iov_base + msg->front.iov_len;
- head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch);
- head->op = cpu_to_le32(req->r_op);
- head->caller_uid = cpu_to_le32(from_kuid(&init_user_ns,
- req->r_cred->fsuid));
- head->caller_gid = cpu_to_le32(from_kgid(&init_user_ns,
- req->r_cred->fsgid));
- head->ino = cpu_to_le64(req->r_deleg_ino);
- head->args = req->r_args;
+ lhead->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch);
+ lhead->op = cpu_to_le32(req->r_op);
+ lhead->caller_uid = cpu_to_le32(from_kuid(&init_user_ns,
+ req->r_cred->fsuid));
+ lhead->caller_gid = cpu_to_le32(from_kgid(&init_user_ns,
+ req->r_cred->fsgid));
+ lhead->ino = cpu_to_le64(req->r_deleg_ino);
+ lhead->args = req->r_args;
ceph_encode_filepath(&p, end, ino1, path1);
ceph_encode_filepath(&p, end, ino2, path2);
@@ -2665,15 +3012,23 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
req->r_inode ? req->r_inode : d_inode(req->r_dentry),
mds, req->r_inode_drop, req->r_inode_unless,
req->r_op == CEPH_MDS_OP_READDIR);
- if (req->r_dentry_drop)
- releases += ceph_encode_dentry_release(&p, req->r_dentry,
+ if (req->r_dentry_drop) {
+ ret = ceph_encode_dentry_release(&p, req->r_dentry,
req->r_parent, mds, req->r_dentry_drop,
req->r_dentry_unless);
- if (req->r_old_dentry_drop)
- releases += ceph_encode_dentry_release(&p, req->r_old_dentry,
+ if (ret < 0)
+ goto out_err;
+ releases += ret;
+ }
+ if (req->r_old_dentry_drop) {
+ ret = ceph_encode_dentry_release(&p, req->r_old_dentry,
req->r_old_dentry_dir, mds,
req->r_old_dentry_drop,
req->r_old_dentry_unless);
+ if (ret < 0)
+ goto out_err;
+ releases += ret;
+ }
if (req->r_old_inode_drop)
releases += ceph_encode_inode_release(&p,
d_inode(req->r_old_dentry),
@@ -2684,9 +3039,9 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
p = msg->front.iov_base + req->r_request_release_offset;
}
- head->num_releases = cpu_to_le16(releases);
+ lhead->num_releases = cpu_to_le16(releases);
- encode_timestamp_and_gids(&p, req);
+ encode_mclientrequest_tail(&p, req);
if (WARN_ON_ONCE(p > end)) {
ceph_msg_put(msg);
@@ -2715,6 +3070,10 @@ out_free1:
ceph_mdsc_free_path((char *)path1, pathlen1);
out:
return msg;
+out_err:
+ ceph_msg_put(msg);
+ msg = ERR_PTR(ret);
+ goto out_free2;
}
/*
@@ -2731,18 +3090,6 @@ static void complete_request(struct ceph_mds_client *mdsc,
complete_all(&req->r_completion);
}
-static struct ceph_mds_request_head_old *
-find_old_request_head(void *p, u64 features)
-{
- bool legacy = !(features & CEPH_FEATURE_FS_BTIME);
- struct ceph_mds_request_head *new_head;
-
- if (legacy)
- return (struct ceph_mds_request_head_old *)p;
- new_head = (struct ceph_mds_request_head *)p;
- return (struct ceph_mds_request_head_old *)&new_head->oldest_client_tid;
-}
-
/*
* called under mdsc->mutex
*/
@@ -2752,29 +3099,28 @@ static int __prepare_send_request(struct ceph_mds_session *session,
{
int mds = session->s_mds;
struct ceph_mds_client *mdsc = session->s_mdsc;
- struct ceph_mds_request_head_old *rhead;
+ struct ceph_mds_request_head_legacy *lhead;
+ struct ceph_mds_request_head *nhead;
struct ceph_msg *msg;
- int flags = 0, max_retry;
+ int flags = 0, old_max_retry;
+ bool old_version = !test_bit(CEPHFS_FEATURE_32BITS_RETRY_FWD,
+ &session->s_features);
/*
- * The type of 'r_attempts' in kernel 'ceph_mds_request'
- * is 'int', while in 'ceph_mds_request_head' the type of
- * 'num_retry' is '__u8'. So in case the request retries
- * exceeding 256 times, the MDS will receive a incorrect
- * retry seq.
- *
- * In this case it's ususally a bug in MDS and continue
- * retrying the request makes no sense.
- *
- * In future this could be fixed in ceph code, so avoid
- * using the hardcode here.
+ * Avoid inifinite retrying after overflow. The client will
+ * increase the retry count and if the MDS is old version,
+ * so we limit to retry at most 256 times.
*/
- max_retry = sizeof_field(struct ceph_mds_request_head, num_retry);
- max_retry = 1 << (max_retry * BITS_PER_BYTE);
- if (req->r_attempts >= max_retry) {
- pr_warn_ratelimited("%s request tid %llu seq overflow\n",
- __func__, req->r_tid);
- return -EMULTIHOP;
+ if (req->r_attempts) {
+ old_max_retry = sizeof_field(struct ceph_mds_request_head_old,
+ num_retry);
+ old_max_retry = 1 << (old_max_retry * BITS_PER_BYTE);
+ if ((old_version && req->r_attempts >= old_max_retry) ||
+ ((uint32_t)req->r_attempts >= U32_MAX)) {
+ pr_warn_ratelimited("%s request tid %llu seq overflow\n",
+ __func__, req->r_tid);
+ return -EMULTIHOP;
+ }
}
req->r_attempts++;
@@ -2800,23 +3146,27 @@ static int __prepare_send_request(struct ceph_mds_session *session,
* d_move mangles the src name.
*/
msg = req->r_request;
- rhead = find_old_request_head(msg->front.iov_base,
- session->s_con.peer_features);
+ lhead = find_legacy_request_head(msg->front.iov_base,
+ session->s_con.peer_features);
- flags = le32_to_cpu(rhead->flags);
+ flags = le32_to_cpu(lhead->flags);
flags |= CEPH_MDS_FLAG_REPLAY;
- rhead->flags = cpu_to_le32(flags);
+ lhead->flags = cpu_to_le32(flags);
if (req->r_target_inode)
- rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode));
+ lhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode));
- rhead->num_retry = req->r_attempts - 1;
+ lhead->num_retry = req->r_attempts - 1;
+ if (!old_version) {
+ nhead = (struct ceph_mds_request_head*)msg->front.iov_base;
+ nhead->ext_num_retry = cpu_to_le32(req->r_attempts - 1);
+ }
/* remove cap/dentry releases from message */
- rhead->num_releases = 0;
+ lhead->num_releases = 0;
p = msg->front.iov_base + req->r_request_release_offset;
- encode_timestamp_and_gids(&p, req);
+ encode_mclientrequest_tail(&p, req);
msg->front.iov_len = p - msg->front.iov_base;
msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
@@ -2834,18 +3184,23 @@ static int __prepare_send_request(struct ceph_mds_session *session,
}
req->r_request = msg;
- rhead = find_old_request_head(msg->front.iov_base,
- session->s_con.peer_features);
- rhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc));
+ lhead = find_legacy_request_head(msg->front.iov_base,
+ session->s_con.peer_features);
+ lhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc));
if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags))
flags |= CEPH_MDS_FLAG_REPLAY;
if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags))
flags |= CEPH_MDS_FLAG_ASYNC;
if (req->r_parent)
flags |= CEPH_MDS_FLAG_WANT_DENTRY;
- rhead->flags = cpu_to_le32(flags);
- rhead->num_fwd = req->r_num_fwd;
- rhead->num_retry = req->r_attempts - 1;
+ lhead->flags = cpu_to_le32(flags);
+ lhead->num_fwd = req->r_num_fwd;
+ lhead->num_retry = req->r_attempts - 1;
+ if (!old_version) {
+ nhead = (struct ceph_mds_request_head*)msg->front.iov_base;
+ nhead->ext_num_fwd = cpu_to_le32(req->r_num_fwd);
+ nhead->ext_num_retry = cpu_to_le32(req->r_attempts - 1);
+ }
dout(" r_parent = %p\n", req->r_parent);
return 0;
@@ -3348,22 +3703,35 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
}
dout("handle_reply tid %lld result %d\n", tid, result);
- rinfo = &req->r_reply_info;
if (test_bit(CEPHFS_FEATURE_REPLY_ENCODING, &session->s_features))
- err = parse_reply_info(session, msg, rinfo, (u64)-1);
+ err = parse_reply_info(session, msg, req, (u64)-1);
else
- err = parse_reply_info(session, msg, rinfo, session->s_con.peer_features);
+ err = parse_reply_info(session, msg, req,
+ session->s_con.peer_features);
mutex_unlock(&mdsc->mutex);
/* Must find target inode outside of mutexes to avoid deadlocks */
+ rinfo = &req->r_reply_info;
if ((err >= 0) && rinfo->head->is_target) {
- struct inode *in;
+ struct inode *in = xchg(&req->r_new_inode, NULL);
struct ceph_vino tvino = {
.ino = le64_to_cpu(rinfo->targeti.in->ino),
.snap = le64_to_cpu(rinfo->targeti.in->snapid)
};
- in = ceph_get_inode(mdsc->fsc->sb, tvino);
+ /*
+ * If we ended up opening an existing inode, discard
+ * r_new_inode
+ */
+ if (req->r_op == CEPH_MDS_OP_CREATE &&
+ !req->r_reply_info.has_create_ino) {
+ /* This should never happen on an async create */
+ WARN_ON_ONCE(req->r_deleg_ino);
+ iput(in);
+ in = NULL;
+ }
+
+ in = ceph_get_inode(mdsc->fsc->sb, tvino, in);
if (IS_ERR(in)) {
err = PTR_ERR(in);
mutex_lock(&session->s_mutex);
@@ -3406,7 +3774,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
if (err == 0) {
if (result == 0 && (req->r_op == CEPH_MDS_OP_READDIR ||
req->r_op == CEPH_MDS_OP_LSSNAP))
- ceph_readdir_prepopulate(req, req->r_session);
+ err = ceph_readdir_prepopulate(req, req->r_session);
}
current->journal_info = NULL;
mutex_unlock(&req->r_fill_mutex);
@@ -3491,33 +3859,21 @@ static void handle_forward(struct ceph_mds_client *mdsc,
if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
dout("forward tid %llu aborted, unregistering\n", tid);
__unregister_request(mdsc, req);
- } else if (fwd_seq <= req->r_num_fwd) {
+ } else if (fwd_seq <= req->r_num_fwd || (uint32_t)fwd_seq >= U32_MAX) {
/*
- * The type of 'num_fwd' in ceph 'MClientRequestForward'
- * is 'int32_t', while in 'ceph_mds_request_head' the
- * type is '__u8'. So in case the request bounces between
- * MDSes exceeding 256 times, the client will get stuck.
- *
- * In this case it's ususally a bug in MDS and continue
- * bouncing the request makes no sense.
+ * Avoid inifinite retrying after overflow.
*
- * In future this could be fixed in ceph code, so avoid
- * using the hardcode here.
+ * The MDS will increase the fwd count and in client side
+ * if the num_fwd is less than the one saved in request
+ * that means the MDS is an old version and overflowed of
+ * 8 bits.
*/
- int max = sizeof_field(struct ceph_mds_request_head, num_fwd);
- max = 1 << (max * BITS_PER_BYTE);
- if (req->r_num_fwd >= max) {
- mutex_lock(&req->r_fill_mutex);
- req->r_err = -EMULTIHOP;
- set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags);
- mutex_unlock(&req->r_fill_mutex);
- aborted = true;
- pr_warn_ratelimited("forward tid %llu seq overflow\n",
- tid);
- } else {
- dout("forward tid %llu to mds%d - old seq %d <= %d\n",
- tid, next_mds, req->r_num_fwd, fwd_seq);
- }
+ mutex_lock(&req->r_fill_mutex);
+ req->r_err = -EMULTIHOP;
+ set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags);
+ mutex_unlock(&req->r_fill_mutex);
+ aborted = true;
+ pr_warn_ratelimited("forward tid %llu seq overflow\n", tid);
} else {
/* resend. forward race not possible; mds would drop */
dout("forward tid %llu to mds%d (we resend)\n", tid, next_mds);
@@ -4550,6 +4906,9 @@ static void handle_lease(struct ceph_mds_client *mdsc,
dout("handle_lease from mds%d\n", mds);
+ if (!ceph_inc_mds_stopping_blocker(mdsc, session))
+ return;
+
/* decode */
if (msg->front.iov_len < sizeof(*h) + sizeof(u32))
goto bad;
@@ -4568,8 +4927,6 @@ static void handle_lease(struct ceph_mds_client *mdsc,
dname.len, dname.name);
mutex_lock(&session->s_mutex);
- inc_session_sequence(session);
-
if (!inode) {
dout("handle_lease no inode %llx\n", vino.ino);
goto release;
@@ -4631,9 +4988,13 @@ release:
out:
mutex_unlock(&session->s_mutex);
iput(inode);
+
+ ceph_dec_mds_stopping_blocker(mdsc);
return;
bad:
+ ceph_dec_mds_stopping_blocker(mdsc);
+
pr_err("corrupt lease message\n");
ceph_msg_dump(msg);
}
@@ -4829,6 +5190,9 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
}
init_completion(&mdsc->safe_umount_waiters);
+ spin_lock_init(&mdsc->stopping_lock);
+ atomic_set(&mdsc->stopping_blockers, 0);
+ init_completion(&mdsc->stopping_waiter);
init_waitqueue_head(&mdsc->session_close_wq);
INIT_LIST_HEAD(&mdsc->waiting_for_map);
mdsc->quotarealms_inodes = RB_ROOT;
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 86d2965e68a1..5a3714bdd64a 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -32,8 +32,9 @@ enum ceph_feature_type {
CEPHFS_FEATURE_ALTERNATE_NAME,
CEPHFS_FEATURE_NOTIFY_SESSION_STATE,
CEPHFS_FEATURE_OP_GETVXATTR,
+ CEPHFS_FEATURE_32BITS_RETRY_FWD,
- CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_OP_GETVXATTR,
+ CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_32BITS_RETRY_FWD,
};
#define CEPHFS_FEATURES_CLIENT_SUPPORTED { \
@@ -44,8 +45,10 @@ enum ceph_feature_type {
CEPHFS_FEATURE_MULTI_RECONNECT, \
CEPHFS_FEATURE_DELEG_INO, \
CEPHFS_FEATURE_METRIC_COLLECT, \
+ CEPHFS_FEATURE_ALTERNATE_NAME, \
CEPHFS_FEATURE_NOTIFY_SESSION_STATE, \
CEPHFS_FEATURE_OP_GETVXATTR, \
+ CEPHFS_FEATURE_32BITS_RETRY_FWD, \
}
/*
@@ -86,13 +89,19 @@ struct ceph_mds_reply_info_in {
s32 dir_pin;
struct ceph_timespec btime;
struct ceph_timespec snap_btime;
+ u8 *fscrypt_auth;
+ u8 *fscrypt_file;
+ u32 fscrypt_auth_len;
+ u32 fscrypt_file_len;
u64 rsnaps;
u64 change_attr;
};
struct ceph_mds_reply_dir_entry {
+ bool is_nokey;
char *name;
u32 name_len;
+ u32 raw_hash;
struct ceph_mds_reply_lease *lease;
struct ceph_mds_reply_info_in inode;
loff_t offset;
@@ -116,7 +125,9 @@ struct ceph_mds_reply_info_parsed {
struct ceph_mds_reply_info_in diri, targeti;
struct ceph_mds_reply_dirfrag *dirfrag;
char *dname;
+ u8 *altname;
u32 dname_len;
+ u32 altname_len;
struct ceph_mds_reply_lease *dlease;
struct ceph_mds_reply_xattr xattr_info;
@@ -263,6 +274,7 @@ struct ceph_mds_request {
struct inode *r_parent; /* parent dir inode */
struct inode *r_target_inode; /* resulting inode */
+ struct inode *r_new_inode; /* new inode (for creates) */
#define CEPH_MDS_R_DIRECT_IS_HASH (1) /* r_direct_hash is valid */
#define CEPH_MDS_R_ABORTED (2) /* call was aborted */
@@ -272,11 +284,19 @@ struct ceph_mds_request {
#define CEPH_MDS_R_DID_PREPOPULATE (6) /* prepopulated readdir */
#define CEPH_MDS_R_PARENT_LOCKED (7) /* is r_parent->i_rwsem wlocked? */
#define CEPH_MDS_R_ASYNC (8) /* async request */
+#define CEPH_MDS_R_FSCRYPT_FILE (9) /* must marshal fscrypt_file field */
unsigned long r_req_flags;
struct mutex r_fill_mutex;
union ceph_mds_request_args r_args;
+
+ struct ceph_fscrypt_auth *r_fscrypt_auth;
+ u64 r_fscrypt_file;
+
+ u8 *r_altname; /* fscrypt binary crypttext for long filenames */
+ u32 r_altname_len; /* length of r_altname */
+
int r_fmode; /* file mode, if expecting cap */
int r_request_release_offset;
const struct cred *r_cred;
@@ -381,8 +401,9 @@ struct cap_wait {
};
enum {
- CEPH_MDSC_STOPPING_BEGIN = 1,
- CEPH_MDSC_STOPPING_FLUSHED = 2,
+ CEPH_MDSC_STOPPING_BEGIN = 1,
+ CEPH_MDSC_STOPPING_FLUSHING = 2,
+ CEPH_MDSC_STOPPING_FLUSHED = 3,
};
/*
@@ -401,7 +422,11 @@ struct ceph_mds_client {
struct ceph_mds_session **sessions; /* NULL for mds if no session */
atomic_t num_sessions;
int max_sessions; /* len of sessions array */
- int stopping; /* true if shutting down */
+
+ spinlock_t stopping_lock; /* protect snap_empty */
+ int stopping; /* the stage of shutting down */
+ atomic_t stopping_blockers;
+ struct completion stopping_waiter;
atomic64_t quotarealms_count; /* # realms with quota */
/*
@@ -557,7 +582,7 @@ static inline void ceph_mdsc_free_path(char *path, int len)
}
extern char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
- int stop_on_nosnap);
+ int for_wire);
extern void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry);
extern void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c
index 64592adfe48f..f7fcf7f08ec6 100644
--- a/fs/ceph/quota.c
+++ b/fs/ceph/quota.c
@@ -47,25 +47,23 @@ void ceph_handle_quota(struct ceph_mds_client *mdsc,
struct inode *inode;
struct ceph_inode_info *ci;
+ if (!ceph_inc_mds_stopping_blocker(mdsc, session))
+ return;
+
if (msg->front.iov_len < sizeof(*h)) {
pr_err("%s corrupt message mds%d len %d\n", __func__,
session->s_mds, (int)msg->front.iov_len);
ceph_msg_dump(msg);
- return;
+ goto out;
}
- /* increment msg sequence number */
- mutex_lock(&session->s_mutex);
- inc_session_sequence(session);
- mutex_unlock(&session->s_mutex);
-
/* lookup inode */
vino.ino = le64_to_cpu(h->ino);
vino.snap = CEPH_NOSNAP;
inode = ceph_find_inode(sb, vino);
if (!inode) {
pr_warn("Failed to find inode %llu\n", vino.ino);
- return;
+ goto out;
}
ci = ceph_inode(inode);
@@ -78,6 +76,8 @@ void ceph_handle_quota(struct ceph_mds_client *mdsc,
spin_unlock(&ci->i_ceph_lock);
iput(inode);
+out:
+ ceph_dec_mds_stopping_blocker(mdsc);
}
static struct ceph_quotarealm_inode *
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 343d738448dc..813f21add992 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -660,7 +660,7 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
capsnap->size = i_size_read(inode);
capsnap->mtime = inode->i_mtime;
capsnap->atime = inode->i_atime;
- capsnap->ctime = inode->i_ctime;
+ capsnap->ctime = inode_get_ctime(inode);
capsnap->btime = ci->i_btime;
capsnap->change_attr = inode_peek_iversion_raw(inode);
capsnap->time_warp_seq = ci->i_time_warp_seq;
@@ -1015,6 +1015,9 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
int locked_rwsem = 0;
bool close_sessions = false;
+ if (!ceph_inc_mds_stopping_blocker(mdsc, session))
+ return;
+
/* decode */
if (msg->front.iov_len < sizeof(*h))
goto bad;
@@ -1030,10 +1033,6 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
dout("%s from mds%d op %s split %llx tracelen %d\n", __func__,
mds, ceph_snap_op_name(op), split, trace_len);
- mutex_lock(&session->s_mutex);
- inc_session_sequence(session);
- mutex_unlock(&session->s_mutex);
-
down_write(&mdsc->snap_rwsem);
locked_rwsem = 1;
@@ -1151,6 +1150,7 @@ skip_inode:
up_write(&mdsc->snap_rwsem);
flush_snaps(mdsc);
+ ceph_dec_mds_stopping_blocker(mdsc);
return;
bad:
@@ -1160,6 +1160,8 @@ out:
if (locked_rwsem)
up_write(&mdsc->snap_rwsem);
+ ceph_dec_mds_stopping_blocker(mdsc);
+
if (close_sessions)
ceph_mdsc_close_sessions(mdsc);
return;
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index a5f52013314d..2d7f5a8d4a92 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -20,6 +20,7 @@
#include "super.h"
#include "mds_client.h"
#include "cache.h"
+#include "crypto.h"
#include <linux/ceph/ceph_features.h>
#include <linux/ceph/decode.h>
@@ -46,6 +47,7 @@ static void ceph_put_super(struct super_block *s)
struct ceph_fs_client *fsc = ceph_sb_to_client(s);
dout("put_super\n");
+ ceph_fscrypt_free_dummy_policy(fsc);
ceph_mdsc_close_sessions(fsc->mdsc);
}
@@ -151,6 +153,7 @@ enum {
Opt_recover_session,
Opt_source,
Opt_mon_addr,
+ Opt_test_dummy_encryption,
/* string args above */
Opt_dirstat,
Opt_rbytes,
@@ -165,6 +168,7 @@ enum {
Opt_copyfrom,
Opt_wsync,
Opt_pagecache,
+ Opt_sparseread,
};
enum ceph_recover_session_mode {
@@ -192,6 +196,7 @@ static const struct fs_parameter_spec ceph_mount_parameters[] = {
fsparam_string ("fsc", Opt_fscache), // fsc=...
fsparam_flag_no ("ino32", Opt_ino32),
fsparam_string ("mds_namespace", Opt_mds_namespace),
+ fsparam_string ("mon_addr", Opt_mon_addr),
fsparam_flag_no ("poolperm", Opt_poolperm),
fsparam_flag_no ("quotadf", Opt_quotadf),
fsparam_u32 ("rasize", Opt_rasize),
@@ -203,10 +208,12 @@ static const struct fs_parameter_spec ceph_mount_parameters[] = {
fsparam_u32 ("rsize", Opt_rsize),
fsparam_string ("snapdirname", Opt_snapdirname),
fsparam_string ("source", Opt_source),
- fsparam_string ("mon_addr", Opt_mon_addr),
+ fsparam_flag ("test_dummy_encryption", Opt_test_dummy_encryption),
+ fsparam_string ("test_dummy_encryption", Opt_test_dummy_encryption),
fsparam_u32 ("wsize", Opt_wsize),
fsparam_flag_no ("wsync", Opt_wsync),
fsparam_flag_no ("pagecache", Opt_pagecache),
+ fsparam_flag_no ("sparseread", Opt_sparseread),
{}
};
@@ -576,6 +583,29 @@ static int ceph_parse_mount_param(struct fs_context *fc,
else
fsopt->flags &= ~CEPH_MOUNT_OPT_NOPAGECACHE;
break;
+ case Opt_sparseread:
+ if (result.negated)
+ fsopt->flags &= ~CEPH_MOUNT_OPT_SPARSEREAD;
+ else
+ fsopt->flags |= CEPH_MOUNT_OPT_SPARSEREAD;
+ break;
+ case Opt_test_dummy_encryption:
+#ifdef CONFIG_FS_ENCRYPTION
+ fscrypt_free_dummy_policy(&fsopt->dummy_enc_policy);
+ ret = fscrypt_parse_test_dummy_encryption(param,
+ &fsopt->dummy_enc_policy);
+ if (ret == -EINVAL) {
+ warnfc(fc, "Value of option \"%s\" is unrecognized",
+ param->key);
+ } else if (ret == -EEXIST) {
+ warnfc(fc, "Conflicting test_dummy_encryption options");
+ ret = -EINVAL;
+ }
+#else
+ warnfc(fc,
+ "FS encryption not supported: test_dummy_encryption mount option ignored");
+#endif
+ break;
default:
BUG();
}
@@ -596,6 +626,7 @@ static void destroy_mount_options(struct ceph_mount_options *args)
kfree(args->server_path);
kfree(args->fscache_uniq);
kfree(args->mon_addr);
+ fscrypt_free_dummy_policy(&args->dummy_enc_policy);
kfree(args);
}
@@ -710,9 +741,12 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
if (!(fsopt->flags & CEPH_MOUNT_OPT_ASYNC_DIROPS))
seq_puts(m, ",wsync");
-
if (fsopt->flags & CEPH_MOUNT_OPT_NOPAGECACHE)
seq_puts(m, ",nopagecache");
+ if (fsopt->flags & CEPH_MOUNT_OPT_SPARSEREAD)
+ seq_puts(m, ",sparseread");
+
+ fscrypt_show_test_dummy_encryption(m, ',', root->d_sb);
if (fsopt->wsize != CEPH_MAX_WRITE_SIZE)
seq_printf(m, ",wsize=%u", fsopt->wsize);
@@ -1052,6 +1086,50 @@ out:
return root;
}
+#ifdef CONFIG_FS_ENCRYPTION
+static int ceph_apply_test_dummy_encryption(struct super_block *sb,
+ struct fs_context *fc,
+ struct ceph_mount_options *fsopt)
+{
+ struct ceph_fs_client *fsc = sb->s_fs_info;
+
+ if (!fscrypt_is_dummy_policy_set(&fsopt->dummy_enc_policy))
+ return 0;
+
+ /* No changing encryption context on remount. */
+ if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE &&
+ !fscrypt_is_dummy_policy_set(&fsc->fsc_dummy_enc_policy)) {
+ if (fscrypt_dummy_policies_equal(&fsopt->dummy_enc_policy,
+ &fsc->fsc_dummy_enc_policy))
+ return 0;
+ errorfc(fc, "Can't set test_dummy_encryption on remount");
+ return -EINVAL;
+ }
+
+ /* Also make sure fsopt doesn't contain a conflicting value. */
+ if (fscrypt_is_dummy_policy_set(&fsc->fsc_dummy_enc_policy)) {
+ if (fscrypt_dummy_policies_equal(&fsopt->dummy_enc_policy,
+ &fsc->fsc_dummy_enc_policy))
+ return 0;
+ errorfc(fc, "Conflicting test_dummy_encryption options");
+ return -EINVAL;
+ }
+
+ fsc->fsc_dummy_enc_policy = fsopt->dummy_enc_policy;
+ memset(&fsopt->dummy_enc_policy, 0, sizeof(fsopt->dummy_enc_policy));
+
+ warnfc(fc, "test_dummy_encryption mode enabled");
+ return 0;
+}
+#else
+static int ceph_apply_test_dummy_encryption(struct super_block *sb,
+ struct fs_context *fc,
+ struct ceph_mount_options *fsopt)
+{
+ return 0;
+}
+#endif
+
/*
* mount: join the ceph cluster, and open root directory.
*/
@@ -1080,6 +1158,11 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc,
goto out;
}
+ err = ceph_apply_test_dummy_encryption(fsc->sb, fc,
+ fsc->mount_options);
+ if (err)
+ goto out;
+
dout("mount opening path '%s'\n", path);
ceph_fs_debugfs_init(fsc);
@@ -1101,6 +1184,7 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc,
out:
mutex_unlock(&fsc->client->mount_mutex);
+ ceph_fscrypt_free_dummy_policy(fsc);
return ERR_PTR(err);
}
@@ -1126,6 +1210,8 @@ static int ceph_set_super(struct super_block *s, struct fs_context *fc)
s->s_time_max = U32_MAX;
s->s_flags |= SB_NODIRATIME | SB_NOATIME;
+ ceph_fscrypt_set_ops(s);
+
ret = set_anon_super_fc(s, fc);
if (ret != 0)
fsc->sb = NULL;
@@ -1287,15 +1373,26 @@ static void ceph_free_fc(struct fs_context *fc)
static int ceph_reconfigure_fc(struct fs_context *fc)
{
+ int err;
struct ceph_parse_opts_ctx *pctx = fc->fs_private;
struct ceph_mount_options *fsopt = pctx->opts;
- struct ceph_fs_client *fsc = ceph_sb_to_client(fc->root->d_sb);
+ struct super_block *sb = fc->root->d_sb;
+ struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
+
+ err = ceph_apply_test_dummy_encryption(sb, fc, fsopt);
+ if (err)
+ return err;
if (fsopt->flags & CEPH_MOUNT_OPT_ASYNC_DIROPS)
ceph_set_mount_opt(fsc, ASYNC_DIROPS);
else
ceph_clear_mount_opt(fsc, ASYNC_DIROPS);
+ if (fsopt->flags & CEPH_MOUNT_OPT_SPARSEREAD)
+ ceph_set_mount_opt(fsc, SPARSEREAD);
+ else
+ ceph_clear_mount_opt(fsc, SPARSEREAD);
+
if (strcmp_null(fsc->mount_options->mon_addr, fsopt->mon_addr)) {
kfree(fsc->mount_options->mon_addr);
fsc->mount_options->mon_addr = fsopt->mon_addr;
@@ -1303,7 +1400,7 @@ static int ceph_reconfigure_fc(struct fs_context *fc)
pr_notice("ceph: monitor addresses recorded, but not used for reconnection");
}
- sync_filesystem(fc->root->d_sb);
+ sync_filesystem(sb);
return 0;
}
@@ -1365,25 +1462,101 @@ nomem:
return -ENOMEM;
}
+/*
+ * Return true if it successfully increases the blocker counter,
+ * or false if the mdsc is in stopping and flushed state.
+ */
+static bool __inc_stopping_blocker(struct ceph_mds_client *mdsc)
+{
+ spin_lock(&mdsc->stopping_lock);
+ if (mdsc->stopping >= CEPH_MDSC_STOPPING_FLUSHING) {
+ spin_unlock(&mdsc->stopping_lock);
+ return false;
+ }
+ atomic_inc(&mdsc->stopping_blockers);
+ spin_unlock(&mdsc->stopping_lock);
+ return true;
+}
+
+static void __dec_stopping_blocker(struct ceph_mds_client *mdsc)
+{
+ spin_lock(&mdsc->stopping_lock);
+ if (!atomic_dec_return(&mdsc->stopping_blockers) &&
+ mdsc->stopping >= CEPH_MDSC_STOPPING_FLUSHING)
+ complete_all(&mdsc->stopping_waiter);
+ spin_unlock(&mdsc->stopping_lock);
+}
+
+/* For metadata IO requests */
+bool ceph_inc_mds_stopping_blocker(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session)
+{
+ mutex_lock(&session->s_mutex);
+ inc_session_sequence(session);
+ mutex_unlock(&session->s_mutex);
+
+ return __inc_stopping_blocker(mdsc);
+}
+
+void ceph_dec_mds_stopping_blocker(struct ceph_mds_client *mdsc)
+{
+ __dec_stopping_blocker(mdsc);
+}
+
+/* For data IO requests */
+bool ceph_inc_osd_stopping_blocker(struct ceph_mds_client *mdsc)
+{
+ return __inc_stopping_blocker(mdsc);
+}
+
+void ceph_dec_osd_stopping_blocker(struct ceph_mds_client *mdsc)
+{
+ __dec_stopping_blocker(mdsc);
+}
+
static void ceph_kill_sb(struct super_block *s)
{
struct ceph_fs_client *fsc = ceph_sb_to_client(s);
+ struct ceph_mds_client *mdsc = fsc->mdsc;
+ bool wait;
dout("kill_sb %p\n", s);
- ceph_mdsc_pre_umount(fsc->mdsc);
+ ceph_mdsc_pre_umount(mdsc);
flush_fs_workqueues(fsc);
/*
* Though the kill_anon_super() will finally trigger the
- * sync_filesystem() anyway, we still need to do it here
- * and then bump the stage of shutdown to stop the work
- * queue as earlier as possible.
+ * sync_filesystem() anyway, we still need to do it here and
+ * then bump the stage of shutdown. This will allow us to
+ * drop any further message, which will increase the inodes'
+ * i_count reference counters but makes no sense any more,
+ * from MDSs.
+ *
+ * Without this when evicting the inodes it may fail in the
+ * kill_anon_super(), which will trigger a warning when
+ * destroying the fscrypt keyring and then possibly trigger
+ * a further crash in ceph module when the iput() tries to
+ * evict the inodes later.
*/
sync_filesystem(s);
- fsc->mdsc->stopping = CEPH_MDSC_STOPPING_FLUSHED;
+ spin_lock(&mdsc->stopping_lock);
+ mdsc->stopping = CEPH_MDSC_STOPPING_FLUSHING;
+ wait = !!atomic_read(&mdsc->stopping_blockers);
+ spin_unlock(&mdsc->stopping_lock);
+
+ if (wait && atomic_read(&mdsc->stopping_blockers)) {
+ long timeleft = wait_for_completion_killable_timeout(
+ &mdsc->stopping_waiter,
+ fsc->client->options->mount_timeout);
+ if (!timeleft) /* timed out */
+ pr_warn("umount timed out, %ld\n", timeleft);
+ else if (timeleft < 0) /* killed */
+ pr_warn("umount was killed, %ld\n", timeleft);
+ }
+ mdsc->stopping = CEPH_MDSC_STOPPING_FLUSHED;
kill_anon_super(s);
fsc->client->extra_mon_dispatch = NULL;
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 3bfddf34d488..51c7f2b14f6f 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -22,6 +22,7 @@
#include <linux/hashtable.h>
#include <linux/ceph/libceph.h>
+#include "crypto.h"
/* large granularity for statfs utilization stats to facilitate
* large volume sizes on 32-bit machines. */
@@ -42,6 +43,7 @@
#define CEPH_MOUNT_OPT_NOCOPYFROM (1<<14) /* don't use RADOS 'copy-from' op */
#define CEPH_MOUNT_OPT_ASYNC_DIROPS (1<<15) /* allow async directory ops */
#define CEPH_MOUNT_OPT_NOPAGECACHE (1<<16) /* bypass pagecache altogether */
+#define CEPH_MOUNT_OPT_SPARSEREAD (1<<17) /* always do sparse reads */
#define CEPH_MOUNT_OPT_DEFAULT \
(CEPH_MOUNT_OPT_DCACHE | \
@@ -98,6 +100,7 @@ struct ceph_mount_options {
char *server_path; /* default NULL (means "/") */
char *fscache_uniq; /* default NULL */
char *mon_addr;
+ struct fscrypt_dummy_policy dummy_enc_policy;
};
/* mount state */
@@ -154,9 +157,11 @@ struct ceph_fs_client {
#ifdef CONFIG_CEPH_FSCACHE
struct fscache_volume *fscache;
#endif
+#ifdef CONFIG_FS_ENCRYPTION
+ struct fscrypt_dummy_policy fsc_dummy_enc_policy;
+#endif
};
-
/*
* File i/o capability. This tracks shared state with the metadata
* server that allows us to cache or writeback attributes or to read
@@ -419,6 +424,11 @@ struct ceph_inode_info {
u32 i_truncate_seq; /* last truncate to smaller size */
u64 i_truncate_size; /* and the size we last truncated down to */
int i_truncate_pending; /* still need to call vmtruncate */
+ /*
+ * For none fscrypt case it equals to i_truncate_size or it will
+ * equals to fscrypt_file_size
+ */
+ u64 i_truncate_pagecache_size;
u64 i_max_size; /* max file size authorized by mds */
u64 i_reported_size; /* (max_)size reported to or requested of mds */
@@ -449,6 +459,13 @@ struct ceph_inode_info {
struct work_struct i_work;
unsigned long i_work_mask;
+
+#ifdef CONFIG_FS_ENCRYPTION
+ u32 fscrypt_auth_len;
+ u32 fscrypt_file_len;
+ u8 *fscrypt_auth;
+ u8 *fscrypt_file;
+#endif
};
struct ceph_netfs_request_data {
@@ -998,6 +1015,7 @@ static inline bool __ceph_have_pending_cap_snap(struct ceph_inode_info *ci)
/* inode.c */
struct ceph_mds_reply_info_in;
struct ceph_mds_reply_dirfrag;
+struct ceph_acl_sec_ctx;
extern const struct inode_operations ceph_file_iops;
@@ -1005,8 +1023,14 @@ extern struct inode *ceph_alloc_inode(struct super_block *sb);
extern void ceph_evict_inode(struct inode *inode);
extern void ceph_free_inode(struct inode *inode);
+struct inode *ceph_new_inode(struct inode *dir, struct dentry *dentry,
+ umode_t *mode, struct ceph_acl_sec_ctx *as_ctx);
+void ceph_as_ctx_to_req(struct ceph_mds_request *req,
+ struct ceph_acl_sec_ctx *as_ctx);
+
extern struct inode *ceph_get_inode(struct super_block *sb,
- struct ceph_vino vino);
+ struct ceph_vino vino,
+ struct inode *newino);
extern struct inode *ceph_get_snapdir(struct inode *parent);
extern int ceph_fill_file_size(struct inode *inode, int issued,
u32 truncate_seq, u64 truncate_size, u64 size);
@@ -1065,7 +1089,13 @@ static inline int ceph_do_getattr(struct inode *inode, int mask, bool force)
}
extern int ceph_permission(struct mnt_idmap *idmap,
struct inode *inode, int mask);
-extern int __ceph_setattr(struct inode *inode, struct iattr *attr);
+
+struct ceph_iattr {
+ struct ceph_fscrypt_auth *fscrypt_auth;
+};
+
+extern int __ceph_setattr(struct inode *inode, struct iattr *attr,
+ struct ceph_iattr *cia);
extern int ceph_setattr(struct mnt_idmap *idmap,
struct dentry *dentry, struct iattr *attr);
extern int ceph_getattr(struct mnt_idmap *idmap,
@@ -1100,6 +1130,9 @@ struct ceph_acl_sec_ctx {
void *sec_ctx;
u32 sec_ctxlen;
#endif
+#ifdef CONFIG_FS_ENCRYPTION
+ struct ceph_fscrypt_auth *fscrypt_auth;
+#endif
struct ceph_pagelist *pagelist;
};
@@ -1237,6 +1270,8 @@ extern int ceph_encode_dentry_release(void **p, struct dentry *dn,
struct inode *dir,
int mds, int drop, int unless);
+extern int __ceph_get_caps(struct inode *inode, struct ceph_file_info *fi,
+ int need, int want, loff_t endoff, int *got);
extern int ceph_get_caps(struct file *filp, int need, int want,
loff_t endoff, int *got);
extern int ceph_try_get_caps(struct inode *inode,
@@ -1272,6 +1307,9 @@ extern int ceph_renew_caps(struct inode *inode, int fmode);
extern int ceph_open(struct inode *inode, struct file *file);
extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
struct file *file, unsigned flags, umode_t mode);
+extern ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos,
+ struct iov_iter *to, int *retry_op,
+ u64 *last_objver);
extern int ceph_release(struct inode *inode, struct file *filp);
extern void ceph_fill_inline_data(struct inode *inode, struct page *locked_page,
char *data, size_t len);
@@ -1375,4 +1413,9 @@ extern bool ceph_quota_update_statfs(struct ceph_fs_client *fsc,
struct kstatfs *buf);
extern void ceph_cleanup_quotarealms_inodes(struct ceph_mds_client *mdsc);
+bool ceph_inc_mds_stopping_blocker(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session);
+void ceph_dec_mds_stopping_blocker(struct ceph_mds_client *mdsc);
+bool ceph_inc_osd_stopping_blocker(struct ceph_mds_client *mdsc);
+void ceph_dec_osd_stopping_blocker(struct ceph_mds_client *mdsc);
#endif /* _FS_CEPH_SUPER_H */
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 806183959c47..0deae4a0f5f1 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -352,6 +352,24 @@ static ssize_t ceph_vxattrcb_auth_mds(struct ceph_inode_info *ci,
return ret;
}
+#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
+static bool ceph_vxattrcb_fscrypt_auth_exists(struct ceph_inode_info *ci)
+{
+ return ci->fscrypt_auth_len;
+}
+
+static ssize_t ceph_vxattrcb_fscrypt_auth(struct ceph_inode_info *ci,
+ char *val, size_t size)
+{
+ if (size) {
+ if (size < ci->fscrypt_auth_len)
+ return -ERANGE;
+ memcpy(val, ci->fscrypt_auth, ci->fscrypt_auth_len);
+ }
+ return ci->fscrypt_auth_len;
+}
+#endif /* CONFIG_FS_ENCRYPTION */
+
#define CEPH_XATTR_NAME(_type, _name) XATTR_CEPH_PREFIX #_type "." #_name
#define CEPH_XATTR_NAME2(_type, _name, _name2) \
XATTR_CEPH_PREFIX #_type "." #_name "." #_name2
@@ -500,6 +518,15 @@ static struct ceph_vxattr ceph_common_vxattrs[] = {
.exists_cb = NULL,
.flags = VXATTR_FLAG_READONLY,
},
+#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
+ {
+ .name = "ceph.fscrypt.auth",
+ .name_size = sizeof("ceph.fscrypt.auth"),
+ .getxattr_cb = ceph_vxattrcb_fscrypt_auth,
+ .exists_cb = ceph_vxattrcb_fscrypt_auth_exists,
+ .flags = VXATTR_FLAG_READONLY,
+ },
+#endif /* CONFIG_FS_ENCRYPTION */
{ .name = NULL, 0 } /* Required table terminator */
};
@@ -1238,7 +1265,7 @@ retry:
dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL,
&prealloc_cf);
ci->i_xattrs.dirty = true;
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
}
spin_unlock(&ci->i_ceph_lock);
@@ -1408,6 +1435,9 @@ void ceph_release_acl_sec_ctx(struct ceph_acl_sec_ctx *as_ctx)
#ifdef CONFIG_CEPH_FS_SECURITY_LABEL
security_release_secctx(as_ctx->sec_ctx, as_ctx->sec_ctxlen);
#endif
+#ifdef CONFIG_FS_ENCRYPTION
+ kfree(as_ctx->fscrypt_auth);
+#endif
if (as_ctx->pagelist)
ceph_pagelist_release(as_ctx->pagelist);
}
diff --git a/fs/coda/coda_linux.c b/fs/coda/coda_linux.c
index 903ca8fa4b9b..ae023853a98f 100644
--- a/fs/coda/coda_linux.c
+++ b/fs/coda/coda_linux.c
@@ -127,7 +127,8 @@ void coda_vattr_to_iattr(struct inode *inode, struct coda_vattr *attr)
if (attr->va_mtime.tv_sec != -1)
inode->i_mtime = coda_to_timespec64(attr->va_mtime);
if (attr->va_ctime.tv_sec != -1)
- inode->i_ctime = coda_to_timespec64(attr->va_ctime);
+ inode_set_ctime_to_ts(inode,
+ coda_to_timespec64(attr->va_ctime));
}
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 1b960de2bf39..cb512b10473b 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -111,7 +111,7 @@ static inline void coda_dir_update_mtime(struct inode *dir)
/* optimistically we can also act as if our nose bleeds. The
* granularity of the mtime is coarse anyways so we might actually be
* right most of the time. Note: we only do this for directories. */
- dir->i_mtime = dir->i_ctime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
#endif
}
diff --git a/fs/coda/file.c b/fs/coda/file.c
index 12b26bd13564..42346618b4ed 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -84,7 +84,7 @@ coda_file_write_iter(struct kiocb *iocb, struct iov_iter *to)
ret = vfs_iter_write(cfi->cfi_container, to, &iocb->ki_pos, 0);
coda_inode->i_size = file_inode(host_file)->i_size;
coda_inode->i_blocks = (coda_inode->i_size + 511) >> 9;
- coda_inode->i_mtime = coda_inode->i_ctime = current_time(coda_inode);
+ coda_inode->i_mtime = inode_set_ctime_current(coda_inode);
inode_unlock(coda_inode);
file_end_write(host_file);
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index d661e6cf17ac..0c7c2528791e 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -256,7 +256,8 @@ int coda_getattr(struct mnt_idmap *idmap, const struct path *path,
{
int err = coda_revalidate_inode(d_inode(path->dentry));
if (!err)
- generic_fillattr(&nop_mnt_idmap, d_inode(path->dentry), stat);
+ generic_fillattr(&nop_mnt_idmap, request_mask,
+ d_inode(path->dentry), stat);
return err;
}
@@ -269,7 +270,7 @@ int coda_setattr(struct mnt_idmap *idmap, struct dentry *de,
memset(&vattr, 0, sizeof(vattr));
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
coda_iattr_to_vattr(iattr, &vattr);
vattr.va_type = C_VNON; /* cannot set type */
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index 1c15edbe70ff..fbdcb3582926 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -88,8 +88,7 @@ int configfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
static inline void set_default_inode_attr(struct inode * inode, umode_t mode)
{
inode->i_mode = mode;
- inode->i_atime = inode->i_mtime =
- inode->i_ctime = current_time(inode);
+ inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
}
static inline void set_inode_attr(struct inode * inode, struct iattr * iattr)
@@ -99,7 +98,7 @@ static inline void set_inode_attr(struct inode * inode, struct iattr * iattr)
inode->i_gid = iattr->ia_gid;
inode->i_atime = iattr->ia_atime;
inode->i_mtime = iattr->ia_mtime;
- inode->i_ctime = iattr->ia_ctime;
+ inode_set_ctime_to_ts(inode, iattr->ia_ctime);
}
struct inode *configfs_new_inode(umode_t mode, struct configfs_dirent *sd,
@@ -172,7 +171,7 @@ struct inode *configfs_create(struct dentry *dentry, umode_t mode)
return ERR_PTR(-ENOMEM);
p_inode = d_inode(dentry->d_parent);
- p_inode->i_mtime = p_inode->i_ctime = current_time(p_inode);
+ p_inode->i_mtime = inode_set_ctime_current(p_inode);
configfs_set_inode_lock_class(sd, inode);
return inode;
}
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 27c6597aa1be..5ee7d7bbb361 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -133,7 +133,8 @@ static struct inode *get_cramfs_inode(struct super_block *sb,
}
/* Struct copy intentional */
- inode->i_mtime = inode->i_atime = inode->i_ctime = zerotime;
+ inode->i_mtime = inode->i_atime = inode_set_ctime_to_ts(inode,
+ zerotime);
/* inode->i_nlink is left 1 - arguably wrong for directories,
but it's the best we can do without reading the directory
contents. 1 yields the right result in GNU find, even
@@ -485,12 +486,16 @@ static void cramfs_kill_sb(struct super_block *sb)
{
struct cramfs_sb_info *sbi = CRAMFS_SB(sb);
+ generic_shutdown_super(sb);
+
if (IS_ENABLED(CONFIG_CRAMFS_MTD) && sb->s_mtd) {
if (sbi && sbi->mtd_point_size)
mtd_unpoint(sb->s_mtd, 0, sbi->mtd_point_size);
- kill_mtd_super(sb);
+ put_mtd_device(sb->s_mtd);
+ sb->s_mtd = NULL;
} else if (IS_ENABLED(CONFIG_CRAMFS_BLOCKDEV) && sb->s_bdev) {
- kill_block_super(sb);
+ sync_blockdev(sb->s_bdev);
+ blkdev_put(sb->s_bdev, sb);
}
kfree(sbi);
}
diff --git a/fs/dax.c b/fs/dax.c
index 906ecbd541a3..8fafecbe42b1 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -30,17 +30,6 @@
#define CREATE_TRACE_POINTS
#include <trace/events/fs_dax.h>
-static inline unsigned int pe_order(enum page_entry_size pe_size)
-{
- if (pe_size == PE_SIZE_PTE)
- return PAGE_SHIFT - PAGE_SHIFT;
- if (pe_size == PE_SIZE_PMD)
- return PMD_SHIFT - PAGE_SHIFT;
- if (pe_size == PE_SIZE_PUD)
- return PUD_SHIFT - PAGE_SHIFT;
- return ~0;
-}
-
/* We choose 4096 entries - same as per-zone page wait tables */
#define DAX_WAIT_TABLE_BITS 12
#define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)
@@ -49,9 +38,6 @@ static inline unsigned int pe_order(enum page_entry_size pe_size)
#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1)
#define PG_PMD_NR (PMD_SIZE >> PAGE_SHIFT)
-/* The order of a PMD entry */
-#define PMD_ORDER (PMD_SHIFT - PAGE_SHIFT)
-
static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];
static int __init init_dax_wait_table(void)
@@ -1908,7 +1894,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
/**
* dax_iomap_fault - handle a page fault on a DAX file
* @vmf: The description of the fault
- * @pe_size: Size of the page to fault in
+ * @order: Order of the page to fault in
* @pfnp: PFN to insert for synchronous faults if fsync is required
* @iomap_errp: Storage for detailed error code in case of error
* @ops: Iomap ops passed from the file system
@@ -1918,17 +1904,15 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
* has done all the necessary locking for page fault to proceed
* successfully.
*/
-vm_fault_t dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
+vm_fault_t dax_iomap_fault(struct vm_fault *vmf, unsigned int order,
pfn_t *pfnp, int *iomap_errp, const struct iomap_ops *ops)
{
- switch (pe_size) {
- case PE_SIZE_PTE:
+ if (order == 0)
return dax_iomap_pte_fault(vmf, pfnp, iomap_errp, ops);
- case PE_SIZE_PMD:
+ else if (order == PMD_ORDER)
return dax_iomap_pmd_fault(vmf, pfnp, ops);
- default:
+ else
return VM_FAULT_FALLBACK;
- }
}
EXPORT_SYMBOL_GPL(dax_iomap_fault);
@@ -1979,19 +1963,18 @@ dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order)
/**
* dax_finish_sync_fault - finish synchronous page fault
* @vmf: The description of the fault
- * @pe_size: Size of entry to be inserted
+ * @order: Order of entry to be inserted
* @pfn: PFN to insert
*
* This function ensures that the file range touched by the page fault is
* stored persistently on the media and handles inserting of appropriate page
* table entry.
*/
-vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf,
- enum page_entry_size pe_size, pfn_t pfn)
+vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf, unsigned int order,
+ pfn_t pfn)
{
int err;
loff_t start = ((loff_t)vmf->pgoff) << PAGE_SHIFT;
- unsigned int order = pe_order(pe_size);
size_t len = PAGE_SIZE << order;
err = vfs_fsync_range(vmf->vma->vm_file, start, start + len - 1, 1);
diff --git a/fs/dcache.c b/fs/dcache.c
index 52e6d5fdab6b..25ac74d30bff 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1664,7 +1664,7 @@ static enum d_walk_ret umount_check(void *_data, struct dentry *dentry)
if (dentry == _data && dentry->d_lockref.count == 1)
return D_WALK_CONTINUE;
- printk(KERN_ERR "BUG: Dentry %p{i=%lx,n=%pd} "
+ WARN(1, "BUG: Dentry %p{i=%lx,n=%pd} "
" still in use (%d) [unmount of %s %s]\n",
dentry,
dentry->d_inode ?
@@ -1673,7 +1673,6 @@ static enum d_walk_ret umount_check(void *_data, struct dentry *dentry)
dentry->d_lockref.count,
dentry->d_sb->s_type->name,
dentry->d_sb->s_id);
- WARN_ON(1);
return D_WALK_CONTINUE;
}
@@ -3247,8 +3246,6 @@ void d_genocide(struct dentry *parent)
d_walk(parent, parent, d_genocide_kill);
}
-EXPORT_SYMBOL(d_genocide);
-
void d_tmpfile(struct file *file, struct inode *inode)
{
struct dentry *dentry = file->f_path.dentry;
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index b7711888dd17..87b3753aa4b1 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -904,8 +904,52 @@ EXPORT_SYMBOL_GPL(debugfs_create_str);
static ssize_t debugfs_write_file_str(struct file *file, const char __user *user_buf,
size_t count, loff_t *ppos)
{
- /* This is really only for read-only strings */
- return -EINVAL;
+ struct dentry *dentry = F_DENTRY(file);
+ char *old, *new = NULL;
+ int pos = *ppos;
+ int r;
+
+ r = debugfs_file_get(dentry);
+ if (unlikely(r))
+ return r;
+
+ old = *(char **)file->private_data;
+
+ /* only allow strict concatenation */
+ r = -EINVAL;
+ if (pos && pos != strlen(old))
+ goto error;
+
+ r = -E2BIG;
+ if (pos + count + 1 > PAGE_SIZE)
+ goto error;
+
+ r = -ENOMEM;
+ new = kmalloc(pos + count + 1, GFP_KERNEL);
+ if (!new)
+ goto error;
+
+ if (pos)
+ memcpy(new, old, pos);
+
+ r = -EFAULT;
+ if (copy_from_user(new + pos, user_buf, count))
+ goto error;
+
+ new[pos + count] = '\0';
+ strim(new);
+
+ rcu_assign_pointer(*(char **)file->private_data, new);
+ synchronize_rcu();
+ kfree(old);
+
+ debugfs_file_put(dentry);
+ return count;
+
+error:
+ kfree(new);
+ debugfs_file_put(dentry);
+ return r;
}
static const struct file_operations fops_str = {
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 3f81f73c241a..83e57e9f9fa0 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -72,8 +72,7 @@ static struct inode *debugfs_get_inode(struct super_block *sb)
struct inode *inode = new_inode(sb);
if (inode) {
inode->i_ino = get_next_ino();
- inode->i_atime = inode->i_mtime =
- inode->i_ctime = current_time(inode);
+ inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
}
return inode;
}
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index fe3db0eda8e4..299c295a27a0 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -338,7 +338,7 @@ static int mknod_ptmx(struct super_block *sb)
}
inode->i_ino = 2;
- inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
mode = S_IFCHR|opts->ptmxmode;
init_special_inode(inode, mode, MKDEV(TTYAUX_MAJOR, 2));
@@ -451,7 +451,7 @@ devpts_fill_super(struct super_block *s, void *data, int silent)
if (!inode)
goto fail;
inode->i_ino = 1;
- inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR;
inode->i_op = &simple_dir_inode_operations;
inode->i_fop = &simple_dir_operations;
@@ -534,12 +534,12 @@ void devpts_kill_index(struct pts_fs_info *fsi, int idx)
/**
* devpts_pty_new -- create a new inode in /dev/pts/
- * @ptmx_inode: inode of the master
- * @device: major+minor of the node to be created
+ * @fsi: Filesystem info for this instance.
* @index: used as a name of the node
* @priv: what's given back by devpts_get_priv
*
- * The created inode is returned. Remove it from /dev/pts/ by devpts_pty_kill.
+ * The dentry for the created inode is returned.
+ * Remove it from /dev/pts/ with devpts_pty_kill().
*/
struct dentry *devpts_pty_new(struct pts_fs_info *fsi, int index, void *priv)
{
@@ -560,7 +560,7 @@ struct dentry *devpts_pty_new(struct pts_fs_info *fsi, int index, void *priv)
inode->i_ino = index + 3;
inode->i_uid = opts->setuid ? opts->uid : current_fsuid();
inode->i_gid = opts->setgid ? opts->gid : current_fsgid();
- inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
init_special_inode(inode, S_IFCHR|opts->mode, MKDEV(UNIX98_PTY_SLAVE_MAJOR, index));
sprintf(s, "%d", index);
@@ -580,7 +580,7 @@ struct dentry *devpts_pty_new(struct pts_fs_info *fsi, int index, void *priv)
/**
* devpts_get_priv -- get private data for a slave
- * @pts_inode: inode of the slave
+ * @dentry: dentry of the slave
*
* Returns whatever was passed as priv in devpts_pty_new for a given inode.
*/
@@ -593,7 +593,7 @@ void *devpts_get_priv(struct dentry *dentry)
/**
* devpts_pty_kill -- remove inode form /dev/pts/
- * @inode: inode of the slave to be removed
+ * @dentry: dentry of the slave to be removed
*
* This is an inverse operation of devpts_pty_new.
*/
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index 2beceff024e3..e55e0a2cd2e8 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -664,7 +664,7 @@ static ssize_t comm_addr_store(struct config_item *item, const char *buf,
memcpy(addr, buf, len);
- rv = dlm_lowcomms_addr(cm->nodeid, addr, len);
+ rv = dlm_midcomms_addr(cm->nodeid, addr, len);
if (rv) {
kfree(addr);
return rv;
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index a1aca41c49d0..5aabcb6f0f15 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -18,6 +18,7 @@
#include "dlm_internal.h"
#include "midcomms.h"
#include "lock.h"
+#include "ast.h"
#define DLM_DEBUG_BUF_LEN 4096
static char debug_buf[DLM_DEBUG_BUF_LEN];
@@ -365,6 +366,52 @@ static void print_format4(struct dlm_rsb *r, struct seq_file *s)
unlock_rsb(r);
}
+static void print_format5_lock(struct seq_file *s, struct dlm_lkb *lkb)
+{
+ struct dlm_callback *cb;
+
+ /* lkb_id lkb_flags mode flags sb_status sb_flags */
+
+ spin_lock(&lkb->lkb_cb_lock);
+ list_for_each_entry(cb, &lkb->lkb_callbacks, list) {
+ seq_printf(s, "%x %x %d %x %d %x\n",
+ lkb->lkb_id,
+ dlm_iflags_val(lkb),
+ cb->mode,
+ cb->flags,
+ cb->sb_status,
+ cb->sb_flags);
+ }
+ spin_unlock(&lkb->lkb_cb_lock);
+}
+
+static void print_format5(struct dlm_rsb *r, struct seq_file *s)
+{
+ struct dlm_lkb *lkb;
+
+ lock_rsb(r);
+
+ list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) {
+ print_format5_lock(s, lkb);
+ if (seq_has_overflowed(s))
+ goto out;
+ }
+
+ list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) {
+ print_format5_lock(s, lkb);
+ if (seq_has_overflowed(s))
+ goto out;
+ }
+
+ list_for_each_entry(lkb, &r->res_waitqueue, lkb_statequeue) {
+ print_format5_lock(s, lkb);
+ if (seq_has_overflowed(s))
+ goto out;
+ }
+ out:
+ unlock_rsb(r);
+}
+
struct rsbtbl_iter {
struct dlm_rsb *rsb;
unsigned bucket;
@@ -408,6 +455,13 @@ static int table_seq_show(struct seq_file *seq, void *iter_ptr)
}
print_format4(ri->rsb, seq);
break;
+ case 5:
+ if (ri->header) {
+ seq_puts(seq, "lkb_id lkb_flags mode flags sb_status sb_flags\n");
+ ri->header = 0;
+ }
+ print_format5(ri->rsb, seq);
+ break;
}
return 0;
@@ -417,6 +471,7 @@ static const struct seq_operations format1_seq_ops;
static const struct seq_operations format2_seq_ops;
static const struct seq_operations format3_seq_ops;
static const struct seq_operations format4_seq_ops;
+static const struct seq_operations format5_seq_ops;
static void *table_seq_start(struct seq_file *seq, loff_t *pos)
{
@@ -448,6 +503,8 @@ static void *table_seq_start(struct seq_file *seq, loff_t *pos)
ri->format = 3;
if (seq->op == &format4_seq_ops)
ri->format = 4;
+ if (seq->op == &format5_seq_ops)
+ ri->format = 5;
tree = toss ? &ls->ls_rsbtbl[bucket].toss : &ls->ls_rsbtbl[bucket].keep;
@@ -602,10 +659,18 @@ static const struct seq_operations format4_seq_ops = {
.show = table_seq_show,
};
+static const struct seq_operations format5_seq_ops = {
+ .start = table_seq_start,
+ .next = table_seq_next,
+ .stop = table_seq_stop,
+ .show = table_seq_show,
+};
+
static const struct file_operations format1_fops;
static const struct file_operations format2_fops;
static const struct file_operations format3_fops;
static const struct file_operations format4_fops;
+static const struct file_operations format5_fops;
static int table_open1(struct inode *inode, struct file *file)
{
@@ -683,7 +748,21 @@ static int table_open4(struct inode *inode, struct file *file)
struct seq_file *seq;
int ret;
- ret = seq_open(file, &format4_seq_ops);
+ ret = seq_open(file, &format5_seq_ops);
+ if (ret)
+ return ret;
+
+ seq = file->private_data;
+ seq->private = inode->i_private; /* the dlm_ls */
+ return 0;
+}
+
+static int table_open5(struct inode *inode, struct file *file)
+{
+ struct seq_file *seq;
+ int ret;
+
+ ret = seq_open(file, &format5_seq_ops);
if (ret)
return ret;
@@ -725,6 +804,14 @@ static const struct file_operations format4_fops = {
.release = seq_release
};
+static const struct file_operations format5_fops = {
+ .owner = THIS_MODULE,
+ .open = table_open5,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release
+};
+
/*
* dump lkb's on the ls_waiters list
*/
@@ -793,6 +880,7 @@ void dlm_delete_debug_file(struct dlm_ls *ls)
debugfs_remove(ls->ls_debug_locks_dentry);
debugfs_remove(ls->ls_debug_all_dentry);
debugfs_remove(ls->ls_debug_toss_dentry);
+ debugfs_remove(ls->ls_debug_queued_asts_dentry);
}
static int dlm_state_show(struct seq_file *file, void *offset)
@@ -936,6 +1024,17 @@ void dlm_create_debug_file(struct dlm_ls *ls)
dlm_root,
ls,
&waiters_fops);
+
+ /* format 5 */
+
+ memset(name, 0, sizeof(name));
+ snprintf(name, DLM_LOCKSPACE_LEN + 8, "%s_queued_asts", ls->ls_name);
+
+ ls->ls_debug_queued_asts_dentry = debugfs_create_file(name,
+ 0644,
+ dlm_root,
+ ls,
+ &format5_fops);
}
void __init dlm_register_debugfs(void)
diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c
index fb1981654bb2..f6acba4310a7 100644
--- a/fs/dlm/dir.c
+++ b/fs/dlm/dir.c
@@ -58,7 +58,7 @@ void dlm_recover_dir_nodeid(struct dlm_ls *ls)
up_read(&ls->ls_root_sem);
}
-int dlm_recover_directory(struct dlm_ls *ls)
+int dlm_recover_directory(struct dlm_ls *ls, uint64_t seq)
{
struct dlm_member *memb;
char *b, *last_name = NULL;
@@ -90,7 +90,7 @@ int dlm_recover_directory(struct dlm_ls *ls)
}
error = dlm_rcom_names(ls, memb->nodeid,
- last_name, last_len);
+ last_name, last_len, seq);
if (error)
goto out_free;
@@ -196,7 +196,8 @@ int dlm_recover_directory(struct dlm_ls *ls)
return error;
}
-static struct dlm_rsb *find_rsb_root(struct dlm_ls *ls, char *name, int len)
+static struct dlm_rsb *find_rsb_root(struct dlm_ls *ls, const char *name,
+ int len)
{
struct dlm_rsb *r;
uint32_t hash, bucket;
@@ -232,7 +233,7 @@ static struct dlm_rsb *find_rsb_root(struct dlm_ls *ls, char *name, int len)
for rsb's we're master of and whose directory node matches the requesting
node. inbuf is the rsb name last sent, inlen is the name's length */
-void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen,
+void dlm_copy_master_names(struct dlm_ls *ls, const char *inbuf, int inlen,
char *outbuf, int outlen, int nodeid)
{
struct list_head *list;
@@ -245,9 +246,8 @@ void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen,
if (inlen > 1) {
r = find_rsb_root(ls, inbuf, inlen);
if (!r) {
- inbuf[inlen - 1] = '\0';
- log_error(ls, "copy_master_names from %d start %d %s",
- nodeid, inlen, inbuf);
+ log_error(ls, "copy_master_names from %d start %d %.*s",
+ nodeid, inlen, inlen, inbuf);
goto out;
}
list = r->res_root_list.next;
diff --git a/fs/dlm/dir.h b/fs/dlm/dir.h
index 03844d086be2..39ecb69d7ef3 100644
--- a/fs/dlm/dir.h
+++ b/fs/dlm/dir.h
@@ -15,9 +15,9 @@
int dlm_dir_nodeid(struct dlm_rsb *rsb);
int dlm_hash2nodeid(struct dlm_ls *ls, uint32_t hash);
void dlm_recover_dir_nodeid(struct dlm_ls *ls);
-int dlm_recover_directory(struct dlm_ls *ls);
-void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen,
- char *outbuf, int outlen, int nodeid);
+int dlm_recover_directory(struct dlm_ls *ls, uint64_t seq);
+void dlm_copy_master_names(struct dlm_ls *ls, const char *inbuf, int inlen,
+ char *outbuf, int outlen, int nodeid);
#endif /* __DIR_DOT_H__ */
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index c8156770205e..dfc444dad329 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -598,6 +598,7 @@ struct dlm_ls {
struct dentry *ls_debug_locks_dentry; /* debugfs */
struct dentry *ls_debug_all_dentry; /* debugfs */
struct dentry *ls_debug_toss_dentry; /* debugfs */
+ struct dentry *ls_debug_queued_asts_dentry; /* debugfs */
wait_queue_head_t ls_uevent_wait; /* user part of join/leave */
int ls_uevent_result;
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index f511a9d7d416..652c51fbbf76 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -86,8 +86,8 @@ static int send_remove(struct dlm_rsb *r);
static int _request_lock(struct dlm_rsb *r, struct dlm_lkb *lkb);
static int _cancel_lock(struct dlm_rsb *r, struct dlm_lkb *lkb);
static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
- struct dlm_message *ms, bool local);
-static int receive_extralen(struct dlm_message *ms);
+ const struct dlm_message *ms, bool local);
+static int receive_extralen(const struct dlm_message *ms);
static void do_purge(struct dlm_ls *ls, int nodeid, int pid);
static void toss_rsb(struct kref *kref);
@@ -984,8 +984,8 @@ static void __dlm_master_lookup(struct dlm_ls *ls, struct dlm_rsb *r, int our_no
* . dlm_master_lookup RECOVER_MASTER (fix_master 1, from_master 0)
*/
-int dlm_master_lookup(struct dlm_ls *ls, int from_nodeid, char *name, int len,
- unsigned int flags, int *r_nodeid, int *result)
+int dlm_master_lookup(struct dlm_ls *ls, int from_nodeid, const char *name,
+ int len, unsigned int flags, int *r_nodeid, int *result)
{
struct dlm_rsb *r = NULL;
uint32_t hash, b;
@@ -1106,7 +1106,7 @@ static void dlm_dump_rsb_hash(struct dlm_ls *ls, uint32_t hash)
}
}
-void dlm_dump_rsb_name(struct dlm_ls *ls, char *name, int len)
+void dlm_dump_rsb_name(struct dlm_ls *ls, const char *name, int len)
{
struct dlm_rsb *r = NULL;
uint32_t hash, b;
@@ -1459,7 +1459,7 @@ static int add_to_waiters(struct dlm_lkb *lkb, int mstype, int to_nodeid)
set RESEND and dlm_recover_waiters_post() */
static int _remove_from_waiters(struct dlm_lkb *lkb, int mstype,
- struct dlm_message *ms)
+ const struct dlm_message *ms)
{
struct dlm_ls *ls = lkb->lkb_resource->res_ls;
int overlap_done = 0;
@@ -1557,8 +1557,8 @@ static int remove_from_waiters(struct dlm_lkb *lkb, int mstype)
/* Handles situations where we might be processing a "fake" or "local" reply in
which we can't try to take waiters_mutex again. */
-static int remove_from_waiters_ms(struct dlm_lkb *lkb, struct dlm_message *ms,
- bool local)
+static int remove_from_waiters_ms(struct dlm_lkb *lkb,
+ const struct dlm_message *ms, bool local)
{
struct dlm_ls *ls = lkb->lkb_resource->res_ls;
int error;
@@ -1800,7 +1800,7 @@ static void set_lvb_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
/* lkb is process copy (pc) */
static void set_lvb_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb,
- struct dlm_message *ms)
+ const struct dlm_message *ms)
{
int b;
@@ -1907,7 +1907,7 @@ static void grant_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
}
static void grant_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb,
- struct dlm_message *ms)
+ const struct dlm_message *ms)
{
set_lvb_lock_pc(r, lkb, ms);
_grant_lock(r, lkb);
@@ -1945,7 +1945,7 @@ static void munge_demoted(struct dlm_lkb *lkb)
lkb->lkb_grmode = DLM_LOCK_NL;
}
-static void munge_altmode(struct dlm_lkb *lkb, struct dlm_message *ms)
+static void munge_altmode(struct dlm_lkb *lkb, const struct dlm_message *ms)
{
if (ms->m_type != cpu_to_le32(DLM_MSG_REQUEST_REPLY) &&
ms->m_type != cpu_to_le32(DLM_MSG_GRANT)) {
@@ -3641,8 +3641,9 @@ static int send_cancel_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
return send_common_reply(r, lkb, DLM_MSG_CANCEL_REPLY, rv);
}
-static int send_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms_in,
- int ret_nodeid, int rv)
+static int send_lookup_reply(struct dlm_ls *ls,
+ const struct dlm_message *ms_in, int ret_nodeid,
+ int rv)
{
struct dlm_rsb *r = &ls->ls_local_rsb;
struct dlm_message *ms;
@@ -3667,14 +3668,15 @@ static int send_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms_in,
of message, unlike the send side where we can safely send everything about
the lkb for any type of message */
-static void receive_flags(struct dlm_lkb *lkb, struct dlm_message *ms)
+static void receive_flags(struct dlm_lkb *lkb, const struct dlm_message *ms)
{
lkb->lkb_exflags = le32_to_cpu(ms->m_exflags);
dlm_set_sbflags_val(lkb, le32_to_cpu(ms->m_sbflags));
dlm_set_dflags_val(lkb, le32_to_cpu(ms->m_flags));
}
-static void receive_flags_reply(struct dlm_lkb *lkb, struct dlm_message *ms,
+static void receive_flags_reply(struct dlm_lkb *lkb,
+ const struct dlm_message *ms,
bool local)
{
if (local)
@@ -3684,14 +3686,14 @@ static void receive_flags_reply(struct dlm_lkb *lkb, struct dlm_message *ms,
dlm_set_dflags_val(lkb, le32_to_cpu(ms->m_flags));
}
-static int receive_extralen(struct dlm_message *ms)
+static int receive_extralen(const struct dlm_message *ms)
{
return (le16_to_cpu(ms->m_header.h_length) -
sizeof(struct dlm_message));
}
static int receive_lvb(struct dlm_ls *ls, struct dlm_lkb *lkb,
- struct dlm_message *ms)
+ const struct dlm_message *ms)
{
int len;
@@ -3719,7 +3721,7 @@ static void fake_astfn(void *astparam)
}
static int receive_request_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
- struct dlm_message *ms)
+ const struct dlm_message *ms)
{
lkb->lkb_nodeid = le32_to_cpu(ms->m_header.h_nodeid);
lkb->lkb_ownpid = le32_to_cpu(ms->m_pid);
@@ -3741,7 +3743,7 @@ static int receive_request_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
}
static int receive_convert_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
- struct dlm_message *ms)
+ const struct dlm_message *ms)
{
if (lkb->lkb_status != DLM_LKSTS_GRANTED)
return -EBUSY;
@@ -3756,7 +3758,7 @@ static int receive_convert_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
}
static int receive_unlock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
- struct dlm_message *ms)
+ const struct dlm_message *ms)
{
if (receive_lvb(ls, lkb, ms))
return -ENOMEM;
@@ -3766,7 +3768,7 @@ static int receive_unlock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
/* We fill in the local-lkb fields with the info that send_xxxx_reply()
uses to send a reply and that the remote end uses to process the reply. */
-static void setup_local_lkb(struct dlm_ls *ls, struct dlm_message *ms)
+static void setup_local_lkb(struct dlm_ls *ls, const struct dlm_message *ms)
{
struct dlm_lkb *lkb = &ls->ls_local_lkb;
lkb->lkb_nodeid = le32_to_cpu(ms->m_header.h_nodeid);
@@ -3776,7 +3778,7 @@ static void setup_local_lkb(struct dlm_ls *ls, struct dlm_message *ms)
/* This is called after the rsb is locked so that we can safely inspect
fields in the lkb. */
-static int validate_message(struct dlm_lkb *lkb, struct dlm_message *ms)
+static int validate_message(struct dlm_lkb *lkb, const struct dlm_message *ms)
{
int from = le32_to_cpu(ms->m_header.h_nodeid);
int error = 0;
@@ -3828,7 +3830,7 @@ out:
return error;
}
-static int receive_request(struct dlm_ls *ls, struct dlm_message *ms)
+static int receive_request(struct dlm_ls *ls, const struct dlm_message *ms)
{
struct dlm_lkb *lkb;
struct dlm_rsb *r;
@@ -3907,7 +3909,7 @@ static int receive_request(struct dlm_ls *ls, struct dlm_message *ms)
return error;
}
-static int receive_convert(struct dlm_ls *ls, struct dlm_message *ms)
+static int receive_convert(struct dlm_ls *ls, const struct dlm_message *ms)
{
struct dlm_lkb *lkb;
struct dlm_rsb *r;
@@ -3963,7 +3965,7 @@ static int receive_convert(struct dlm_ls *ls, struct dlm_message *ms)
return error;
}
-static int receive_unlock(struct dlm_ls *ls, struct dlm_message *ms)
+static int receive_unlock(struct dlm_ls *ls, const struct dlm_message *ms)
{
struct dlm_lkb *lkb;
struct dlm_rsb *r;
@@ -4015,7 +4017,7 @@ static int receive_unlock(struct dlm_ls *ls, struct dlm_message *ms)
return error;
}
-static int receive_cancel(struct dlm_ls *ls, struct dlm_message *ms)
+static int receive_cancel(struct dlm_ls *ls, const struct dlm_message *ms)
{
struct dlm_lkb *lkb;
struct dlm_rsb *r;
@@ -4051,7 +4053,7 @@ static int receive_cancel(struct dlm_ls *ls, struct dlm_message *ms)
return error;
}
-static int receive_grant(struct dlm_ls *ls, struct dlm_message *ms)
+static int receive_grant(struct dlm_ls *ls, const struct dlm_message *ms)
{
struct dlm_lkb *lkb;
struct dlm_rsb *r;
@@ -4082,7 +4084,7 @@ static int receive_grant(struct dlm_ls *ls, struct dlm_message *ms)
return 0;
}
-static int receive_bast(struct dlm_ls *ls, struct dlm_message *ms)
+static int receive_bast(struct dlm_ls *ls, const struct dlm_message *ms)
{
struct dlm_lkb *lkb;
struct dlm_rsb *r;
@@ -4110,7 +4112,7 @@ static int receive_bast(struct dlm_ls *ls, struct dlm_message *ms)
return 0;
}
-static void receive_lookup(struct dlm_ls *ls, struct dlm_message *ms)
+static void receive_lookup(struct dlm_ls *ls, const struct dlm_message *ms)
{
int len, error, ret_nodeid, from_nodeid, our_nodeid;
@@ -4130,7 +4132,7 @@ static void receive_lookup(struct dlm_ls *ls, struct dlm_message *ms)
send_lookup_reply(ls, ms, ret_nodeid, error);
}
-static void receive_remove(struct dlm_ls *ls, struct dlm_message *ms)
+static void receive_remove(struct dlm_ls *ls, const struct dlm_message *ms)
{
char name[DLM_RESNAME_MAXLEN+1];
struct dlm_rsb *r;
@@ -4218,12 +4220,13 @@ static void receive_remove(struct dlm_ls *ls, struct dlm_message *ms)
}
}
-static void receive_purge(struct dlm_ls *ls, struct dlm_message *ms)
+static void receive_purge(struct dlm_ls *ls, const struct dlm_message *ms)
{
do_purge(ls, le32_to_cpu(ms->m_nodeid), le32_to_cpu(ms->m_pid));
}
-static int receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms)
+static int receive_request_reply(struct dlm_ls *ls,
+ const struct dlm_message *ms)
{
struct dlm_lkb *lkb;
struct dlm_rsb *r;
@@ -4345,7 +4348,7 @@ static int receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms)
}
static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
- struct dlm_message *ms, bool local)
+ const struct dlm_message *ms, bool local)
{
/* this is the value returned from do_convert() on the master */
switch (from_dlm_errno(le32_to_cpu(ms->m_result))) {
@@ -4388,8 +4391,8 @@ static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
}
}
-static void _receive_convert_reply(struct dlm_lkb *lkb, struct dlm_message *ms,
- bool local)
+static void _receive_convert_reply(struct dlm_lkb *lkb,
+ const struct dlm_message *ms, bool local)
{
struct dlm_rsb *r = lkb->lkb_resource;
int error;
@@ -4412,7 +4415,8 @@ static void _receive_convert_reply(struct dlm_lkb *lkb, struct dlm_message *ms,
put_rsb(r);
}
-static int receive_convert_reply(struct dlm_ls *ls, struct dlm_message *ms)
+static int receive_convert_reply(struct dlm_ls *ls,
+ const struct dlm_message *ms)
{
struct dlm_lkb *lkb;
int error;
@@ -4426,8 +4430,8 @@ static int receive_convert_reply(struct dlm_ls *ls, struct dlm_message *ms)
return 0;
}
-static void _receive_unlock_reply(struct dlm_lkb *lkb, struct dlm_message *ms,
- bool local)
+static void _receive_unlock_reply(struct dlm_lkb *lkb,
+ const struct dlm_message *ms, bool local)
{
struct dlm_rsb *r = lkb->lkb_resource;
int error;
@@ -4463,7 +4467,8 @@ static void _receive_unlock_reply(struct dlm_lkb *lkb, struct dlm_message *ms,
put_rsb(r);
}
-static int receive_unlock_reply(struct dlm_ls *ls, struct dlm_message *ms)
+static int receive_unlock_reply(struct dlm_ls *ls,
+ const struct dlm_message *ms)
{
struct dlm_lkb *lkb;
int error;
@@ -4477,8 +4482,8 @@ static int receive_unlock_reply(struct dlm_ls *ls, struct dlm_message *ms)
return 0;
}
-static void _receive_cancel_reply(struct dlm_lkb *lkb, struct dlm_message *ms,
- bool local)
+static void _receive_cancel_reply(struct dlm_lkb *lkb,
+ const struct dlm_message *ms, bool local)
{
struct dlm_rsb *r = lkb->lkb_resource;
int error;
@@ -4515,7 +4520,8 @@ static void _receive_cancel_reply(struct dlm_lkb *lkb, struct dlm_message *ms,
put_rsb(r);
}
-static int receive_cancel_reply(struct dlm_ls *ls, struct dlm_message *ms)
+static int receive_cancel_reply(struct dlm_ls *ls,
+ const struct dlm_message *ms)
{
struct dlm_lkb *lkb;
int error;
@@ -4529,7 +4535,8 @@ static int receive_cancel_reply(struct dlm_ls *ls, struct dlm_message *ms)
return 0;
}
-static void receive_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms)
+static void receive_lookup_reply(struct dlm_ls *ls,
+ const struct dlm_message *ms)
{
struct dlm_lkb *lkb;
struct dlm_rsb *r;
@@ -4608,7 +4615,7 @@ static void receive_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms)
dlm_put_lkb(lkb);
}
-static void _receive_message(struct dlm_ls *ls, struct dlm_message *ms,
+static void _receive_message(struct dlm_ls *ls, const struct dlm_message *ms,
uint32_t saved_seq)
{
int error = 0, noent = 0;
@@ -4744,7 +4751,7 @@ static void _receive_message(struct dlm_ls *ls, struct dlm_message *ms,
requestqueue, to processing all the saved messages, to processing new
messages as they arrive. */
-static void dlm_receive_message(struct dlm_ls *ls, struct dlm_message *ms,
+static void dlm_receive_message(struct dlm_ls *ls, const struct dlm_message *ms,
int nodeid)
{
if (dlm_locking_stopped(ls)) {
@@ -4767,7 +4774,7 @@ static void dlm_receive_message(struct dlm_ls *ls, struct dlm_message *ms,
/* This is called by dlm_recoverd to process messages that were saved on
the requestqueue. */
-void dlm_receive_message_saved(struct dlm_ls *ls, struct dlm_message *ms,
+void dlm_receive_message_saved(struct dlm_ls *ls, const struct dlm_message *ms,
uint32_t saved_seq)
{
_receive_message(ls, ms, saved_seq);
@@ -4778,9 +4785,9 @@ void dlm_receive_message_saved(struct dlm_ls *ls, struct dlm_message *ms,
standard locking activity) or an RCOM (recovery message sent as part of
lockspace recovery). */
-void dlm_receive_buffer(union dlm_packet *p, int nodeid)
+void dlm_receive_buffer(const union dlm_packet *p, int nodeid)
{
- struct dlm_header *hd = &p->header;
+ const struct dlm_header *hd = &p->header;
struct dlm_ls *ls;
int type = 0;
@@ -5334,7 +5341,7 @@ static struct dlm_lkb *search_remid(struct dlm_rsb *r, int nodeid,
/* needs at least dlm_rcom + rcom_lock */
static int receive_rcom_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
- struct dlm_rsb *r, struct dlm_rcom *rc)
+ struct dlm_rsb *r, const struct dlm_rcom *rc)
{
struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf;
@@ -5384,7 +5391,8 @@ static int receive_rcom_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
back the rcom_lock struct we got but with the remid field filled in. */
/* needs at least dlm_rcom + rcom_lock */
-int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
+int dlm_recover_master_copy(struct dlm_ls *ls, const struct dlm_rcom *rc,
+ __le32 *rl_remid, __le32 *rl_result)
{
struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf;
struct dlm_rsb *r;
@@ -5393,6 +5401,9 @@ int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
int from_nodeid = le32_to_cpu(rc->rc_header.h_nodeid);
int error;
+ /* init rl_remid with rcom lock rl_remid */
+ *rl_remid = rl->rl_remid;
+
if (rl->rl_parent_lkid) {
error = -EOPNOTSUPP;
goto out;
@@ -5448,7 +5459,7 @@ int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
out_remid:
/* this is the new value returned to the lock holder for
saving in its process-copy lkb */
- rl->rl_remid = cpu_to_le32(lkb->lkb_id);
+ *rl_remid = cpu_to_le32(lkb->lkb_id);
lkb->lkb_recover_seq = ls->ls_recover_seq;
@@ -5459,12 +5470,13 @@ int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
if (error && error != -EEXIST)
log_rinfo(ls, "dlm_recover_master_copy remote %d %x error %d",
from_nodeid, remid, error);
- rl->rl_result = cpu_to_le32(error);
+ *rl_result = cpu_to_le32(error);
return error;
}
/* needs at least dlm_rcom + rcom_lock */
-int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
+int dlm_recover_process_copy(struct dlm_ls *ls, const struct dlm_rcom *rc,
+ uint64_t seq)
{
struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf;
struct dlm_rsb *r;
@@ -5509,7 +5521,7 @@ int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
lkid, le32_to_cpu(rc->rc_header.h_nodeid), remid,
result);
- dlm_send_rcom_lock(r, lkb);
+ dlm_send_rcom_lock(r, lkb, seq);
goto out;
case -EEXIST:
case 0:
diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h
index aa5ad44d902b..b54e2cbbe6e2 100644
--- a/fs/dlm/lock.h
+++ b/fs/dlm/lock.h
@@ -12,11 +12,11 @@
#define __LOCK_DOT_H__
void dlm_dump_rsb(struct dlm_rsb *r);
-void dlm_dump_rsb_name(struct dlm_ls *ls, char *name, int len);
+void dlm_dump_rsb_name(struct dlm_ls *ls, const char *name, int len);
void dlm_print_lkb(struct dlm_lkb *lkb);
-void dlm_receive_message_saved(struct dlm_ls *ls, struct dlm_message *ms,
+void dlm_receive_message_saved(struct dlm_ls *ls, const struct dlm_message *ms,
uint32_t saved_seq);
-void dlm_receive_buffer(union dlm_packet *p, int nodeid);
+void dlm_receive_buffer(const union dlm_packet *p, int nodeid);
int dlm_modes_compat(int mode1, int mode2);
void dlm_put_rsb(struct dlm_rsb *r);
void dlm_hold_rsb(struct dlm_rsb *r);
@@ -25,8 +25,8 @@ void dlm_scan_rsbs(struct dlm_ls *ls);
int dlm_lock_recovery_try(struct dlm_ls *ls);
void dlm_unlock_recovery(struct dlm_ls *ls);
-int dlm_master_lookup(struct dlm_ls *ls, int nodeid, char *name, int len,
- unsigned int flags, int *r_nodeid, int *result);
+int dlm_master_lookup(struct dlm_ls *ls, int from_nodeid, const char *name,
+ int len, unsigned int flags, int *r_nodeid, int *result);
int dlm_search_rsb_tree(struct rb_root *tree, const void *name, int len,
struct dlm_rsb **r_ret);
@@ -36,8 +36,10 @@ void dlm_purge_mstcpy_locks(struct dlm_rsb *r);
void dlm_recover_grant(struct dlm_ls *ls);
int dlm_recover_waiters_post(struct dlm_ls *ls);
void dlm_recover_waiters_pre(struct dlm_ls *ls);
-int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc);
-int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc);
+int dlm_recover_master_copy(struct dlm_ls *ls, const struct dlm_rcom *rc,
+ __le32 *rl_remid, __le32 *rl_result);
+int dlm_recover_process_copy(struct dlm_ls *ls, const struct dlm_rcom *rc,
+ uint64_t seq);
int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, int mode,
uint32_t flags, void *name, unsigned int namelen);
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 9f14ea9f6322..f7bc22e74db2 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -863,7 +863,6 @@ struct dlm_processed_nodes {
static void process_dlm_messages(struct work_struct *work)
{
struct processqueue_entry *pentry;
- LIST_HEAD(processed_nodes);
spin_lock(&processqueue_lock);
pentry = list_first_entry_or_null(&processqueue,
diff --git a/fs/dlm/member.c b/fs/dlm/member.c
index 77d202e4a02a..be7909ead71b 100644
--- a/fs/dlm/member.c
+++ b/fs/dlm/member.c
@@ -18,7 +18,7 @@
#include "midcomms.h"
#include "lowcomms.h"
-int dlm_slots_version(struct dlm_header *h)
+int dlm_slots_version(const struct dlm_header *h)
{
if ((le32_to_cpu(h->h_version) & 0x0000FFFF) < DLM_HEADER_SLOTS)
return 0;
@@ -393,14 +393,9 @@ static void remove_remote_member(int nodeid)
dlm_midcomms_remove_member(nodeid);
}
-static void clear_members_cb(int nodeid)
-{
- remove_remote_member(nodeid);
-}
-
void dlm_clear_members(struct dlm_ls *ls)
{
- clear_memb_list(&ls->ls_nodes, clear_members_cb);
+ clear_memb_list(&ls->ls_nodes, remove_remote_member);
ls->ls_num_nodes = 0;
}
@@ -454,7 +449,7 @@ static void make_member_array(struct dlm_ls *ls)
/* send a status request to all members just to establish comms connections */
-static int ping_members(struct dlm_ls *ls)
+static int ping_members(struct dlm_ls *ls, uint64_t seq)
{
struct dlm_member *memb;
int error = 0;
@@ -464,7 +459,7 @@ static int ping_members(struct dlm_ls *ls)
error = -EINTR;
break;
}
- error = dlm_rcom_status(ls, memb->nodeid, 0);
+ error = dlm_rcom_status(ls, memb->nodeid, 0, seq);
if (error)
break;
}
@@ -612,7 +607,7 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
make_member_array(ls);
*neg_out = neg;
- error = ping_members(ls);
+ error = ping_members(ls, rv->seq);
log_rinfo(ls, "dlm_recover_members %d nodes", ls->ls_num_nodes);
return error;
}
diff --git a/fs/dlm/member.h b/fs/dlm/member.h
index 433b2fac9f4a..f61cfde46314 100644
--- a/fs/dlm/member.h
+++ b/fs/dlm/member.h
@@ -18,7 +18,7 @@ void dlm_clear_members_gone(struct dlm_ls *ls);
int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv,int *neg_out);
int dlm_is_removed(struct dlm_ls *ls, int nodeid);
int dlm_is_member(struct dlm_ls *ls, int nodeid);
-int dlm_slots_version(struct dlm_header *h);
+int dlm_slots_version(const struct dlm_header *h);
void dlm_slot_save(struct dlm_ls *ls, struct dlm_rcom *rc,
struct dlm_member *memb);
void dlm_slots_copy_out(struct dlm_ls *ls, struct dlm_rcom *rc);
diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c
index e1a0df67b566..f641b36a36db 100644
--- a/fs/dlm/midcomms.c
+++ b/fs/dlm/midcomms.c
@@ -330,18 +330,23 @@ static void midcomms_node_reset(struct midcomms_node *node)
wake_up(&node->shutdown_wait);
}
-static struct midcomms_node *nodeid2node(int nodeid, gfp_t alloc)
+static struct midcomms_node *nodeid2node(int nodeid)
{
- struct midcomms_node *node, *tmp;
- int r = nodeid_hash(nodeid);
+ return __find_node(nodeid, nodeid_hash(nodeid));
+}
+
+int dlm_midcomms_addr(int nodeid, struct sockaddr_storage *addr, int len)
+{
+ int ret, r = nodeid_hash(nodeid);
+ struct midcomms_node *node;
- node = __find_node(nodeid, r);
- if (node || !alloc)
- return node;
+ ret = dlm_lowcomms_addr(nodeid, addr, len);
+ if (ret)
+ return ret;
- node = kmalloc(sizeof(*node), alloc);
+ node = kmalloc(sizeof(*node), GFP_NOFS);
if (!node)
- return NULL;
+ return -ENOMEM;
node->nodeid = nodeid;
spin_lock_init(&node->state_lock);
@@ -353,21 +358,11 @@ static struct midcomms_node *nodeid2node(int nodeid, gfp_t alloc)
midcomms_node_reset(node);
spin_lock(&nodes_lock);
- /* check again if there was somebody else
- * earlier here to add the node
- */
- tmp = __find_node(nodeid, r);
- if (tmp) {
- spin_unlock(&nodes_lock);
- kfree(node);
- return tmp;
- }
-
hlist_add_head_rcu(&node->hlist, &node_hash[r]);
spin_unlock(&nodes_lock);
node->debugfs = dlm_create_debug_comms_file(nodeid, node);
- return node;
+ return 0;
}
static int dlm_send_ack(int nodeid, uint32_t seq)
@@ -499,7 +494,8 @@ static void dlm_pas_fin_ack_rcv(struct midcomms_node *node)
spin_unlock(&node->state_lock);
}
-static void dlm_receive_buffer_3_2_trace(uint32_t seq, union dlm_packet *p)
+static void dlm_receive_buffer_3_2_trace(uint32_t seq,
+ const union dlm_packet *p)
{
switch (p->header.h_cmd) {
case DLM_MSG:
@@ -513,7 +509,7 @@ static void dlm_receive_buffer_3_2_trace(uint32_t seq, union dlm_packet *p)
}
}
-static void dlm_midcomms_receive_buffer(union dlm_packet *p,
+static void dlm_midcomms_receive_buffer(const union dlm_packet *p,
struct midcomms_node *node,
uint32_t seq)
{
@@ -602,113 +598,8 @@ static void dlm_midcomms_receive_buffer(union dlm_packet *p,
}
}
-static struct midcomms_node *
-dlm_midcomms_recv_node_lookup(int nodeid, const union dlm_packet *p,
- uint16_t msglen, int (*cb)(struct midcomms_node *node))
-{
- struct midcomms_node *node = NULL;
- gfp_t allocation = 0;
- int ret;
-
- switch (p->header.h_cmd) {
- case DLM_RCOM:
- if (msglen < sizeof(struct dlm_rcom)) {
- log_print("rcom msg too small: %u, will skip this message from node %d",
- msglen, nodeid);
- return NULL;
- }
-
- switch (p->rcom.rc_type) {
- case cpu_to_le32(DLM_RCOM_NAMES):
- fallthrough;
- case cpu_to_le32(DLM_RCOM_NAMES_REPLY):
- fallthrough;
- case cpu_to_le32(DLM_RCOM_STATUS):
- fallthrough;
- case cpu_to_le32(DLM_RCOM_STATUS_REPLY):
- node = nodeid2node(nodeid, 0);
- if (node) {
- spin_lock(&node->state_lock);
- if (node->state != DLM_ESTABLISHED)
- pr_debug("receive begin RCOM msg from node %d with state %s\n",
- node->nodeid, dlm_state_str(node->state));
-
- switch (node->state) {
- case DLM_CLOSED:
- node->state = DLM_ESTABLISHED;
- pr_debug("switch node %d to state %s\n",
- node->nodeid, dlm_state_str(node->state));
- break;
- case DLM_ESTABLISHED:
- break;
- default:
- spin_unlock(&node->state_lock);
- return NULL;
- }
- spin_unlock(&node->state_lock);
- }
-
- allocation = GFP_NOFS;
- break;
- default:
- break;
- }
-
- break;
- default:
- break;
- }
-
- node = nodeid2node(nodeid, allocation);
- if (!node) {
- switch (p->header.h_cmd) {
- case DLM_OPTS:
- if (msglen < sizeof(struct dlm_opts)) {
- log_print("opts msg too small: %u, will skip this message from node %d",
- msglen, nodeid);
- return NULL;
- }
-
- log_print_ratelimited("received dlm opts message nextcmd %d from node %d in an invalid sequence",
- p->opts.o_nextcmd, nodeid);
- break;
- default:
- log_print_ratelimited("received dlm message cmd %d from node %d in an invalid sequence",
- p->header.h_cmd, nodeid);
- break;
- }
-
- return NULL;
- }
-
- ret = cb(node);
- if (ret < 0)
- return NULL;
-
- return node;
-}
-
-static int dlm_midcomms_version_check_3_2(struct midcomms_node *node)
-{
- switch (node->version) {
- case DLM_VERSION_NOT_SET:
- node->version = DLM_VERSION_3_2;
- wake_up(&node->shutdown_wait);
- log_print("version 0x%08x for node %d detected", DLM_VERSION_3_2,
- node->nodeid);
- break;
- case DLM_VERSION_3_2:
- break;
- default:
- log_print_ratelimited("version mismatch detected, assumed 0x%08x but node %d has 0x%08x",
- DLM_VERSION_3_2, node->nodeid, node->version);
- return -1;
- }
-
- return 0;
-}
-
-static int dlm_opts_check_msglen(union dlm_packet *p, uint16_t msglen, int nodeid)
+static int dlm_opts_check_msglen(const union dlm_packet *p, uint16_t msglen,
+ int nodeid)
{
int len = msglen;
@@ -757,7 +648,7 @@ static int dlm_opts_check_msglen(union dlm_packet *p, uint16_t msglen, int nodei
return 0;
}
-static void dlm_midcomms_receive_buffer_3_2(union dlm_packet *p, int nodeid)
+static void dlm_midcomms_receive_buffer_3_2(const union dlm_packet *p, int nodeid)
{
uint16_t msglen = le16_to_cpu(p->header.h_length);
struct midcomms_node *node;
@@ -765,10 +656,37 @@ static void dlm_midcomms_receive_buffer_3_2(union dlm_packet *p, int nodeid)
int ret, idx;
idx = srcu_read_lock(&nodes_srcu);
- node = dlm_midcomms_recv_node_lookup(nodeid, p, msglen,
- dlm_midcomms_version_check_3_2);
- if (!node)
+ node = nodeid2node(nodeid);
+ if (WARN_ON_ONCE(!node))
+ goto out;
+
+ switch (node->version) {
+ case DLM_VERSION_NOT_SET:
+ node->version = DLM_VERSION_3_2;
+ wake_up(&node->shutdown_wait);
+ log_print("version 0x%08x for node %d detected", DLM_VERSION_3_2,
+ node->nodeid);
+
+ spin_lock(&node->state_lock);
+ switch (node->state) {
+ case DLM_CLOSED:
+ node->state = DLM_ESTABLISHED;
+ pr_debug("switch node %d to state %s\n",
+ node->nodeid, dlm_state_str(node->state));
+ break;
+ default:
+ break;
+ }
+ spin_unlock(&node->state_lock);
+
+ break;
+ case DLM_VERSION_3_2:
+ break;
+ default:
+ log_print_ratelimited("version mismatch detected, assumed 0x%08x but node %d has 0x%08x",
+ DLM_VERSION_3_2, node->nodeid, node->version);
goto out;
+ }
switch (p->header.h_cmd) {
case DLM_RCOM:
@@ -858,8 +776,19 @@ out:
srcu_read_unlock(&nodes_srcu, idx);
}
-static int dlm_midcomms_version_check_3_1(struct midcomms_node *node)
+static void dlm_midcomms_receive_buffer_3_1(const union dlm_packet *p, int nodeid)
{
+ uint16_t msglen = le16_to_cpu(p->header.h_length);
+ struct midcomms_node *node;
+ int idx;
+
+ idx = srcu_read_lock(&nodes_srcu);
+ node = nodeid2node(nodeid);
+ if (WARN_ON_ONCE(!node)) {
+ srcu_read_unlock(&nodes_srcu, idx);
+ return;
+ }
+
switch (node->version) {
case DLM_VERSION_NOT_SET:
node->version = DLM_VERSION_3_1;
@@ -872,22 +801,6 @@ static int dlm_midcomms_version_check_3_1(struct midcomms_node *node)
default:
log_print_ratelimited("version mismatch detected, assumed 0x%08x but node %d has 0x%08x",
DLM_VERSION_3_1, node->nodeid, node->version);
- return -1;
- }
-
- return 0;
-}
-
-static void dlm_midcomms_receive_buffer_3_1(union dlm_packet *p, int nodeid)
-{
- uint16_t msglen = le16_to_cpu(p->header.h_length);
- struct midcomms_node *node;
- int idx;
-
- idx = srcu_read_lock(&nodes_srcu);
- node = dlm_midcomms_recv_node_lookup(nodeid, p, msglen,
- dlm_midcomms_version_check_3_1);
- if (!node) {
srcu_read_unlock(&nodes_srcu, idx);
return;
}
@@ -977,10 +890,10 @@ int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int len)
switch (hd->h_version) {
case cpu_to_le32(DLM_VERSION_3_1):
- dlm_midcomms_receive_buffer_3_1((union dlm_packet *)ptr, nodeid);
+ dlm_midcomms_receive_buffer_3_1((const union dlm_packet *)ptr, nodeid);
break;
case cpu_to_le32(DLM_VERSION_3_2):
- dlm_midcomms_receive_buffer_3_2((union dlm_packet *)ptr, nodeid);
+ dlm_midcomms_receive_buffer_3_2((const union dlm_packet *)ptr, nodeid);
break;
default:
log_print("received invalid version header: %u from node %d, will skip this message",
@@ -1003,8 +916,8 @@ void dlm_midcomms_unack_msg_resend(int nodeid)
int idx, ret;
idx = srcu_read_lock(&nodes_srcu);
- node = nodeid2node(nodeid, 0);
- if (!node) {
+ node = nodeid2node(nodeid);
+ if (WARN_ON_ONCE(!node)) {
srcu_read_unlock(&nodes_srcu, idx);
return;
}
@@ -1090,11 +1003,9 @@ struct dlm_mhandle *dlm_midcomms_get_mhandle(int nodeid, int len,
int idx;
idx = srcu_read_lock(&nodes_srcu);
- node = nodeid2node(nodeid, 0);
- if (!node) {
- WARN_ON_ONCE(1);
+ node = nodeid2node(nodeid);
+ if (WARN_ON_ONCE(!node))
goto err;
- }
/* this is a bug, however we going on and hope it will be resolved */
WARN_ON_ONCE(test_bit(DLM_NODE_FLAG_STOP_TX, &node->flags));
@@ -1235,8 +1146,34 @@ void dlm_midcomms_init(void)
dlm_lowcomms_init();
}
+static void midcomms_node_release(struct rcu_head *rcu)
+{
+ struct midcomms_node *node = container_of(rcu, struct midcomms_node, rcu);
+
+ WARN_ON_ONCE(atomic_read(&node->send_queue_cnt));
+ dlm_send_queue_flush(node);
+ kfree(node);
+}
+
void dlm_midcomms_exit(void)
{
+ struct midcomms_node *node;
+ int i, idx;
+
+ idx = srcu_read_lock(&nodes_srcu);
+ for (i = 0; i < CONN_HASH_SIZE; i++) {
+ hlist_for_each_entry_rcu(node, &node_hash[i], hlist) {
+ dlm_delete_debug_comms_file(node->debugfs);
+
+ spin_lock(&nodes_lock);
+ hlist_del_rcu(&node->hlist);
+ spin_unlock(&nodes_lock);
+
+ call_srcu(&nodes_srcu, &node->rcu, midcomms_node_release);
+ }
+ }
+ srcu_read_unlock(&nodes_srcu, idx);
+
dlm_lowcomms_exit();
}
@@ -1277,8 +1214,8 @@ void dlm_midcomms_add_member(int nodeid)
int idx;
idx = srcu_read_lock(&nodes_srcu);
- node = nodeid2node(nodeid, GFP_NOFS);
- if (!node) {
+ node = nodeid2node(nodeid);
+ if (WARN_ON_ONCE(!node)) {
srcu_read_unlock(&nodes_srcu, idx);
return;
}
@@ -1322,8 +1259,8 @@ void dlm_midcomms_remove_member(int nodeid)
int idx;
idx = srcu_read_lock(&nodes_srcu);
- node = nodeid2node(nodeid, 0);
- if (!node) {
+ node = nodeid2node(nodeid);
+ if (WARN_ON_ONCE(!node)) {
srcu_read_unlock(&nodes_srcu, idx);
return;
}
@@ -1367,15 +1304,6 @@ void dlm_midcomms_remove_member(int nodeid)
srcu_read_unlock(&nodes_srcu, idx);
}
-static void midcomms_node_release(struct rcu_head *rcu)
-{
- struct midcomms_node *node = container_of(rcu, struct midcomms_node, rcu);
-
- WARN_ON_ONCE(atomic_read(&node->send_queue_cnt));
- dlm_send_queue_flush(node);
- kfree(node);
-}
-
void dlm_midcomms_version_wait(void)
{
struct midcomms_node *node;
@@ -1438,7 +1366,7 @@ static void midcomms_shutdown(struct midcomms_node *node)
node->state == DLM_CLOSED ||
test_bit(DLM_NODE_FLAG_CLOSE, &node->flags),
DLM_SHUTDOWN_TIMEOUT);
- if (!ret || test_bit(DLM_NODE_FLAG_CLOSE, &node->flags))
+ if (!ret)
pr_debug("active shutdown timed out for node %d with state %s\n",
node->nodeid, dlm_state_str(node->state));
else
@@ -1456,14 +1384,6 @@ void dlm_midcomms_shutdown(void)
for (i = 0; i < CONN_HASH_SIZE; i++) {
hlist_for_each_entry_rcu(node, &node_hash[i], hlist) {
midcomms_shutdown(node);
-
- dlm_delete_debug_comms_file(node->debugfs);
-
- spin_lock(&nodes_lock);
- hlist_del_rcu(&node->hlist);
- spin_unlock(&nodes_lock);
-
- call_srcu(&nodes_srcu, &node->rcu, midcomms_node_release);
}
}
srcu_read_unlock(&nodes_srcu, idx);
@@ -1479,7 +1399,7 @@ int dlm_midcomms_close(int nodeid)
idx = srcu_read_lock(&nodes_srcu);
/* Abort pending close/remove operation */
- node = nodeid2node(nodeid, 0);
+ node = nodeid2node(nodeid);
if (node) {
/* let shutdown waiters leave */
set_bit(DLM_NODE_FLAG_CLOSE, &node->flags);
@@ -1489,20 +1409,32 @@ int dlm_midcomms_close(int nodeid)
synchronize_srcu(&nodes_srcu);
- idx = srcu_read_lock(&nodes_srcu);
mutex_lock(&close_lock);
- node = nodeid2node(nodeid, 0);
+ idx = srcu_read_lock(&nodes_srcu);
+ node = nodeid2node(nodeid);
if (!node) {
- mutex_unlock(&close_lock);
srcu_read_unlock(&nodes_srcu, idx);
+ mutex_unlock(&close_lock);
return dlm_lowcomms_close(nodeid);
}
ret = dlm_lowcomms_close(nodeid);
- spin_lock(&node->state_lock);
- midcomms_node_reset(node);
- spin_unlock(&node->state_lock);
+ dlm_delete_debug_comms_file(node->debugfs);
+
+ spin_lock(&nodes_lock);
+ hlist_del_rcu(&node->hlist);
+ spin_unlock(&nodes_lock);
srcu_read_unlock(&nodes_srcu, idx);
+
+ /* wait that all readers left until flush send queue */
+ synchronize_srcu(&nodes_srcu);
+
+ /* drop all pending dlm messages, this is fine as
+ * this function get called when the node is fenced
+ */
+ dlm_send_queue_flush(node);
+
+ call_srcu(&nodes_srcu, &node->rcu, midcomms_node_release);
mutex_unlock(&close_lock);
return ret;
diff --git a/fs/dlm/midcomms.h b/fs/dlm/midcomms.h
index 9f8c9605013d..e7246fb3ef57 100644
--- a/fs/dlm/midcomms.h
+++ b/fs/dlm/midcomms.h
@@ -20,6 +20,7 @@ struct dlm_mhandle *dlm_midcomms_get_mhandle(int nodeid, int len,
gfp_t allocation, char **ppc);
void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh, const void *name,
int namelen);
+int dlm_midcomms_addr(int nodeid, struct sockaddr_storage *addr, int len);
void dlm_midcomms_version_wait(void);
int dlm_midcomms_close(int nodeid);
int dlm_midcomms_start(void);
diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c
index 70a4752ed913..e6b4c1a21446 100644
--- a/fs/dlm/plock.c
+++ b/fs/dlm/plock.c
@@ -11,6 +11,8 @@
#include <linux/dlm_plock.h>
#include <linux/slab.h>
+#include <trace/events/dlm.h>
+
#include "dlm_internal.h"
#include "lockspace.h"
@@ -42,6 +44,27 @@ static inline void set_version(struct dlm_plock_info *info)
info->version[2] = DLM_PLOCK_VERSION_PATCH;
}
+static struct plock_op *plock_lookup_waiter(const struct dlm_plock_info *info)
+{
+ struct plock_op *op = NULL, *iter;
+
+ list_for_each_entry(iter, &recv_list, list) {
+ if (iter->info.fsid == info->fsid &&
+ iter->info.number == info->number &&
+ iter->info.owner == info->owner &&
+ iter->info.pid == info->pid &&
+ iter->info.start == info->start &&
+ iter->info.end == info->end &&
+ iter->info.ex == info->ex &&
+ iter->info.wait) {
+ op = iter;
+ break;
+ }
+ }
+
+ return op;
+}
+
static int check_version(struct dlm_plock_info *info)
{
if ((DLM_PLOCK_VERSION_MAJOR != info->version[0]) ||
@@ -74,30 +97,26 @@ static void send_op(struct plock_op *op)
wake_up(&send_wq);
}
-/* If a process was killed while waiting for the only plock on a file,
- locks_remove_posix will not see any lock on the file so it won't
- send an unlock-close to us to pass on to userspace to clean up the
- abandoned waiter. So, we have to insert the unlock-close when the
- lock call is interrupted. */
-
-static void do_unlock_close(const struct dlm_plock_info *info)
+static int do_lock_cancel(const struct dlm_plock_info *orig_info)
{
struct plock_op *op;
+ int rv;
op = kzalloc(sizeof(*op), GFP_NOFS);
if (!op)
- return;
+ return -ENOMEM;
+
+ op->info = *orig_info;
+ op->info.optype = DLM_PLOCK_OP_CANCEL;
+ op->info.wait = 0;
- op->info.optype = DLM_PLOCK_OP_UNLOCK;
- op->info.pid = info->pid;
- op->info.fsid = info->fsid;
- op->info.number = info->number;
- op->info.start = 0;
- op->info.end = OFFSET_MAX;
- op->info.owner = info->owner;
-
- op->info.flags |= DLM_PLOCK_FL_CLOSE;
send_op(op);
+ wait_event(recv_wq, (op->done != 0));
+
+ rv = op->info.rv;
+
+ dlm_release_plock_op(op);
+ return rv;
}
int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
@@ -156,7 +175,7 @@ int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
send_op(op);
if (op->info.wait) {
- rv = wait_event_killable(recv_wq, (op->done != 0));
+ rv = wait_event_interruptible(recv_wq, (op->done != 0));
if (rv == -ERESTARTSYS) {
spin_lock(&ops_lock);
/* recheck under ops_lock if we got a done != 0,
@@ -166,17 +185,37 @@ int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
spin_unlock(&ops_lock);
goto do_lock_wait;
}
- list_del(&op->list);
spin_unlock(&ops_lock);
+ rv = do_lock_cancel(&op->info);
+ switch (rv) {
+ case 0:
+ /* waiter was deleted in user space, answer will never come
+ * remove original request. The original request must be
+ * on recv_list because the answer of do_lock_cancel()
+ * synchronized it.
+ */
+ spin_lock(&ops_lock);
+ list_del(&op->list);
+ spin_unlock(&ops_lock);
+ rv = -EINTR;
+ break;
+ case -ENOENT:
+ /* cancellation wasn't successful but op should be done */
+ fallthrough;
+ default:
+ /* internal error doing cancel we need to wait */
+ goto wait;
+ }
+
log_debug(ls, "%s: wait interrupted %x %llx pid %d",
__func__, ls->ls_global_id,
(unsigned long long)number, op->info.pid);
- do_unlock_close(&op->info);
dlm_release_plock_op(op);
goto out;
}
} else {
+wait:
wait_event(recv_wq, (op->done != 0));
}
@@ -240,8 +279,8 @@ static int dlm_plock_callback(struct plock_op *op)
rv = notify(fl, 0);
if (rv) {
/* XXX: We need to cancel the fs lock here: */
- log_print("dlm_plock_callback: lock granted after lock request "
- "failed; dangling lock!\n");
+ log_print("%s: lock granted after lock request failed; dangling lock!",
+ __func__);
goto out;
}
@@ -318,6 +357,75 @@ out:
}
EXPORT_SYMBOL_GPL(dlm_posix_unlock);
+/*
+ * NOTE: This implementation can only handle async lock requests as nfs
+ * do it. It cannot handle cancellation of a pending lock request sitting
+ * in wait_event(), but for now only nfs is the only user local kernel
+ * user.
+ */
+int dlm_posix_cancel(dlm_lockspace_t *lockspace, u64 number, struct file *file,
+ struct file_lock *fl)
+{
+ struct dlm_plock_info info;
+ struct plock_op *op;
+ struct dlm_ls *ls;
+ int rv;
+
+ /* this only works for async request for now and nfs is the only
+ * kernel user right now.
+ */
+ if (WARN_ON_ONCE(!fl->fl_lmops || !fl->fl_lmops->lm_grant))
+ return -EOPNOTSUPP;
+
+ ls = dlm_find_lockspace_local(lockspace);
+ if (!ls)
+ return -EINVAL;
+
+ memset(&info, 0, sizeof(info));
+ info.pid = fl->fl_pid;
+ info.ex = (fl->fl_type == F_WRLCK);
+ info.fsid = ls->ls_global_id;
+ dlm_put_lockspace(ls);
+ info.number = number;
+ info.start = fl->fl_start;
+ info.end = fl->fl_end;
+ info.owner = (__u64)fl->fl_pid;
+
+ rv = do_lock_cancel(&info);
+ switch (rv) {
+ case 0:
+ spin_lock(&ops_lock);
+ /* lock request to cancel must be on recv_list because
+ * do_lock_cancel() synchronizes it.
+ */
+ op = plock_lookup_waiter(&info);
+ if (WARN_ON_ONCE(!op)) {
+ spin_unlock(&ops_lock);
+ rv = -ENOLCK;
+ break;
+ }
+
+ list_del(&op->list);
+ spin_unlock(&ops_lock);
+ WARN_ON(op->info.optype != DLM_PLOCK_OP_LOCK);
+ op->data->callback(op->data->fl, -EINTR);
+ dlm_release_plock_op(op);
+ rv = -EINTR;
+ break;
+ case -ENOENT:
+ /* if cancel wasn't successful we probably were to late
+ * or it was a non-blocking lock request, so just unlock it.
+ */
+ rv = dlm_posix_unlock(lockspace, number, file, fl);
+ break;
+ default:
+ break;
+ }
+
+ return rv;
+}
+EXPORT_SYMBOL_GPL(dlm_posix_cancel);
+
int dlm_posix_get(dlm_lockspace_t *lockspace, u64 number, struct file *file,
struct file_lock *fl)
{
@@ -403,6 +511,8 @@ static ssize_t dev_read(struct file *file, char __user *u, size_t count,
if (!op)
return -EAGAIN;
+ trace_dlm_plock_read(&info);
+
/* there is no need to get a reply from userspace for unlocks
that were generated by the vfs cleaning up for a close
(the process did not make an unlock call). */
@@ -430,6 +540,8 @@ static ssize_t dev_write(struct file *file, const char __user *u, size_t count,
if (copy_from_user(&info, u, sizeof(info)))
return -EFAULT;
+ trace_dlm_plock_write(&info);
+
if (check_version(&info))
return -EINVAL;
@@ -441,22 +553,11 @@ static ssize_t dev_write(struct file *file, const char __user *u, size_t count,
*/
spin_lock(&ops_lock);
if (info.wait) {
- list_for_each_entry(iter, &recv_list, list) {
- if (iter->info.fsid == info.fsid &&
- iter->info.number == info.number &&
- iter->info.owner == info.owner &&
- iter->info.pid == info.pid &&
- iter->info.start == info.start &&
- iter->info.end == info.end &&
- iter->info.ex == info.ex &&
- iter->info.wait) {
- op = iter;
- break;
- }
- }
+ op = plock_lookup_waiter(&info);
} else {
list_for_each_entry(iter, &recv_list, list) {
- if (!iter->info.wait) {
+ if (!iter->info.wait &&
+ iter->info.fsid == info.fsid) {
op = iter;
break;
}
@@ -468,8 +569,7 @@ static ssize_t dev_write(struct file *file, const char __user *u, size_t count,
if (info.wait)
WARN_ON(op->info.optype != DLM_PLOCK_OP_LOCK);
else
- WARN_ON(op->info.fsid != info.fsid ||
- op->info.number != info.number ||
+ WARN_ON(op->info.number != info.number ||
op->info.owner != info.owner ||
op->info.optype != info.optype);
@@ -534,5 +634,7 @@ int dlm_plock_init(void)
void dlm_plock_exit(void)
{
misc_deregister(&plock_dev_misc);
+ WARN_ON(!list_empty(&send_list));
+ WARN_ON(!list_empty(&recv_list));
}
diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c
index f4afdf892f78..3b734aed26b5 100644
--- a/fs/dlm/rcom.c
+++ b/fs/dlm/rcom.c
@@ -28,7 +28,8 @@ static int rcom_response(struct dlm_ls *ls)
}
static void _create_rcom(struct dlm_ls *ls, int to_nodeid, int type, int len,
- struct dlm_rcom **rc_ret, char *mb, int mb_len)
+ struct dlm_rcom **rc_ret, char *mb, int mb_len,
+ uint64_t seq)
{
struct dlm_rcom *rc;
@@ -41,16 +42,14 @@ static void _create_rcom(struct dlm_ls *ls, int to_nodeid, int type, int len,
rc->rc_header.h_cmd = DLM_RCOM;
rc->rc_type = cpu_to_le32(type);
-
- spin_lock(&ls->ls_recover_lock);
- rc->rc_seq = cpu_to_le64(ls->ls_recover_seq);
- spin_unlock(&ls->ls_recover_lock);
+ rc->rc_seq = cpu_to_le64(seq);
*rc_ret = rc;
}
static int create_rcom(struct dlm_ls *ls, int to_nodeid, int type, int len,
- struct dlm_rcom **rc_ret, struct dlm_mhandle **mh_ret)
+ struct dlm_rcom **rc_ret, struct dlm_mhandle **mh_ret,
+ uint64_t seq)
{
int mb_len = sizeof(struct dlm_rcom) + len;
struct dlm_mhandle *mh;
@@ -63,14 +62,14 @@ static int create_rcom(struct dlm_ls *ls, int to_nodeid, int type, int len,
return -ENOBUFS;
}
- _create_rcom(ls, to_nodeid, type, len, rc_ret, mb, mb_len);
+ _create_rcom(ls, to_nodeid, type, len, rc_ret, mb, mb_len, seq);
*mh_ret = mh;
return 0;
}
static int create_rcom_stateless(struct dlm_ls *ls, int to_nodeid, int type,
int len, struct dlm_rcom **rc_ret,
- struct dlm_msg **msg_ret)
+ struct dlm_msg **msg_ret, uint64_t seq)
{
int mb_len = sizeof(struct dlm_rcom) + len;
struct dlm_msg *msg;
@@ -84,7 +83,7 @@ static int create_rcom_stateless(struct dlm_ls *ls, int to_nodeid, int type,
return -ENOBUFS;
}
- _create_rcom(ls, to_nodeid, type, len, rc_ret, mb, mb_len);
+ _create_rcom(ls, to_nodeid, type, len, rc_ret, mb, mb_len, seq);
*msg_ret = msg;
return 0;
}
@@ -170,7 +169,8 @@ static void disallow_sync_reply(struct dlm_ls *ls)
* node's rcom_config.
*/
-int dlm_rcom_status(struct dlm_ls *ls, int nodeid, uint32_t status_flags)
+int dlm_rcom_status(struct dlm_ls *ls, int nodeid, uint32_t status_flags,
+ uint64_t seq)
{
struct dlm_rcom *rc;
struct dlm_msg *msg;
@@ -186,7 +186,8 @@ int dlm_rcom_status(struct dlm_ls *ls, int nodeid, uint32_t status_flags)
retry:
error = create_rcom_stateless(ls, nodeid, DLM_RCOM_STATUS,
- sizeof(struct rcom_status), &rc, &msg);
+ sizeof(struct rcom_status), &rc, &msg,
+ seq);
if (error)
goto out;
@@ -220,7 +221,9 @@ retry:
return error;
}
-static void receive_rcom_status(struct dlm_ls *ls, struct dlm_rcom *rc_in)
+static void receive_rcom_status(struct dlm_ls *ls,
+ const struct dlm_rcom *rc_in,
+ uint64_t seq)
{
struct dlm_rcom *rc;
struct rcom_status *rs;
@@ -251,7 +254,7 @@ static void receive_rcom_status(struct dlm_ls *ls, struct dlm_rcom *rc_in)
do_create:
error = create_rcom_stateless(ls, nodeid, DLM_RCOM_STATUS_REPLY,
- len, &rc, &msg);
+ len, &rc, &msg, seq);
if (error)
return;
@@ -281,7 +284,7 @@ static void receive_rcom_status(struct dlm_ls *ls, struct dlm_rcom *rc_in)
send_rcom_stateless(msg, rc);
}
-static void receive_sync_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
+static void receive_sync_reply(struct dlm_ls *ls, const struct dlm_rcom *rc_in)
{
spin_lock(&ls->ls_rcom_spin);
if (!test_bit(LSFL_RCOM_WAIT, &ls->ls_flags) ||
@@ -302,17 +305,18 @@ static void receive_sync_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
spin_unlock(&ls->ls_rcom_spin);
}
-int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name, int last_len)
+int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name,
+ int last_len, uint64_t seq)
{
+ struct dlm_mhandle *mh;
struct dlm_rcom *rc;
- struct dlm_msg *msg;
int error = 0;
ls->ls_recover_nodeid = nodeid;
retry:
- error = create_rcom_stateless(ls, nodeid, DLM_RCOM_NAMES, last_len,
- &rc, &msg);
+ error = create_rcom(ls, nodeid, DLM_RCOM_NAMES, last_len,
+ &rc, &mh, seq);
if (error)
goto out;
memcpy(rc->rc_buf, last_name, last_len);
@@ -320,7 +324,7 @@ retry:
allow_sync_reply(ls, &rc->rc_id);
memset(ls->ls_recover_buf, 0, DLM_MAX_SOCKET_BUFSIZE);
- send_rcom_stateless(msg, rc);
+ send_rcom(mh, rc);
error = dlm_wait_function(ls, &rcom_response);
disallow_sync_reply(ls);
@@ -330,19 +334,20 @@ retry:
return error;
}
-static void receive_rcom_names(struct dlm_ls *ls, struct dlm_rcom *rc_in)
+static void receive_rcom_names(struct dlm_ls *ls, const struct dlm_rcom *rc_in,
+ uint64_t seq)
{
+ struct dlm_mhandle *mh;
struct dlm_rcom *rc;
int error, inlen, outlen, nodeid;
- struct dlm_msg *msg;
nodeid = le32_to_cpu(rc_in->rc_header.h_nodeid);
inlen = le16_to_cpu(rc_in->rc_header.h_length) -
sizeof(struct dlm_rcom);
outlen = DLM_MAX_APP_BUFSIZE - sizeof(struct dlm_rcom);
- error = create_rcom_stateless(ls, nodeid, DLM_RCOM_NAMES_REPLY, outlen,
- &rc, &msg);
+ error = create_rcom(ls, nodeid, DLM_RCOM_NAMES_REPLY, outlen,
+ &rc, &mh, seq);
if (error)
return;
rc->rc_id = rc_in->rc_id;
@@ -350,10 +355,10 @@ static void receive_rcom_names(struct dlm_ls *ls, struct dlm_rcom *rc_in)
dlm_copy_master_names(ls, rc_in->rc_buf, inlen, rc->rc_buf, outlen,
nodeid);
- send_rcom_stateless(msg, rc);
+ send_rcom(mh, rc);
}
-int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid)
+int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid, uint64_t seq)
{
struct dlm_rcom *rc;
struct dlm_mhandle *mh;
@@ -361,7 +366,7 @@ int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid)
int error;
error = create_rcom(ls, dir_nodeid, DLM_RCOM_LOOKUP, r->res_length,
- &rc, &mh);
+ &rc, &mh, seq);
if (error)
goto out;
memcpy(rc->rc_buf, r->res_name, r->res_length);
@@ -372,7 +377,8 @@ int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid)
return error;
}
-static void receive_rcom_lookup(struct dlm_ls *ls, struct dlm_rcom *rc_in)
+static void receive_rcom_lookup(struct dlm_ls *ls,
+ const struct dlm_rcom *rc_in, uint64_t seq)
{
struct dlm_rcom *rc;
struct dlm_mhandle *mh;
@@ -387,7 +393,8 @@ static void receive_rcom_lookup(struct dlm_ls *ls, struct dlm_rcom *rc_in)
return;
}
- error = create_rcom(ls, nodeid, DLM_RCOM_LOOKUP_REPLY, 0, &rc, &mh);
+ error = create_rcom(ls, nodeid, DLM_RCOM_LOOKUP_REPLY, 0, &rc, &mh,
+ seq);
if (error)
return;
@@ -402,7 +409,8 @@ static void receive_rcom_lookup(struct dlm_ls *ls, struct dlm_rcom *rc_in)
send_rcom(mh, rc);
}
-static void receive_rcom_lookup_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
+static void receive_rcom_lookup_reply(struct dlm_ls *ls,
+ const struct dlm_rcom *rc_in)
{
dlm_recover_master_reply(ls, rc_in);
}
@@ -437,7 +445,7 @@ static void pack_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb,
memcpy(rl->rl_lvb, lkb->lkb_lvbptr, r->res_ls->ls_lvblen);
}
-int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
+int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb, uint64_t seq)
{
struct dlm_ls *ls = r->res_ls;
struct dlm_rcom *rc;
@@ -448,7 +456,8 @@ int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
if (lkb->lkb_lvbptr)
len += ls->ls_lvblen;
- error = create_rcom(ls, r->res_nodeid, DLM_RCOM_LOCK, len, &rc, &mh);
+ error = create_rcom(ls, r->res_nodeid, DLM_RCOM_LOCK, len, &rc, &mh,
+ seq);
if (error)
goto out;
@@ -462,23 +471,28 @@ int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
}
/* needs at least dlm_rcom + rcom_lock */
-static void receive_rcom_lock(struct dlm_ls *ls, struct dlm_rcom *rc_in)
+static void receive_rcom_lock(struct dlm_ls *ls, const struct dlm_rcom *rc_in,
+ uint64_t seq)
{
+ __le32 rl_remid, rl_result;
+ struct rcom_lock *rl;
struct dlm_rcom *rc;
struct dlm_mhandle *mh;
int error, nodeid = le32_to_cpu(rc_in->rc_header.h_nodeid);
- dlm_recover_master_copy(ls, rc_in);
+ dlm_recover_master_copy(ls, rc_in, &rl_remid, &rl_result);
error = create_rcom(ls, nodeid, DLM_RCOM_LOCK_REPLY,
- sizeof(struct rcom_lock), &rc, &mh);
+ sizeof(struct rcom_lock), &rc, &mh, seq);
if (error)
return;
- /* We send back the same rcom_lock struct we received, but
- dlm_recover_master_copy() has filled in rl_remid and rl_result */
-
memcpy(rc->rc_buf, rc_in->rc_buf, sizeof(struct rcom_lock));
+ rl = (struct rcom_lock *)rc->rc_buf;
+ /* set rl_remid and rl_result from dlm_recover_master_copy() */
+ rl->rl_remid = rl_remid;
+ rl->rl_result = rl_result;
+
rc->rc_id = rc_in->rc_id;
rc->rc_seq_reply = rc_in->rc_seq;
@@ -488,7 +502,7 @@ static void receive_rcom_lock(struct dlm_ls *ls, struct dlm_rcom *rc_in)
/* If the lockspace doesn't exist then still send a status message
back; it's possible that it just doesn't have its global_id yet. */
-int dlm_send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in)
+int dlm_send_ls_not_ready(int nodeid, const struct dlm_rcom *rc_in)
{
struct dlm_rcom *rc;
struct rcom_config *rf;
@@ -566,7 +580,7 @@ int dlm_send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in)
/* Called by dlm_recv; corresponds to dlm_receive_message() but special
recovery-only comms are sent through here. */
-void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid)
+void dlm_receive_rcom(struct dlm_ls *ls, const struct dlm_rcom *rc, int nodeid)
{
int lock_size = sizeof(struct dlm_rcom) + sizeof(struct rcom_lock);
int stop, reply = 0, names = 0, lookup = 0, lock = 0;
@@ -620,21 +634,21 @@ void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid)
switch (rc->rc_type) {
case cpu_to_le32(DLM_RCOM_STATUS):
- receive_rcom_status(ls, rc);
+ receive_rcom_status(ls, rc, seq);
break;
case cpu_to_le32(DLM_RCOM_NAMES):
- receive_rcom_names(ls, rc);
+ receive_rcom_names(ls, rc, seq);
break;
case cpu_to_le32(DLM_RCOM_LOOKUP):
- receive_rcom_lookup(ls, rc);
+ receive_rcom_lookup(ls, rc, seq);
break;
case cpu_to_le32(DLM_RCOM_LOCK):
if (le16_to_cpu(rc->rc_header.h_length) < lock_size)
goto Eshort;
- receive_rcom_lock(ls, rc);
+ receive_rcom_lock(ls, rc, seq);
break;
case cpu_to_le32(DLM_RCOM_STATUS_REPLY):
@@ -652,7 +666,7 @@ void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid)
case cpu_to_le32(DLM_RCOM_LOCK_REPLY):
if (le16_to_cpu(rc->rc_header.h_length) < lock_size)
goto Eshort;
- dlm_recover_process_copy(ls, rc);
+ dlm_recover_process_copy(ls, rc, seq);
break;
default:
diff --git a/fs/dlm/rcom.h b/fs/dlm/rcom.h
index 454d3c4814ab..765926ae0020 100644
--- a/fs/dlm/rcom.h
+++ b/fs/dlm/rcom.h
@@ -12,12 +12,15 @@
#ifndef __RCOM_DOT_H__
#define __RCOM_DOT_H__
-int dlm_rcom_status(struct dlm_ls *ls, int nodeid, uint32_t status_flags);
-int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name,int last_len);
-int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid);
-int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb);
-void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid);
-int dlm_send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in);
+int dlm_rcom_status(struct dlm_ls *ls, int nodeid, uint32_t status_flags,
+ uint64_t seq);
+int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name,
+ int last_len, uint64_t seq);
+int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid, uint64_t seq);
+int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb, uint64_t seq);
+void dlm_receive_rcom(struct dlm_ls *ls, const struct dlm_rcom *rc,
+ int nodeid);
+int dlm_send_ls_not_ready(int nodeid, const struct dlm_rcom *rc_in);
#endif
diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c
index 29d71a5018d4..53917c0aa3c0 100644
--- a/fs/dlm/recover.c
+++ b/fs/dlm/recover.c
@@ -93,7 +93,7 @@ void dlm_set_recover_status(struct dlm_ls *ls, uint32_t status)
}
static int wait_status_all(struct dlm_ls *ls, uint32_t wait_status,
- int save_slots)
+ int save_slots, uint64_t seq)
{
struct dlm_rcom *rc = ls->ls_recover_buf;
struct dlm_member *memb;
@@ -107,7 +107,7 @@ static int wait_status_all(struct dlm_ls *ls, uint32_t wait_status,
goto out;
}
- error = dlm_rcom_status(ls, memb->nodeid, 0);
+ error = dlm_rcom_status(ls, memb->nodeid, 0, seq);
if (error)
goto out;
@@ -126,7 +126,7 @@ static int wait_status_all(struct dlm_ls *ls, uint32_t wait_status,
}
static int wait_status_low(struct dlm_ls *ls, uint32_t wait_status,
- uint32_t status_flags)
+ uint32_t status_flags, uint64_t seq)
{
struct dlm_rcom *rc = ls->ls_recover_buf;
int error = 0, delay = 0, nodeid = ls->ls_low_nodeid;
@@ -137,7 +137,7 @@ static int wait_status_low(struct dlm_ls *ls, uint32_t wait_status,
goto out;
}
- error = dlm_rcom_status(ls, nodeid, status_flags);
+ error = dlm_rcom_status(ls, nodeid, status_flags, seq);
if (error)
break;
@@ -151,22 +151,22 @@ static int wait_status_low(struct dlm_ls *ls, uint32_t wait_status,
return error;
}
-static int wait_status(struct dlm_ls *ls, uint32_t status)
+static int wait_status(struct dlm_ls *ls, uint32_t status, uint64_t seq)
{
uint32_t status_all = status << 1;
int error;
if (ls->ls_low_nodeid == dlm_our_nodeid()) {
- error = wait_status_all(ls, status, 0);
+ error = wait_status_all(ls, status, 0, seq);
if (!error)
dlm_set_recover_status(ls, status_all);
} else
- error = wait_status_low(ls, status_all, 0);
+ error = wait_status_low(ls, status_all, 0, seq);
return error;
}
-int dlm_recover_members_wait(struct dlm_ls *ls)
+int dlm_recover_members_wait(struct dlm_ls *ls, uint64_t seq)
{
struct dlm_member *memb;
struct dlm_slot *slots;
@@ -180,7 +180,7 @@ int dlm_recover_members_wait(struct dlm_ls *ls)
}
if (ls->ls_low_nodeid == dlm_our_nodeid()) {
- error = wait_status_all(ls, DLM_RS_NODES, 1);
+ error = wait_status_all(ls, DLM_RS_NODES, 1, seq);
if (error)
goto out;
@@ -199,7 +199,8 @@ int dlm_recover_members_wait(struct dlm_ls *ls)
dlm_set_recover_status(ls, DLM_RS_NODES_ALL);
}
} else {
- error = wait_status_low(ls, DLM_RS_NODES_ALL, DLM_RSF_NEED_SLOTS);
+ error = wait_status_low(ls, DLM_RS_NODES_ALL,
+ DLM_RSF_NEED_SLOTS, seq);
if (error)
goto out;
@@ -209,19 +210,19 @@ int dlm_recover_members_wait(struct dlm_ls *ls)
return error;
}
-int dlm_recover_directory_wait(struct dlm_ls *ls)
+int dlm_recover_directory_wait(struct dlm_ls *ls, uint64_t seq)
{
- return wait_status(ls, DLM_RS_DIR);
+ return wait_status(ls, DLM_RS_DIR, seq);
}
-int dlm_recover_locks_wait(struct dlm_ls *ls)
+int dlm_recover_locks_wait(struct dlm_ls *ls, uint64_t seq)
{
- return wait_status(ls, DLM_RS_LOCKS);
+ return wait_status(ls, DLM_RS_LOCKS, seq);
}
-int dlm_recover_done_wait(struct dlm_ls *ls)
+int dlm_recover_done_wait(struct dlm_ls *ls, uint64_t seq)
{
- return wait_status(ls, DLM_RS_DONE);
+ return wait_status(ls, DLM_RS_DONE, seq);
}
/*
@@ -441,7 +442,7 @@ static void set_new_master(struct dlm_rsb *r)
* equals our_nodeid below).
*/
-static int recover_master(struct dlm_rsb *r, unsigned int *count)
+static int recover_master(struct dlm_rsb *r, unsigned int *count, uint64_t seq)
{
struct dlm_ls *ls = r->res_ls;
int our_nodeid, dir_nodeid;
@@ -472,7 +473,7 @@ static int recover_master(struct dlm_rsb *r, unsigned int *count)
error = 0;
} else {
recover_idr_add(r);
- error = dlm_send_rcom_lookup(r, dir_nodeid);
+ error = dlm_send_rcom_lookup(r, dir_nodeid, seq);
}
(*count)++;
@@ -520,7 +521,7 @@ static int recover_master_static(struct dlm_rsb *r, unsigned int *count)
* the correct dir node.
*/
-int dlm_recover_masters(struct dlm_ls *ls)
+int dlm_recover_masters(struct dlm_ls *ls, uint64_t seq)
{
struct dlm_rsb *r;
unsigned int total = 0;
@@ -542,7 +543,7 @@ int dlm_recover_masters(struct dlm_ls *ls)
if (nodir)
error = recover_master_static(r, &count);
else
- error = recover_master(r, &count);
+ error = recover_master(r, &count, seq);
unlock_rsb(r);
cond_resched();
total++;
@@ -563,7 +564,7 @@ int dlm_recover_masters(struct dlm_ls *ls)
return error;
}
-int dlm_recover_master_reply(struct dlm_ls *ls, struct dlm_rcom *rc)
+int dlm_recover_master_reply(struct dlm_ls *ls, const struct dlm_rcom *rc)
{
struct dlm_rsb *r;
int ret_nodeid, new_master;
@@ -614,13 +615,14 @@ int dlm_recover_master_reply(struct dlm_ls *ls, struct dlm_rcom *rc)
* an equal number of replies then recovery for the rsb is done
*/
-static int recover_locks_queue(struct dlm_rsb *r, struct list_head *head)
+static int recover_locks_queue(struct dlm_rsb *r, struct list_head *head,
+ uint64_t seq)
{
struct dlm_lkb *lkb;
int error = 0;
list_for_each_entry(lkb, head, lkb_statequeue) {
- error = dlm_send_rcom_lock(r, lkb);
+ error = dlm_send_rcom_lock(r, lkb, seq);
if (error)
break;
r->res_recover_locks_count++;
@@ -629,7 +631,7 @@ static int recover_locks_queue(struct dlm_rsb *r, struct list_head *head)
return error;
}
-static int recover_locks(struct dlm_rsb *r)
+static int recover_locks(struct dlm_rsb *r, uint64_t seq)
{
int error = 0;
@@ -637,13 +639,13 @@ static int recover_locks(struct dlm_rsb *r)
DLM_ASSERT(!r->res_recover_locks_count, dlm_dump_rsb(r););
- error = recover_locks_queue(r, &r->res_grantqueue);
+ error = recover_locks_queue(r, &r->res_grantqueue, seq);
if (error)
goto out;
- error = recover_locks_queue(r, &r->res_convertqueue);
+ error = recover_locks_queue(r, &r->res_convertqueue, seq);
if (error)
goto out;
- error = recover_locks_queue(r, &r->res_waitqueue);
+ error = recover_locks_queue(r, &r->res_waitqueue, seq);
if (error)
goto out;
@@ -656,7 +658,7 @@ static int recover_locks(struct dlm_rsb *r)
return error;
}
-int dlm_recover_locks(struct dlm_ls *ls)
+int dlm_recover_locks(struct dlm_ls *ls, uint64_t seq)
{
struct dlm_rsb *r;
int error, count = 0;
@@ -677,7 +679,7 @@ int dlm_recover_locks(struct dlm_ls *ls)
goto out;
}
- error = recover_locks(r);
+ error = recover_locks(r, seq);
if (error) {
up_read(&ls->ls_root_sem);
goto out;
diff --git a/fs/dlm/recover.h b/fs/dlm/recover.h
index 235e0d25cd48..dbc51013ecad 100644
--- a/fs/dlm/recover.h
+++ b/fs/dlm/recover.h
@@ -15,13 +15,13 @@
int dlm_wait_function(struct dlm_ls *ls, int (*testfn) (struct dlm_ls *ls));
uint32_t dlm_recover_status(struct dlm_ls *ls);
void dlm_set_recover_status(struct dlm_ls *ls, uint32_t status);
-int dlm_recover_members_wait(struct dlm_ls *ls);
-int dlm_recover_directory_wait(struct dlm_ls *ls);
-int dlm_recover_locks_wait(struct dlm_ls *ls);
-int dlm_recover_done_wait(struct dlm_ls *ls);
-int dlm_recover_masters(struct dlm_ls *ls);
-int dlm_recover_master_reply(struct dlm_ls *ls, struct dlm_rcom *rc);
-int dlm_recover_locks(struct dlm_ls *ls);
+int dlm_recover_members_wait(struct dlm_ls *ls, uint64_t seq);
+int dlm_recover_directory_wait(struct dlm_ls *ls, uint64_t seq);
+int dlm_recover_locks_wait(struct dlm_ls *ls, uint64_t seq);
+int dlm_recover_done_wait(struct dlm_ls *ls, uint64_t seq);
+int dlm_recover_masters(struct dlm_ls *ls, uint64_t seq);
+int dlm_recover_master_reply(struct dlm_ls *ls, const struct dlm_rcom *rc);
+int dlm_recover_locks(struct dlm_ls *ls, uint64_t seq);
void dlm_recovered_lock(struct dlm_rsb *r);
int dlm_create_root_list(struct dlm_ls *ls);
void dlm_release_root_list(struct dlm_ls *ls);
diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c
index 19da816cfb09..4d17491dea2f 100644
--- a/fs/dlm/recoverd.c
+++ b/fs/dlm/recoverd.c
@@ -90,7 +90,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
dlm_set_recover_status(ls, DLM_RS_NODES);
- error = dlm_recover_members_wait(ls);
+ error = dlm_recover_members_wait(ls, rv->seq);
if (error) {
log_rinfo(ls, "dlm_recover_members_wait error %d", error);
goto fail;
@@ -103,7 +103,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
* nodes their master rsb names that hash to us.
*/
- error = dlm_recover_directory(ls);
+ error = dlm_recover_directory(ls, rv->seq);
if (error) {
log_rinfo(ls, "dlm_recover_directory error %d", error);
goto fail;
@@ -111,7 +111,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
dlm_set_recover_status(ls, DLM_RS_DIR);
- error = dlm_recover_directory_wait(ls);
+ error = dlm_recover_directory_wait(ls, rv->seq);
if (error) {
log_rinfo(ls, "dlm_recover_directory_wait error %d", error);
goto fail;
@@ -145,7 +145,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
* departed nodes.
*/
- error = dlm_recover_masters(ls);
+ error = dlm_recover_masters(ls, rv->seq);
if (error) {
log_rinfo(ls, "dlm_recover_masters error %d", error);
goto fail;
@@ -155,7 +155,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
* Send our locks on remastered rsb's to the new masters.
*/
- error = dlm_recover_locks(ls);
+ error = dlm_recover_locks(ls, rv->seq);
if (error) {
log_rinfo(ls, "dlm_recover_locks error %d", error);
goto fail;
@@ -163,7 +163,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
dlm_set_recover_status(ls, DLM_RS_LOCKS);
- error = dlm_recover_locks_wait(ls);
+ error = dlm_recover_locks_wait(ls, rv->seq);
if (error) {
log_rinfo(ls, "dlm_recover_locks_wait error %d", error);
goto fail;
@@ -187,7 +187,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
*/
dlm_set_recover_status(ls, DLM_RS_LOCKS);
- error = dlm_recover_locks_wait(ls);
+ error = dlm_recover_locks_wait(ls, rv->seq);
if (error) {
log_rinfo(ls, "dlm_recover_locks_wait error %d", error);
goto fail;
@@ -206,7 +206,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
dlm_set_recover_status(ls, DLM_RS_DONE);
- error = dlm_recover_done_wait(ls);
+ error = dlm_recover_done_wait(ls, rv->seq);
if (error) {
log_rinfo(ls, "dlm_recover_done_wait error %d", error);
goto fail;
diff --git a/fs/dlm/requestqueue.c b/fs/dlm/requestqueue.c
index 8be2893ad15b..892d6ca21e74 100644
--- a/fs/dlm/requestqueue.c
+++ b/fs/dlm/requestqueue.c
@@ -30,7 +30,8 @@ struct rq_entry {
* lockspace is enabled on some while still suspended on others.
*/
-void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_message *ms)
+void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid,
+ const struct dlm_message *ms)
{
struct rq_entry *e;
int length = le16_to_cpu(ms->m_header.h_length) -
diff --git a/fs/dlm/requestqueue.h b/fs/dlm/requestqueue.h
index 4e403469a845..42bfe23ceabe 100644
--- a/fs/dlm/requestqueue.h
+++ b/fs/dlm/requestqueue.h
@@ -11,7 +11,8 @@
#ifndef __REQUESTQUEUE_DOT_H__
#define __REQUESTQUEUE_DOT_H__
-void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_message *ms);
+void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid,
+ const struct dlm_message *ms);
int dlm_process_requestqueue(struct dlm_ls *ls);
void dlm_wait_requestqueue(struct dlm_ls *ls);
void dlm_purge_requestqueue(struct dlm_ls *ls);
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index e619c31b6bd9..b9575957a7c2 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -10,6 +10,7 @@
#include <linux/writeback.h>
#include <linux/sysctl.h>
#include <linux/gfp.h>
+#include <linux/swap.h>
#include "internal.h"
/* A global variable is a bit ugly, but it keeps the code simple */
@@ -59,6 +60,7 @@ int drop_caches_sysctl_handler(struct ctl_table *table, int write,
static int stfu;
if (sysctl_drop_caches & 1) {
+ lru_add_drain_all();
iterate_supers(drop_pagecache_sb, NULL);
count_vm_event(DROP_PAGECACHE);
}
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index c16f0d660cb7..03bd55069d86 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -441,10 +441,10 @@ int ecryptfs_encrypt_page(struct page *page)
}
lower_offset = lower_offset_for_page(crypt_stat, page);
- enc_extent_virt = kmap(enc_extent_page);
+ enc_extent_virt = kmap_local_page(enc_extent_page);
rc = ecryptfs_write_lower(ecryptfs_inode, enc_extent_virt, lower_offset,
PAGE_SIZE);
- kunmap(enc_extent_page);
+ kunmap_local(enc_extent_virt);
if (rc < 0) {
ecryptfs_printk(KERN_ERR,
"Error attempting to write lower page; rc = [%d]\n",
@@ -490,10 +490,10 @@ int ecryptfs_decrypt_page(struct page *page)
BUG_ON(!(crypt_stat->flags & ECRYPTFS_ENCRYPTED));
lower_offset = lower_offset_for_page(crypt_stat, page);
- page_virt = kmap(page);
+ page_virt = kmap_local_page(page);
rc = ecryptfs_read_lower(page_virt, lower_offset, PAGE_SIZE,
ecryptfs_inode);
- kunmap(page);
+ kunmap_local(page_virt);
if (rc < 0) {
ecryptfs_printk(KERN_ERR,
"Error attempting to read lower page; rc = [%d]\n",
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 83274915ba6d..992d9c7e64ae 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -148,7 +148,7 @@ static int ecryptfs_do_unlink(struct inode *dir, struct dentry *dentry,
}
fsstack_copy_attr_times(dir, lower_dir);
set_nlink(inode, ecryptfs_inode_to_lower(inode)->i_nlink);
- inode->i_ctime = dir->i_ctime;
+ inode_set_ctime_to_ts(inode, inode_get_ctime(dir));
out_unlock:
dput(lower_dentry);
inode_unlock(lower_dir);
@@ -982,7 +982,7 @@ static int ecryptfs_getattr_link(struct mnt_idmap *idmap,
mount_crypt_stat = &ecryptfs_superblock_to_private(
dentry->d_sb)->mount_crypt_stat;
- generic_fillattr(&nop_mnt_idmap, d_inode(dentry), stat);
+ generic_fillattr(&nop_mnt_idmap, request_mask, d_inode(dentry), stat);
if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) {
char *target;
size_t targetsiz;
@@ -1011,7 +1011,8 @@ static int ecryptfs_getattr(struct mnt_idmap *idmap,
if (!rc) {
fsstack_copy_attr_all(d_inode(dentry),
ecryptfs_inode_to_lower(d_inode(dentry)));
- generic_fillattr(&nop_mnt_idmap, d_inode(dentry), stat);
+ generic_fillattr(&nop_mnt_idmap, request_mask,
+ d_inode(dentry), stat);
stat->blocks = lower_stat.blocks;
}
return rc;
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index 373c3e5747e6..e2483acc4366 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -125,7 +125,7 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page,
/* This is a header extent */
char *page_virt;
- page_virt = kmap_atomic(page);
+ page_virt = kmap_local_page(page);
memset(page_virt, 0, PAGE_SIZE);
/* TODO: Support more than one header extent */
if (view_extent_num == 0) {
@@ -138,7 +138,7 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page,
crypt_stat,
&written);
}
- kunmap_atomic(page_virt);
+ kunmap_local(page_virt);
flush_dcache_page(page);
if (rc) {
printk(KERN_ERR "%s: Error reading xattr "
@@ -255,7 +255,6 @@ out:
* @mapping: The eCryptfs object
* @pos: The file offset at which to start writing
* @len: Length of the write
- * @flags: Various flags
* @pagep: Pointer to return the page
* @fsdata: Pointer to return fs data (unused)
*
diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c
index 60bdcaddcbe5..3458f153a588 100644
--- a/fs/ecryptfs/read_write.c
+++ b/fs/ecryptfs/read_write.c
@@ -64,11 +64,11 @@ int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode,
offset = ((((loff_t)page_for_lower->index) << PAGE_SHIFT)
+ offset_in_page);
- virt = kmap(page_for_lower);
+ virt = kmap_local_page(page_for_lower);
rc = ecryptfs_write_lower(ecryptfs_inode, virt, offset, size);
if (rc > 0)
rc = 0;
- kunmap(page_for_lower);
+ kunmap_local(virt);
return rc;
}
@@ -140,7 +140,7 @@ int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset,
ecryptfs_page_idx, rc);
goto out;
}
- ecryptfs_page_virt = kmap_atomic(ecryptfs_page);
+ ecryptfs_page_virt = kmap_local_page(ecryptfs_page);
/*
* pos: where we're now writing, offset: where the request was
@@ -163,7 +163,7 @@ int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset,
(data + data_offset), num_bytes);
data_offset += num_bytes;
}
- kunmap_atomic(ecryptfs_page_virt);
+ kunmap_local(ecryptfs_page_virt);
flush_dcache_page(ecryptfs_page);
SetPageUptodate(ecryptfs_page);
unlock_page(ecryptfs_page);
@@ -253,11 +253,11 @@ int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs,
int rc;
offset = ((((loff_t)page_index) << PAGE_SHIFT) + offset_in_page);
- virt = kmap(page_for_ecryptfs);
+ virt = kmap_local_page(page_for_ecryptfs);
rc = ecryptfs_read_lower(virt, offset, size, ecryptfs_inode);
if (rc > 0)
rc = 0;
- kunmap(page_for_ecryptfs);
+ kunmap_local(virt);
flush_dcache_page(page_for_ecryptfs);
return rc;
}
diff --git a/fs/efivarfs/file.c b/fs/efivarfs/file.c
index d57ee15874f9..59b52718a3a2 100644
--- a/fs/efivarfs/file.c
+++ b/fs/efivarfs/file.c
@@ -51,7 +51,7 @@ static ssize_t efivarfs_file_write(struct file *file,
} else {
inode_lock(inode);
i_size_write(inode, datasize + sizeof(attributes));
- inode->i_mtime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
inode_unlock(inode);
}
diff --git a/fs/efivarfs/inode.c b/fs/efivarfs/inode.c
index b973a2c03dde..db9231f0e77b 100644
--- a/fs/efivarfs/inode.c
+++ b/fs/efivarfs/inode.c
@@ -25,7 +25,7 @@ struct inode *efivarfs_get_inode(struct super_block *sb,
if (inode) {
inode->i_ino = get_next_ino();
inode->i_mode = mode;
- inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
inode->i_flags = is_removable ? 0 : S_IMMUTABLE;
switch (mode & S_IFMT) {
case S_IFREG:
diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c
index e028fafa04f3..996271473609 100644
--- a/fs/efivarfs/super.c
+++ b/fs/efivarfs/super.c
@@ -32,10 +32,16 @@ static int efivarfs_statfs(struct dentry *dentry, struct kstatfs *buf)
u64 storage_space, remaining_space, max_variable_size;
efi_status_t status;
- status = efivar_query_variable_info(attr, &storage_space, &remaining_space,
- &max_variable_size);
- if (status != EFI_SUCCESS)
- return efi_status_to_err(status);
+ /* Some UEFI firmware does not implement QueryVariableInfo() */
+ storage_space = remaining_space = 0;
+ if (efi_rt_services_supported(EFI_RT_SUPPORTED_QUERY_VARIABLE_INFO)) {
+ status = efivar_query_variable_info(attr, &storage_space,
+ &remaining_space,
+ &max_variable_size);
+ if (status != EFI_SUCCESS && status != EFI_UNSUPPORTED)
+ pr_warn_ratelimited("query_variable_info() failed: 0x%lx\n",
+ status);
+ }
/*
* This is not a normal filesystem, so no point in pretending it has a block
diff --git a/fs/efs/Kconfig b/fs/efs/Kconfig
index 2df1bac8b375..0833e533df9d 100644
--- a/fs/efs/Kconfig
+++ b/fs/efs/Kconfig
@@ -2,6 +2,7 @@
config EFS_FS
tristate "EFS file system support (read only)"
depends on BLOCK
+ select BUFFER_HEAD
help
EFS is an older file system used for non-ISO9660 CD-ROMs and hard
disk partitions by SGI's IRIX operating system (IRIX 6.0 and newer
diff --git a/fs/efs/efs.h b/fs/efs/efs.h
index 13a4d9622633..918d2b9abb76 100644
--- a/fs/efs/efs.h
+++ b/fs/efs/efs.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
- * Copyright (c) 1999 Al Smith
+ * Copyright (c) 1999 Al Smith, <Al.Smith@aeschi.ch.eu.org>
*
* Portions derived from work (c) 1995,1996 Christian Vogelgsang.
* Portions derived from IRIX header files (c) 1988 Silicon Graphics
@@ -19,9 +19,6 @@
#define EFS_VERSION "1.0a"
-static const char cprt[] = "EFS: "EFS_VERSION" - (c) 1999 Al Smith <Al.Smith@aeschi.ch.eu.org>";
-
-
/* 1 block is 512 bytes */
#define EFS_BLOCKSIZE_BITS 9
#define EFS_BLOCKSIZE (1 << EFS_BLOCKSIZE_BITS)
diff --git a/fs/efs/inode.c b/fs/efs/inode.c
index 3ba94bb005a6..3789d22ba501 100644
--- a/fs/efs/inode.c
+++ b/fs/efs/inode.c
@@ -105,8 +105,8 @@ struct inode *efs_iget(struct super_block *super, unsigned long ino)
inode->i_size = be32_to_cpu(efs_inode->di_size);
inode->i_atime.tv_sec = be32_to_cpu(efs_inode->di_atime);
inode->i_mtime.tv_sec = be32_to_cpu(efs_inode->di_mtime);
- inode->i_ctime.tv_sec = be32_to_cpu(efs_inode->di_ctime);
- inode->i_atime.tv_nsec = inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec = 0;
+ inode_set_ctime(inode, be32_to_cpu(efs_inode->di_ctime), 0);
+ inode->i_atime.tv_nsec = inode->i_mtime.tv_nsec = 0;
/* this is the number of blocks in the file */
if (inode->i_size == 0) {
diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig
index f259d92c9720..f6dc961e6c2b 100644
--- a/fs/erofs/Kconfig
+++ b/fs/erofs/Kconfig
@@ -38,6 +38,7 @@ config EROFS_FS_DEBUG
config EROFS_FS_XATTR
bool "EROFS extended attributes"
depends on EROFS_FS
+ select XXHASH
default y
help
Extended attributes are name:value pairs associated with inodes by
@@ -99,6 +100,21 @@ config EROFS_FS_ZIP_LZMA
If unsure, say N.
+config EROFS_FS_ZIP_DEFLATE
+ bool "EROFS DEFLATE compressed data support"
+ depends on EROFS_FS_ZIP
+ select ZLIB_INFLATE
+ help
+ Saying Y here includes support for reading EROFS file systems
+ containing DEFLATE compressed data. It gives better compression
+ ratios than the default LZ4 format, while it costs more CPU
+ overhead.
+
+ DEFLATE support is an experimental feature for now and so most
+ file systems will be readable without selecting this option.
+
+ If unsure, say N.
+
config EROFS_FS_ONDEMAND
bool "EROFS fscache-based on-demand read support"
depends on CACHEFILES_ONDEMAND && (EROFS_FS=m && FSCACHE || EROFS_FS=y && FSCACHE=y)
diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile
index a3a98fc3e481..994d0b9deddf 100644
--- a/fs/erofs/Makefile
+++ b/fs/erofs/Makefile
@@ -5,4 +5,5 @@ erofs-objs := super.o inode.o data.o namei.o dir.o utils.o sysfs.o
erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o
erofs-$(CONFIG_EROFS_FS_ZIP) += decompressor.o zmap.o zdata.o pcpubuf.o
erofs-$(CONFIG_EROFS_FS_ZIP_LZMA) += decompressor_lzma.o
+erofs-$(CONFIG_EROFS_FS_ZIP_DEFLATE) += decompressor_deflate.o
erofs-$(CONFIG_EROFS_FS_ONDEMAND) += fscache.o
diff --git a/fs/erofs/compress.h b/fs/erofs/compress.h
index b1b846504027..349c3316ae6b 100644
--- a/fs/erofs/compress.h
+++ b/fs/erofs/compress.h
@@ -94,4 +94,6 @@ extern const struct z_erofs_decompressor erofs_decompressors[];
/* prototypes for specific algorithms */
int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq,
struct page **pagepool);
+int z_erofs_deflate_decompress(struct z_erofs_decompress_req *rq,
+ struct page **pagepool);
#endif
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index db5e4b7636ec..0c2c99c58b5e 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -413,14 +413,14 @@ const struct address_space_operations erofs_raw_access_aops = {
#ifdef CONFIG_FS_DAX
static vm_fault_t erofs_dax_huge_fault(struct vm_fault *vmf,
- enum page_entry_size pe_size)
+ unsigned int order)
{
- return dax_iomap_fault(vmf, pe_size, NULL, NULL, &erofs_iomap_ops);
+ return dax_iomap_fault(vmf, order, NULL, NULL, &erofs_iomap_ops);
}
static vm_fault_t erofs_dax_fault(struct vm_fault *vmf)
{
- return erofs_dax_huge_fault(vmf, PE_SIZE_PTE);
+ return erofs_dax_huge_fault(vmf, 0);
}
static const struct vm_operations_struct erofs_dax_vm_ops = {
diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c
index cfad1eac7fd9..332ec5f74002 100644
--- a/fs/erofs/decompressor.c
+++ b/fs/erofs/decompressor.c
@@ -379,4 +379,10 @@ const struct z_erofs_decompressor erofs_decompressors[] = {
.name = "lzma"
},
#endif
+#ifdef CONFIG_EROFS_FS_ZIP_DEFLATE
+ [Z_EROFS_COMPRESSION_DEFLATE] = {
+ .decompress = z_erofs_deflate_decompress,
+ .name = "deflate"
+ },
+#endif
};
diff --git a/fs/erofs/decompressor_deflate.c b/fs/erofs/decompressor_deflate.c
new file mode 100644
index 000000000000..19e5bdeb30b6
--- /dev/null
+++ b/fs/erofs/decompressor_deflate.c
@@ -0,0 +1,247 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+#include <linux/module.h>
+#include <linux/zlib.h>
+#include "compress.h"
+
+struct z_erofs_deflate {
+ struct z_erofs_deflate *next;
+ struct z_stream_s z;
+ u8 bounce[PAGE_SIZE];
+};
+
+static DEFINE_SPINLOCK(z_erofs_deflate_lock);
+static unsigned int z_erofs_deflate_nstrms, z_erofs_deflate_avail_strms;
+static struct z_erofs_deflate *z_erofs_deflate_head;
+static DECLARE_WAIT_QUEUE_HEAD(z_erofs_deflate_wq);
+
+module_param_named(deflate_streams, z_erofs_deflate_nstrms, uint, 0444);
+
+void z_erofs_deflate_exit(void)
+{
+ /* there should be no running fs instance */
+ while (z_erofs_deflate_avail_strms) {
+ struct z_erofs_deflate *strm;
+
+ spin_lock(&z_erofs_deflate_lock);
+ strm = z_erofs_deflate_head;
+ if (!strm) {
+ spin_unlock(&z_erofs_deflate_lock);
+ continue;
+ }
+ z_erofs_deflate_head = NULL;
+ spin_unlock(&z_erofs_deflate_lock);
+
+ while (strm) {
+ struct z_erofs_deflate *n = strm->next;
+
+ vfree(strm->z.workspace);
+ kfree(strm);
+ --z_erofs_deflate_avail_strms;
+ strm = n;
+ }
+ }
+}
+
+int __init z_erofs_deflate_init(void)
+{
+ /* by default, use # of possible CPUs instead */
+ if (!z_erofs_deflate_nstrms)
+ z_erofs_deflate_nstrms = num_possible_cpus();
+
+ for (; z_erofs_deflate_avail_strms < z_erofs_deflate_nstrms;
+ ++z_erofs_deflate_avail_strms) {
+ struct z_erofs_deflate *strm;
+
+ strm = kzalloc(sizeof(*strm), GFP_KERNEL);
+ if (!strm)
+ goto out_failed;
+
+ /* XXX: in-kernel zlib cannot shrink windowbits currently */
+ strm->z.workspace = vmalloc(zlib_inflate_workspacesize());
+ if (!strm->z.workspace) {
+ kfree(strm);
+ goto out_failed;
+ }
+
+ spin_lock(&z_erofs_deflate_lock);
+ strm->next = z_erofs_deflate_head;
+ z_erofs_deflate_head = strm;
+ spin_unlock(&z_erofs_deflate_lock);
+ }
+ return 0;
+
+out_failed:
+ pr_err("failed to allocate zlib workspace\n");
+ z_erofs_deflate_exit();
+ return -ENOMEM;
+}
+
+int z_erofs_load_deflate_config(struct super_block *sb,
+ struct erofs_super_block *dsb,
+ struct z_erofs_deflate_cfgs *dfl, int size)
+{
+ if (!dfl || size < sizeof(struct z_erofs_deflate_cfgs)) {
+ erofs_err(sb, "invalid deflate cfgs, size=%u", size);
+ return -EINVAL;
+ }
+
+ if (dfl->windowbits > MAX_WBITS) {
+ erofs_err(sb, "unsupported windowbits %u", dfl->windowbits);
+ return -EOPNOTSUPP;
+ }
+
+ erofs_info(sb, "EXPERIMENTAL DEFLATE feature in use. Use at your own risk!");
+ return 0;
+}
+
+int z_erofs_deflate_decompress(struct z_erofs_decompress_req *rq,
+ struct page **pagepool)
+{
+ const unsigned int nrpages_out =
+ PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT;
+ const unsigned int nrpages_in =
+ PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT;
+ struct super_block *sb = rq->sb;
+ unsigned int insz, outsz, pofs;
+ struct z_erofs_deflate *strm;
+ u8 *kin, *kout = NULL;
+ bool bounced = false;
+ int no = -1, ni = 0, j = 0, zerr, err;
+
+ /* 1. get the exact DEFLATE compressed size */
+ kin = kmap_local_page(*rq->in);
+ err = z_erofs_fixup_insize(rq, kin + rq->pageofs_in,
+ min_t(unsigned int, rq->inputsize,
+ sb->s_blocksize - rq->pageofs_in));
+ if (err) {
+ kunmap_local(kin);
+ return err;
+ }
+
+ /* 2. get an available DEFLATE context */
+again:
+ spin_lock(&z_erofs_deflate_lock);
+ strm = z_erofs_deflate_head;
+ if (!strm) {
+ spin_unlock(&z_erofs_deflate_lock);
+ wait_event(z_erofs_deflate_wq, READ_ONCE(z_erofs_deflate_head));
+ goto again;
+ }
+ z_erofs_deflate_head = strm->next;
+ spin_unlock(&z_erofs_deflate_lock);
+
+ /* 3. multi-call decompress */
+ insz = rq->inputsize;
+ outsz = rq->outputsize;
+ zerr = zlib_inflateInit2(&strm->z, -MAX_WBITS);
+ if (zerr != Z_OK) {
+ err = -EIO;
+ goto failed_zinit;
+ }
+
+ pofs = rq->pageofs_out;
+ strm->z.avail_in = min_t(u32, insz, PAGE_SIZE - rq->pageofs_in);
+ insz -= strm->z.avail_in;
+ strm->z.next_in = kin + rq->pageofs_in;
+ strm->z.avail_out = 0;
+
+ while (1) {
+ if (!strm->z.avail_out) {
+ if (++no >= nrpages_out || !outsz) {
+ erofs_err(sb, "insufficient space for decompressed data");
+ err = -EFSCORRUPTED;
+ break;
+ }
+
+ if (kout)
+ kunmap_local(kout);
+ strm->z.avail_out = min_t(u32, outsz, PAGE_SIZE - pofs);
+ outsz -= strm->z.avail_out;
+ if (!rq->out[no]) {
+ rq->out[no] = erofs_allocpage(pagepool,
+ GFP_KERNEL | __GFP_NOFAIL);
+ set_page_private(rq->out[no],
+ Z_EROFS_SHORTLIVED_PAGE);
+ }
+ kout = kmap_local_page(rq->out[no]);
+ strm->z.next_out = kout + pofs;
+ pofs = 0;
+ }
+
+ if (!strm->z.avail_in && insz) {
+ if (++ni >= nrpages_in) {
+ erofs_err(sb, "invalid compressed data");
+ err = -EFSCORRUPTED;
+ break;
+ }
+
+ if (kout) { /* unlike kmap(), take care of the orders */
+ j = strm->z.next_out - kout;
+ kunmap_local(kout);
+ }
+ kunmap_local(kin);
+ strm->z.avail_in = min_t(u32, insz, PAGE_SIZE);
+ insz -= strm->z.avail_in;
+ kin = kmap_local_page(rq->in[ni]);
+ strm->z.next_in = kin;
+ bounced = false;
+ if (kout) {
+ kout = kmap_local_page(rq->out[no]);
+ strm->z.next_out = kout + j;
+ }
+ }
+
+ /*
+ * Handle overlapping: Use bounced buffer if the compressed
+ * data is under processing; Or use short-lived pages from the
+ * on-stack pagepool where pages share among the same request
+ * and not _all_ inplace I/O pages are needed to be doubled.
+ */
+ if (!bounced && rq->out[no] == rq->in[ni]) {
+ memcpy(strm->bounce, strm->z.next_in, strm->z.avail_in);
+ strm->z.next_in = strm->bounce;
+ bounced = true;
+ }
+
+ for (j = ni + 1; j < nrpages_in; ++j) {
+ struct page *tmppage;
+
+ if (rq->out[no] != rq->in[j])
+ continue;
+
+ DBG_BUGON(erofs_page_is_managed(EROFS_SB(sb),
+ rq->in[j]));
+ tmppage = erofs_allocpage(pagepool,
+ GFP_KERNEL | __GFP_NOFAIL);
+ set_page_private(tmppage, Z_EROFS_SHORTLIVED_PAGE);
+ copy_highpage(tmppage, rq->in[j]);
+ rq->in[j] = tmppage;
+ }
+
+ zerr = zlib_inflate(&strm->z, Z_SYNC_FLUSH);
+ if (zerr != Z_OK || !(outsz + strm->z.avail_out)) {
+ if (zerr == Z_OK && rq->partial_decoding)
+ break;
+ if (zerr == Z_STREAM_END && !outsz)
+ break;
+ erofs_err(sb, "failed to decompress %d in[%u] out[%u]",
+ zerr, rq->inputsize, rq->outputsize);
+ err = -EFSCORRUPTED;
+ break;
+ }
+ }
+
+ if (zlib_inflateEnd(&strm->z) != Z_OK && !err)
+ err = -EIO;
+ if (kout)
+ kunmap_local(kout);
+failed_zinit:
+ kunmap_local(kin);
+ /* 4. push back DEFLATE stream context to the global list */
+ spin_lock(&z_erofs_deflate_lock);
+ strm->next = z_erofs_deflate_head;
+ z_erofs_deflate_head = strm;
+ spin_unlock(&z_erofs_deflate_lock);
+ wake_up(&z_erofs_deflate_wq);
+ return err;
+}
diff --git a/fs/erofs/decompressor_lzma.c b/fs/erofs/decompressor_lzma.c
index 73091fbe3ea4..dee10d22ada9 100644
--- a/fs/erofs/decompressor_lzma.c
+++ b/fs/erofs/decompressor_lzma.c
@@ -217,9 +217,12 @@ again:
strm->buf.out_size = min_t(u32, outlen,
PAGE_SIZE - pageofs);
outlen -= strm->buf.out_size;
- if (!rq->out[no] && rq->fillgaps) /* deduped */
+ if (!rq->out[no] && rq->fillgaps) { /* deduped */
rq->out[no] = erofs_allocpage(pagepool,
GFP_KERNEL | __GFP_NOFAIL);
+ set_page_private(rq->out[no],
+ Z_EROFS_SHORTLIVED_PAGE);
+ }
if (rq->out[no])
strm->buf.out = kmap(rq->out[no]) + pageofs;
pageofs = 0;
diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h
index 2c7b16e340fe..a03ec70ba6f2 100644
--- a/fs/erofs/erofs_fs.h
+++ b/fs/erofs/erofs_fs.h
@@ -13,6 +13,7 @@
#define EROFS_FEATURE_COMPAT_SB_CHKSUM 0x00000001
#define EROFS_FEATURE_COMPAT_MTIME 0x00000002
+#define EROFS_FEATURE_COMPAT_XATTR_FILTER 0x00000004
/*
* Any bits that aren't in EROFS_ALL_FEATURE_INCOMPAT should
@@ -81,7 +82,8 @@ struct erofs_super_block {
__u8 xattr_prefix_count; /* # of long xattr name prefixes */
__le32 xattr_prefix_start; /* start of long xattr prefixes */
__le64 packed_nid; /* nid of the special packed inode */
- __u8 reserved2[24];
+ __u8 xattr_filter_reserved; /* reserved for xattr name filter */
+ __u8 reserved2[23];
};
/*
@@ -200,7 +202,7 @@ struct erofs_inode_extended {
* for read-only fs, no need to introduce h_refcount
*/
struct erofs_xattr_ibody_header {
- __le32 h_reserved;
+ __le32 h_name_filter; /* bit value 1 indicates not-present */
__u8 h_shared_count;
__u8 h_reserved2[7];
__le32 h_shared_xattrs[]; /* shared xattr id array */
@@ -221,6 +223,10 @@ struct erofs_xattr_ibody_header {
#define EROFS_XATTR_LONG_PREFIX 0x80
#define EROFS_XATTR_LONG_PREFIX_MASK 0x7f
+#define EROFS_XATTR_FILTER_BITS 32
+#define EROFS_XATTR_FILTER_DEFAULT UINT32_MAX
+#define EROFS_XATTR_FILTER_SEED 0x25BBE08F
+
/* xattr entry (for both inline & shared xattrs) */
struct erofs_xattr_entry {
__u8 e_name_len; /* length of name */
@@ -289,6 +295,7 @@ struct erofs_dirent {
enum {
Z_EROFS_COMPRESSION_LZ4 = 0,
Z_EROFS_COMPRESSION_LZMA = 1,
+ Z_EROFS_COMPRESSION_DEFLATE = 2,
Z_EROFS_COMPRESSION_MAX
};
#define Z_EROFS_ALL_COMPR_ALGS ((1 << Z_EROFS_COMPRESSION_MAX) - 1)
@@ -309,6 +316,12 @@ struct z_erofs_lzma_cfgs {
#define Z_EROFS_LZMA_MAX_DICT_SIZE (8 * Z_EROFS_PCLUSTER_MAX_SIZE)
+/* 6 bytes (+ length field = 8 bytes) */
+struct z_erofs_deflate_cfgs {
+ u8 windowbits; /* 8..15 for DEFLATE */
+ u8 reserved[5];
+} __packed;
+
/*
* bit 0 : COMPACTED_2B indexes (0 - off; 1 - on)
* e.g. for 4k logical cluster size, 4B if compacted 2B is off;
diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index e12592727a54..edc8ec7581b8 100644
--- a/fs/erofs/inode.c
+++ b/fs/erofs/inode.c
@@ -105,8 +105,8 @@ static void *erofs_read_inode(struct erofs_buf *buf,
set_nlink(inode, le32_to_cpu(die->i_nlink));
/* extended inode has its own timestamp */
- inode->i_ctime.tv_sec = le64_to_cpu(die->i_mtime);
- inode->i_ctime.tv_nsec = le32_to_cpu(die->i_mtime_nsec);
+ inode_set_ctime(inode, le64_to_cpu(die->i_mtime),
+ le32_to_cpu(die->i_mtime_nsec));
inode->i_size = le64_to_cpu(die->i_size);
@@ -148,8 +148,7 @@ static void *erofs_read_inode(struct erofs_buf *buf,
set_nlink(inode, le16_to_cpu(dic->i_nlink));
/* use build time for compact inodes */
- inode->i_ctime.tv_sec = sbi->build_time;
- inode->i_ctime.tv_nsec = sbi->build_time_nsec;
+ inode_set_ctime(inode, sbi->build_time, sbi->build_time_nsec);
inode->i_size = le32_to_cpu(dic->i_size);
if (erofs_inode_is_data_compressed(vi->datalayout))
@@ -176,10 +175,7 @@ static void *erofs_read_inode(struct erofs_buf *buf,
vi->chunkbits = sb->s_blocksize_bits +
(vi->chunkformat & EROFS_CHUNK_FORMAT_BLKBITS_MASK);
}
- inode->i_mtime.tv_sec = inode->i_ctime.tv_sec;
- inode->i_atime.tv_sec = inode->i_ctime.tv_sec;
- inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec;
- inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec;
+ inode->i_mtime = inode->i_atime = inode_get_ctime(inode);
inode->i_flags &= ~S_DAX;
if (test_opt(&sbi->opt, DAX_ALWAYS) && S_ISREG(inode->i_mode) &&
@@ -373,7 +369,7 @@ int erofs_getattr(struct mnt_idmap *idmap, const struct path *path,
stat->attributes_mask |= (STATX_ATTR_COMPRESSED |
STATX_ATTR_IMMUTABLE);
- generic_fillattr(idmap, inode, stat);
+ generic_fillattr(idmap, request_mask, inode, stat);
return 0;
}
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 36e32fa542f0..4ff88d0dd980 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -151,6 +151,7 @@ struct erofs_sb_info {
u32 xattr_prefix_start;
u8 xattr_prefix_count;
struct erofs_xattr_prefix_item *xattr_prefixes;
+ unsigned int xattr_filter_reserved;
#endif
u16 device_id_mask; /* valid bits of device id to be used */
@@ -251,6 +252,7 @@ EROFS_FEATURE_FUNCS(fragments, incompat, INCOMPAT_FRAGMENTS)
EROFS_FEATURE_FUNCS(dedupe, incompat, INCOMPAT_DEDUPE)
EROFS_FEATURE_FUNCS(xattr_prefixes, incompat, INCOMPAT_XATTR_PREFIXES)
EROFS_FEATURE_FUNCS(sb_chksum, compat, COMPAT_SB_CHKSUM)
+EROFS_FEATURE_FUNCS(xattr_filter, compat, COMPAT_XATTR_FILTER)
/* atomic flag definitions */
#define EROFS_I_EA_INITED_BIT 0
@@ -270,6 +272,7 @@ struct erofs_inode {
unsigned char inode_isize;
unsigned int xattr_isize;
+ unsigned int xattr_name_filter;
unsigned int xattr_shared_count;
unsigned int *xattr_shared_xattrs;
@@ -519,6 +522,26 @@ static inline int z_erofs_load_lzma_config(struct super_block *sb,
}
#endif /* !CONFIG_EROFS_FS_ZIP_LZMA */
+#ifdef CONFIG_EROFS_FS_ZIP_DEFLATE
+int __init z_erofs_deflate_init(void);
+void z_erofs_deflate_exit(void);
+int z_erofs_load_deflate_config(struct super_block *sb,
+ struct erofs_super_block *dsb,
+ struct z_erofs_deflate_cfgs *dfl, int size);
+#else
+static inline int z_erofs_deflate_init(void) { return 0; }
+static inline int z_erofs_deflate_exit(void) { return 0; }
+static inline int z_erofs_load_deflate_config(struct super_block *sb,
+ struct erofs_super_block *dsb,
+ struct z_erofs_deflate_cfgs *dfl, int size) {
+ if (dfl) {
+ erofs_err(sb, "deflate algorithm isn't enabled");
+ return -EINVAL;
+ }
+ return 0;
+}
+#endif /* !CONFIG_EROFS_FS_ZIP_DEFLATE */
+
#ifdef CONFIG_EROFS_FS_ONDEMAND
int erofs_fscache_register_fs(struct super_block *sb);
void erofs_fscache_unregister_fs(struct super_block *sb);
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 566f68ddfa36..3700af9ee173 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -19,10 +19,8 @@
#include <trace/events/erofs.h>
static struct kmem_cache *erofs_inode_cachep __read_mostly;
-struct file_system_type erofs_fs_type;
-void _erofs_err(struct super_block *sb, const char *function,
- const char *fmt, ...)
+void _erofs_err(struct super_block *sb, const char *func, const char *fmt, ...)
{
struct va_format vaf;
va_list args;
@@ -32,12 +30,11 @@ void _erofs_err(struct super_block *sb, const char *function,
vaf.fmt = fmt;
vaf.va = &args;
- pr_err("(device %s): %s: %pV", sb->s_id, function, &vaf);
+ pr_err("(device %s): %s: %pV", sb->s_id, func, &vaf);
va_end(args);
}
-void _erofs_info(struct super_block *sb, const char *function,
- const char *fmt, ...)
+void _erofs_info(struct super_block *sb, const char *func, const char *fmt, ...)
{
struct va_format vaf;
va_list args;
@@ -102,11 +99,9 @@ static void erofs_free_inode(struct inode *inode)
{
struct erofs_inode *vi = EROFS_I(inode);
- /* be careful of RCU symlink path */
if (inode->i_op == &erofs_fast_symlink_iops)
kfree(inode->i_link);
kfree(vi->xattr_shared_xattrs);
-
kmem_cache_free(erofs_inode_cachep, vi);
}
@@ -119,8 +114,7 @@ static bool check_layout_compatibility(struct super_block *sb,
/* check if current kernel meets all mandatory requirements */
if (feature & (~EROFS_ALL_FEATURE_INCOMPAT)) {
- erofs_err(sb,
- "unidentified incompatible feature %x, please upgrade kernel version",
+ erofs_err(sb, "unidentified incompatible feature %x, please upgrade kernel",
feature & ~EROFS_ALL_FEATURE_INCOMPAT);
return false;
}
@@ -201,6 +195,9 @@ static int erofs_load_compr_cfgs(struct super_block *sb,
case Z_EROFS_COMPRESSION_LZMA:
ret = z_erofs_load_lzma_config(sb, dsb, data, size);
break;
+ case Z_EROFS_COMPRESSION_DEFLATE:
+ ret = z_erofs_load_deflate_config(sb, dsb, data, size);
+ break;
default:
DBG_BUGON(1);
ret = -EFAULT;
@@ -238,7 +235,7 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb,
return PTR_ERR(ptr);
dis = ptr + erofs_blkoff(sb, *pos);
- if (!dif->path) {
+ if (!sbi->devs->flatdev && !dif->path) {
if (!dis->tag[0]) {
erofs_err(sb, "empty device tag @ pos %llu", *pos);
return -EINVAL;
@@ -388,6 +385,7 @@ static int erofs_read_superblock(struct super_block *sb)
sbi->xattr_blkaddr = le32_to_cpu(dsb->xattr_blkaddr);
sbi->xattr_prefix_start = le32_to_cpu(dsb->xattr_prefix_start);
sbi->xattr_prefix_count = dsb->xattr_prefix_count;
+ sbi->xattr_filter_reserved = dsb->xattr_filter_reserved;
#endif
sbi->islotbits = ilog2(sizeof(struct erofs_inode_compact));
sbi->root_nid = le16_to_cpu(dsb->root_nid);
@@ -420,16 +418,11 @@ static int erofs_read_superblock(struct super_block *sb)
if (erofs_is_fscache_mode(sb))
erofs_info(sb, "EXPERIMENTAL fscache-based on-demand read feature in use. Use at your own risk!");
- if (erofs_sb_has_fragments(sbi))
- erofs_info(sb, "EXPERIMENTAL compressed fragments feature in use. Use at your own risk!");
- if (erofs_sb_has_dedupe(sbi))
- erofs_info(sb, "EXPERIMENTAL global deduplication feature in use. Use at your own risk!");
out:
erofs_put_metabuf(&buf);
return ret;
}
-/* set up default EROFS parameters */
static void erofs_default_options(struct erofs_fs_context *ctx)
{
#ifdef CONFIG_EROFS_FS_ZIP
@@ -731,7 +724,6 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
xa_init(&sbi->managed_pslots);
#endif
- /* get the root inode */
inode = erofs_iget(sb, ROOT_NID(sbi));
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -748,7 +740,6 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
return -ENOMEM;
erofs_shrinker_register(sb);
- /* sb->s_umount is already locked, SB_ACTIVE and SB_BORN are not set */
if (erofs_sb_has_fragments(sbi) && sbi->packed_nid) {
sbi->packed_inode = erofs_iget(sb, sbi->packed_nid);
if (IS_ERR(sbi->packed_inode)) {
@@ -881,10 +872,6 @@ static int erofs_init_fs_context(struct fs_context *fc)
return 0;
}
-/*
- * could be triggered after deactivate_locked_super()
- * is called, thus including umount and failed to initialize.
- */
static void erofs_kill_sb(struct super_block *sb)
{
struct erofs_sb_info *sbi;
@@ -913,7 +900,6 @@ static void erofs_kill_sb(struct super_block *sb)
sb->s_fs_info = NULL;
}
-/* called when ->s_root is non-NULL */
static void erofs_put_super(struct super_block *sb)
{
struct erofs_sb_info *const sbi = EROFS_SB(sb);
@@ -950,9 +936,9 @@ static int __init erofs_module_init(void)
erofs_check_ondisk_layout_definitions();
erofs_inode_cachep = kmem_cache_create("erofs_inode",
- sizeof(struct erofs_inode), 0,
- SLAB_RECLAIM_ACCOUNT,
- erofs_inode_init_once);
+ sizeof(struct erofs_inode), 0,
+ SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT,
+ erofs_inode_init_once);
if (!erofs_inode_cachep)
return -ENOMEM;
@@ -964,6 +950,10 @@ static int __init erofs_module_init(void)
if (err)
goto lzma_err;
+ err = z_erofs_deflate_init();
+ if (err)
+ goto deflate_err;
+
erofs_pcpubuf_init();
err = z_erofs_init_zip_subsystem();
if (err)
@@ -984,6 +974,8 @@ fs_err:
sysfs_err:
z_erofs_exit_zip_subsystem();
zip_err:
+ z_erofs_deflate_exit();
+deflate_err:
z_erofs_lzma_exit();
lzma_err:
erofs_exit_shrinker();
@@ -1001,13 +993,13 @@ static void __exit erofs_module_exit(void)
erofs_exit_sysfs();
z_erofs_exit_zip_subsystem();
+ z_erofs_deflate_exit();
z_erofs_lzma_exit();
erofs_exit_shrinker();
kmem_cache_destroy(erofs_inode_cachep);
erofs_pcpubuf_exit();
}
-/* get filesystem statistics */
static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct super_block *sb = dentry->d_sb;
diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c
index 40178b6e0688..09d341675e89 100644
--- a/fs/erofs/xattr.c
+++ b/fs/erofs/xattr.c
@@ -5,6 +5,7 @@
* Copyright (C) 2021-2022, Alibaba Cloud
*/
#include <linux/security.h>
+#include <linux/xxhash.h>
#include "xattr.h"
struct erofs_xattr_iter {
@@ -87,6 +88,7 @@ static int erofs_init_inode_xattrs(struct inode *inode)
}
ih = it.kaddr + erofs_blkoff(sb, it.pos);
+ vi->xattr_name_filter = le32_to_cpu(ih->h_name_filter);
vi->xattr_shared_count = ih->h_shared_count;
vi->xattr_shared_xattrs = kmalloc_array(vi->xattr_shared_count,
sizeof(uint), GFP_KERNEL);
@@ -392,7 +394,10 @@ int erofs_getxattr(struct inode *inode, int index, const char *name,
void *buffer, size_t buffer_size)
{
int ret;
+ unsigned int hashbit;
struct erofs_xattr_iter it;
+ struct erofs_inode *vi = EROFS_I(inode);
+ struct erofs_sb_info *sbi = EROFS_SB(inode->i_sb);
if (!name)
return -EINVAL;
@@ -401,6 +406,15 @@ int erofs_getxattr(struct inode *inode, int index, const char *name,
if (ret)
return ret;
+ /* reserved flag is non-zero if there's any change of on-disk format */
+ if (erofs_sb_has_xattr_filter(sbi) && !sbi->xattr_filter_reserved) {
+ hashbit = xxh32(name, strlen(name),
+ EROFS_XATTR_FILTER_SEED + index);
+ hashbit &= EROFS_XATTR_FILTER_BITS - 1;
+ if (vi->xattr_name_filter & (1U << hashbit))
+ return -ENOATTR;
+ }
+
it.index = index;
it.name = (struct qstr)QSTR_INIT(name, strlen(name));
if (it.name.len > EROFS_NAME_LEN)
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index de4f12152b62..036f610e044b 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -143,22 +143,17 @@ static inline void z_erofs_onlinepage_split(struct page *page)
atomic_inc((atomic_t *)&page->private);
}
-static inline void z_erofs_page_mark_eio(struct page *page)
+static void z_erofs_onlinepage_endio(struct page *page, int err)
{
- int orig;
+ int orig, v;
+
+ DBG_BUGON(!PagePrivate(page));
do {
orig = atomic_read((atomic_t *)&page->private);
- } while (atomic_cmpxchg((atomic_t *)&page->private, orig,
- orig | Z_EROFS_PAGE_EIO) != orig);
-}
-
-static inline void z_erofs_onlinepage_endio(struct page *page)
-{
- unsigned int v;
+ v = (orig - 1) | (err ? Z_EROFS_PAGE_EIO : 0);
+ } while (atomic_cmpxchg((atomic_t *)&page->private, orig, v) != orig);
- DBG_BUGON(!PagePrivate(page));
- v = atomic_dec_return((atomic_t *)&page->private);
if (!(v & ~Z_EROFS_PAGE_EIO)) {
set_page_private(page, 0);
ClearPagePrivate(page);
@@ -507,19 +502,17 @@ enum z_erofs_pclustermode {
*/
Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE,
/*
- * The current collection has been linked with the owned chain, and
- * could also be linked with the remaining collections, which means
- * if the processing page is the tail page of the collection, thus
- * the current collection can safely use the whole page (since
- * the previous collection is under control) for in-place I/O, as
- * illustrated below:
- * ________________________________________________________________
- * | tail (partial) page | head (partial) page |
- * | (of the current cl) | (of the previous collection) |
- * | | |
- * |__PCLUSTER_FOLLOWED___|___________PCLUSTER_FOLLOWED____________|
+ * The pcluster was just linked to a decompression chain by us. It can
+ * also be linked with the remaining pclusters, which means if the
+ * processing page is the tail page of a pcluster, this pcluster can
+ * safely use the whole page (since the previous pcluster is within the
+ * same chain) for in-place I/O, as illustrated below:
+ * ___________________________________________________
+ * | tail (partial) page | head (partial) page |
+ * | (of the current pcl) | (of the previous pcl) |
+ * |___PCLUSTER_FOLLOWED___|_____PCLUSTER_FOLLOWED_____|
*
- * [ (*) the above page can be used as inplace I/O. ]
+ * [ (*) the page above can be used as inplace I/O. ]
*/
Z_EROFS_PCLUSTER_FOLLOWED,
};
@@ -535,8 +528,6 @@ struct z_erofs_decompress_frontend {
z_erofs_next_pcluster_t owned_head;
enum z_erofs_pclustermode mode;
- /* used for applying cache strategy on the fly */
- bool backmost;
erofs_off_t headoffset;
/* a pointer used to pick up inplace I/O pages */
@@ -545,7 +536,7 @@ struct z_erofs_decompress_frontend {
#define DECOMPRESS_FRONTEND_INIT(__i) { \
.inode = __i, .owned_head = Z_EROFS_PCLUSTER_TAIL, \
- .mode = Z_EROFS_PCLUSTER_FOLLOWED, .backmost = true }
+ .mode = Z_EROFS_PCLUSTER_FOLLOWED }
static bool z_erofs_should_alloc_cache(struct z_erofs_decompress_frontend *fe)
{
@@ -554,7 +545,7 @@ static bool z_erofs_should_alloc_cache(struct z_erofs_decompress_frontend *fe)
if (cachestrategy <= EROFS_ZIP_CACHE_DISABLED)
return false;
- if (fe->backmost)
+ if (!(fe->map.m_flags & EROFS_MAP_FULL_MAPPED))
return true;
if (cachestrategy >= EROFS_ZIP_CACHE_READAROUND &&
@@ -851,9 +842,11 @@ err_out:
return err;
}
-static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe)
+static int z_erofs_pcluster_begin(struct z_erofs_decompress_frontend *fe)
{
struct erofs_map_blocks *map = &fe->map;
+ struct super_block *sb = fe->inode->i_sb;
+ erofs_blk_t blknr = erofs_blknr(sb, map->m_pa);
struct erofs_workgroup *grp = NULL;
int ret;
@@ -863,8 +856,7 @@ static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe)
DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_NIL);
if (!(map->m_flags & EROFS_MAP_META)) {
- grp = erofs_find_workgroup(fe->inode->i_sb,
- map->m_pa >> PAGE_SHIFT);
+ grp = erofs_find_workgroup(sb, blknr);
} else if ((map->m_pa & ~PAGE_MASK) + map->m_plen > PAGE_SIZE) {
DBG_BUGON(1);
return -EFSCORRUPTED;
@@ -883,9 +875,26 @@ static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe)
} else if (ret) {
return ret;
}
+
z_erofs_bvec_iter_begin(&fe->biter, &fe->pcl->bvset,
Z_EROFS_INLINE_BVECS, fe->pcl->vcnt);
- /* since file-backed online pages are traversed in reverse order */
+ if (!z_erofs_is_inline_pcluster(fe->pcl)) {
+ /* bind cache first when cached decompression is preferred */
+ z_erofs_bind_cache(fe);
+ } else {
+ void *mptr;
+
+ mptr = erofs_read_metabuf(&map->buf, sb, blknr, EROFS_NO_KMAP);
+ if (IS_ERR(mptr)) {
+ ret = PTR_ERR(mptr);
+ erofs_err(sb, "failed to get inline data %d", ret);
+ return ret;
+ }
+ get_page(map->buf.page);
+ WRITE_ONCE(fe->pcl->compressed_bvecs[0].page, map->buf.page);
+ fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE;
+ }
+ /* file-backed inplace I/O pages are traversed in reverse order */
fe->icur = z_erofs_pclusterpages(fe->pcl);
return 0;
}
@@ -908,12 +917,12 @@ void erofs_workgroup_free_rcu(struct erofs_workgroup *grp)
call_rcu(&pcl->rcu, z_erofs_rcu_callback);
}
-static bool z_erofs_collector_end(struct z_erofs_decompress_frontend *fe)
+static void z_erofs_pcluster_end(struct z_erofs_decompress_frontend *fe)
{
struct z_erofs_pcluster *pcl = fe->pcl;
if (!pcl)
- return false;
+ return;
z_erofs_bvec_iter_end(&fe->biter);
mutex_unlock(&pcl->lock);
@@ -929,37 +938,29 @@ static bool z_erofs_collector_end(struct z_erofs_decompress_frontend *fe)
erofs_workgroup_put(&pcl->obj);
fe->pcl = NULL;
- return true;
}
-static int z_erofs_read_fragment(struct inode *inode, erofs_off_t pos,
- struct page *page, unsigned int pageofs,
- unsigned int len)
+static int z_erofs_read_fragment(struct super_block *sb, struct page *page,
+ unsigned int cur, unsigned int end, erofs_off_t pos)
{
- struct super_block *sb = inode->i_sb;
- struct inode *packed_inode = EROFS_I_SB(inode)->packed_inode;
+ struct inode *packed_inode = EROFS_SB(sb)->packed_inode;
struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
- u8 *src, *dst;
- unsigned int i, cnt;
+ unsigned int cnt;
+ u8 *src;
if (!packed_inode)
return -EFSCORRUPTED;
buf.inode = packed_inode;
- pos += EROFS_I(inode)->z_fragmentoff;
- for (i = 0; i < len; i += cnt) {
- cnt = min_t(unsigned int, len - i,
+ for (; cur < end; cur += cnt, pos += cnt) {
+ cnt = min_t(unsigned int, end - cur,
sb->s_blocksize - erofs_blkoff(sb, pos));
src = erofs_bread(&buf, erofs_blknr(sb, pos), EROFS_KMAP);
if (IS_ERR(src)) {
erofs_put_metabuf(&buf);
return PTR_ERR(src);
}
-
- dst = kmap_local_page(page);
- memcpy(dst + pageofs + i, src + erofs_blkoff(sb, pos), cnt);
- kunmap_local(dst);
- pos += cnt;
+ memcpy_to_page(page, cur, src + erofs_blkoff(sb, pos), cnt);
}
erofs_put_metabuf(&buf);
return 0;
@@ -972,94 +973,60 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe,
struct erofs_map_blocks *const map = &fe->map;
const loff_t offset = page_offset(page);
bool tight = true, exclusive;
- unsigned int cur, end, spiltted;
+ unsigned int cur, end, len, split;
int err = 0;
- /* register locked file pages as online pages in pack */
z_erofs_onlinepage_init(page);
- spiltted = 0;
+ split = 0;
end = PAGE_SIZE;
repeat:
- cur = end - 1;
-
- if (offset + cur < map->m_la ||
- offset + cur >= map->m_la + map->m_llen) {
- if (z_erofs_collector_end(fe))
- fe->backmost = false;
- map->m_la = offset + cur;
+ if (offset + end - 1 < map->m_la ||
+ offset + end - 1 >= map->m_la + map->m_llen) {
+ z_erofs_pcluster_end(fe);
+ map->m_la = offset + end - 1;
map->m_llen = 0;
err = z_erofs_map_blocks_iter(inode, map, 0);
if (err)
goto out;
- } else {
- if (fe->pcl)
- goto hitted;
- /* didn't get a valid pcluster previously (very rare) */
}
- if (!(map->m_flags & EROFS_MAP_MAPPED) ||
- map->m_flags & EROFS_MAP_FRAGMENT)
- goto hitted;
-
- err = z_erofs_collector_begin(fe);
- if (err)
- goto out;
-
- if (z_erofs_is_inline_pcluster(fe->pcl)) {
- void *mp;
-
- mp = erofs_read_metabuf(&fe->map.buf, inode->i_sb,
- erofs_blknr(inode->i_sb, map->m_pa),
- EROFS_NO_KMAP);
- if (IS_ERR(mp)) {
- err = PTR_ERR(mp);
- erofs_err(inode->i_sb,
- "failed to get inline page, err %d", err);
- goto out;
- }
- get_page(fe->map.buf.page);
- WRITE_ONCE(fe->pcl->compressed_bvecs[0].page,
- fe->map.buf.page);
- fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE;
- } else {
- /* bind cache first when cached decompression is preferred */
- z_erofs_bind_cache(fe);
- }
-hitted:
- /*
- * Ensure the current partial page belongs to this submit chain rather
- * than other concurrent submit chains or the noio(bypass) chain since
- * those chains are handled asynchronously thus the page cannot be used
- * for inplace I/O or bvpage (should be processed in a strict order.)
- */
- tight &= (fe->mode > Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE);
+ cur = offset > map->m_la ? 0 : map->m_la - offset;
+ /* bump split parts first to avoid several separate cases */
+ ++split;
- cur = end - min_t(erofs_off_t, offset + end - map->m_la, end);
if (!(map->m_flags & EROFS_MAP_MAPPED)) {
zero_user_segment(page, cur, end);
+ tight = false;
goto next_part;
}
+
if (map->m_flags & EROFS_MAP_FRAGMENT) {
- unsigned int pageofs, skip, len;
+ erofs_off_t fpos = offset + cur - map->m_la;
- if (offset > map->m_la) {
- pageofs = 0;
- skip = offset - map->m_la;
- } else {
- pageofs = map->m_la & ~PAGE_MASK;
- skip = 0;
- }
- len = min_t(unsigned int, map->m_llen - skip, end - cur);
- err = z_erofs_read_fragment(inode, skip, page, pageofs, len);
+ len = min_t(unsigned int, map->m_llen - fpos, end - cur);
+ err = z_erofs_read_fragment(inode->i_sb, page, cur, cur + len,
+ EROFS_I(inode)->z_fragmentoff + fpos);
if (err)
goto out;
- ++spiltted;
tight = false;
goto next_part;
}
- exclusive = (!cur && (!spiltted || tight));
+ if (!fe->pcl) {
+ err = z_erofs_pcluster_begin(fe);
+ if (err)
+ goto out;
+ }
+
+ /*
+ * Ensure the current partial page belongs to this submit chain rather
+ * than other concurrent submit chains or the noio(bypass) chain since
+ * those chains are handled asynchronously thus the page cannot be used
+ * for inplace I/O or bvpage (should be processed in a strict order.)
+ */
+ tight &= (fe->mode > Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE);
+ exclusive = (!cur && ((split <= 1) || tight));
if (cur)
tight &= (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED);
@@ -1072,8 +1039,6 @@ hitted:
goto out;
z_erofs_onlinepage_split(page);
- /* bump up the number of spiltted parts of a page */
- ++spiltted;
if (fe->pcl->pageofs_out != (map->m_la & ~PAGE_MASK))
fe->pcl->multibases = true;
if (fe->pcl->length < offset + end - map->m_la) {
@@ -1094,9 +1059,7 @@ next_part:
goto repeat;
out:
- if (err)
- z_erofs_page_mark_eio(page);
- z_erofs_onlinepage_endio(page);
+ z_erofs_onlinepage_endio(page, err);
return err;
}
@@ -1199,9 +1162,7 @@ static void z_erofs_fill_other_copies(struct z_erofs_decompress_backend *be,
cur += len;
}
kunmap_local(dst);
- if (err)
- z_erofs_page_mark_eio(bvi->bvec.page);
- z_erofs_onlinepage_endio(bvi->bvec.page);
+ z_erofs_onlinepage_endio(bvi->bvec.page, err);
list_del(p);
kfree(bvi);
}
@@ -1372,9 +1333,7 @@ out:
/* recycle all individual short-lived pages */
if (z_erofs_put_shortlivedpage(be->pagepool, page))
continue;
- if (err)
- z_erofs_page_mark_eio(page);
- z_erofs_onlinepage_endio(page);
+ z_erofs_onlinepage_endio(page, err);
}
if (be->decompressed_pages != be->onstack_pages)
@@ -1410,7 +1369,10 @@ static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io,
owned = READ_ONCE(be.pcl->next);
z_erofs_decompress_pcluster(&be, io->eio ? -EIO : 0);
- erofs_workgroup_put(&be.pcl->obj);
+ if (z_erofs_is_inline_pcluster(be.pcl))
+ z_erofs_free_pcluster(be.pcl);
+ else
+ erofs_workgroup_put(&be.pcl->obj);
}
}
@@ -1848,15 +1810,10 @@ static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f,
page = erofs_grab_cache_page_nowait(inode->i_mapping, index);
if (page) {
- if (PageUptodate(page)) {
+ if (PageUptodate(page))
unlock_page(page);
- } else {
- err = z_erofs_do_read_page(f, page);
- if (err)
- erofs_err(inode->i_sb,
- "readmore error at page %lu @ nid %llu",
- index, EROFS_I(inode)->nid);
- }
+ else
+ (void)z_erofs_do_read_page(f, page);
put_page(page);
}
@@ -1868,25 +1825,25 @@ static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f,
static int z_erofs_read_folio(struct file *file, struct folio *folio)
{
- struct page *page = &folio->page;
- struct inode *const inode = page->mapping->host;
+ struct inode *const inode = folio->mapping->host;
struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
int err;
- trace_erofs_readpage(page, false);
- f.headoffset = (erofs_off_t)page->index << PAGE_SHIFT;
+ trace_erofs_read_folio(folio, false);
+ f.headoffset = (erofs_off_t)folio->index << PAGE_SHIFT;
z_erofs_pcluster_readmore(&f, NULL, true);
- err = z_erofs_do_read_page(&f, page);
+ err = z_erofs_do_read_page(&f, &folio->page);
z_erofs_pcluster_readmore(&f, NULL, false);
- (void)z_erofs_collector_end(&f);
+ z_erofs_pcluster_end(&f);
/* if some compressed cluster ready, need submit them anyway */
z_erofs_runqueue(&f, z_erofs_is_sync_decompress(sbi, 0), false);
- if (err)
- erofs_err(inode->i_sb, "failed to read, err [%d]", err);
+ if (err && err != -EINTR)
+ erofs_err(inode->i_sb, "read error %d @ %lu of nid %llu",
+ err, folio->index, EROFS_I(inode)->nid);
erofs_put_metabuf(&f.map.buf);
erofs_release_pages(&f.pagepool);
@@ -1898,38 +1855,35 @@ static void z_erofs_readahead(struct readahead_control *rac)
struct inode *const inode = rac->mapping->host;
struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
- struct page *head = NULL, *page;
- unsigned int nr_pages;
+ struct folio *head = NULL, *folio;
+ unsigned int nr_folios;
+ int err;
f.headoffset = readahead_pos(rac);
z_erofs_pcluster_readmore(&f, rac, true);
- nr_pages = readahead_count(rac);
- trace_erofs_readpages(inode, readahead_index(rac), nr_pages, false);
+ nr_folios = readahead_count(rac);
+ trace_erofs_readpages(inode, readahead_index(rac), nr_folios, false);
- while ((page = readahead_page(rac))) {
- set_page_private(page, (unsigned long)head);
- head = page;
+ while ((folio = readahead_folio(rac))) {
+ folio->private = head;
+ head = folio;
}
+ /* traverse in reverse order for best metadata I/O performance */
while (head) {
- struct page *page = head;
- int err;
-
- /* traversal in reverse order */
- head = (void *)page_private(page);
+ folio = head;
+ head = folio_get_private(folio);
- err = z_erofs_do_read_page(&f, page);
- if (err)
- erofs_err(inode->i_sb,
- "readahead error at page %lu @ nid %llu",
- page->index, EROFS_I(inode)->nid);
- put_page(page);
+ err = z_erofs_do_read_page(&f, &folio->page);
+ if (err && err != -EINTR)
+ erofs_err(inode->i_sb, "readahead error at folio %lu @ nid %llu",
+ folio->index, EROFS_I(inode)->nid);
}
z_erofs_pcluster_readmore(&f, rac, false);
- (void)z_erofs_collector_end(&f);
+ z_erofs_pcluster_end(&f);
- z_erofs_runqueue(&f, z_erofs_is_sync_decompress(sbi, nr_pages), true);
+ z_erofs_runqueue(&f, z_erofs_is_sync_decompress(sbi, nr_folios), true);
erofs_put_metabuf(&f.map.buf);
erofs_release_pages(&f.pagepool);
}
diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c
index 1909ddafd9c7..7b55111fd533 100644
--- a/fs/erofs/zmap.c
+++ b/fs/erofs/zmap.c
@@ -561,8 +561,9 @@ static int z_erofs_do_map_blocks(struct inode *inode,
if ((flags & EROFS_GET_BLOCKS_FIEMAP) ||
((flags & EROFS_GET_BLOCKS_READMORE) &&
- map->m_algorithmformat == Z_EROFS_COMPRESSION_LZMA &&
- map->m_llen >= i_blocksize(inode))) {
+ (map->m_algorithmformat == Z_EROFS_COMPRESSION_LZMA ||
+ map->m_algorithmformat == Z_EROFS_COMPRESSION_DEFLATE) &&
+ map->m_llen >= i_blocksize(inode))) {
err = z_erofs_get_extent_decompressedlen(&m);
if (!err)
map->m_flags |= EROFS_MAP_FULL_MAPPED;
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 8aa36cd37351..33a918f9566c 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -189,7 +189,7 @@ void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt)
{
lockdep_assert_held(&ctx->wqh.lock);
- *cnt = (ctx->flags & EFD_SEMAPHORE) ? 1 : ctx->count;
+ *cnt = ((ctx->flags & EFD_SEMAPHORE) && ctx->count) ? 1 : ctx->count;
ctx->count -= *cnt;
}
EXPORT_SYMBOL_GPL(eventfd_ctx_do_read);
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 4b1b3362f697..1d9a71a0c4c1 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -975,15 +975,11 @@ again:
static int ep_alloc(struct eventpoll **pep)
{
- int error;
- struct user_struct *user;
struct eventpoll *ep;
- user = get_current_user();
- error = -ENOMEM;
ep = kzalloc(sizeof(*ep), GFP_KERNEL);
if (unlikely(!ep))
- goto free_uid;
+ return -ENOMEM;
mutex_init(&ep->mtx);
rwlock_init(&ep->lock);
@@ -992,16 +988,12 @@ static int ep_alloc(struct eventpoll **pep)
INIT_LIST_HEAD(&ep->rdllist);
ep->rbr = RB_ROOT_CACHED;
ep->ovflist = EP_UNACTIVE_PTR;
- ep->user = user;
+ ep->user = get_current_user();
refcount_set(&ep->refcount, 1);
*pep = ep;
return 0;
-
-free_uid:
- free_uid(user);
- return error;
}
/*
diff --git a/fs/exec.c b/fs/exec.c
index 1a827d55ba94..6518e33ea813 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -701,6 +701,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
if (vma != vma_next(&vmi))
return -EFAULT;
+ vma_iter_prev_range(&vmi);
/*
* cover the whole range: [new_start, old_end)
*/
@@ -1276,8 +1277,8 @@ int begin_new_exec(struct linux_binprm * bprm)
/*
* Must be called _before_ exec_mmap() as bprm->mm is
- * not visible until then. This also enables the update
- * to be lockless.
+ * not visible until then. Doing it here also ensures
+ * we don't race against replace_mm_exe_file().
*/
retval = set_mm_exe_file(bprm->mm, bprm->file);
if (retval)
diff --git a/fs/exfat/Kconfig b/fs/exfat/Kconfig
index 147edeb04469..cbeca8e44d9b 100644
--- a/fs/exfat/Kconfig
+++ b/fs/exfat/Kconfig
@@ -2,6 +2,7 @@
config EXFAT_FS
tristate "exFAT filesystem support"
+ select BUFFER_HEAD
select NLS
select LEGACY_DIRECT_IO
help
diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h
index 729ada9e26e8..f55498e5c23d 100644
--- a/fs/exfat/exfat_fs.h
+++ b/fs/exfat/exfat_fs.h
@@ -273,8 +273,6 @@ struct exfat_sb_info {
spinlock_t inode_hash_lock;
struct hlist_head inode_hashtable[EXFAT_HASH_SIZE];
-
- struct rcu_head rcu;
};
#define EXFAT_CACHE_VALID 0
diff --git a/fs/exfat/file.c b/fs/exfat/file.c
index 3cbd270e0cba..32395ef686a2 100644
--- a/fs/exfat/file.c
+++ b/fs/exfat/file.c
@@ -22,7 +22,7 @@ static int exfat_cont_expand(struct inode *inode, loff_t size)
if (err)
return err;
- inode->i_ctime = inode->i_mtime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
mark_inode_dirty(inode);
if (!IS_SYNC(inode))
@@ -232,7 +232,7 @@ int exfat_getattr(struct mnt_idmap *idmap, const struct path *path,
struct inode *inode = d_backing_inode(path->dentry);
struct exfat_inode_info *ei = EXFAT_I(inode);
- generic_fillattr(&nop_mnt_idmap, inode, stat);
+ generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
exfat_truncate_atime(&stat->atime);
stat->result_mask |= STATX_BTIME;
stat->btime.tv_sec = ei->i_crtime.tv_sec;
@@ -290,7 +290,7 @@ int exfat_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
}
if (attr->ia_valid & ATTR_SIZE)
- inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
setattr_copy(&nop_mnt_idmap, inode, attr);
exfat_truncate_atime(&inode->i_atime);
diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c
index 481dd338f2b8..13329baeafbc 100644
--- a/fs/exfat/inode.c
+++ b/fs/exfat/inode.c
@@ -355,7 +355,7 @@ static void exfat_write_failed(struct address_space *mapping, loff_t to)
if (to > i_size_read(inode)) {
truncate_pagecache(inode, i_size_read(inode));
- inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
exfat_truncate(inode);
}
}
@@ -398,7 +398,7 @@ static int exfat_write_end(struct file *file, struct address_space *mapping,
exfat_write_failed(mapping, pos+len);
if (!(err < 0) && !(ei->attr & ATTR_ARCHIVE)) {
- inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
ei->attr |= ATTR_ARCHIVE;
mark_inode_dirty(inode);
}
@@ -577,7 +577,7 @@ static int exfat_fill_inode(struct inode *inode, struct exfat_dir_entry *info)
inode->i_blocks = round_up(i_size_read(inode), sbi->cluster_size) >> 9;
inode->i_mtime = info->mtime;
- inode->i_ctime = info->mtime;
+ inode_set_ctime_to_ts(inode, info->mtime);
ei->i_crtime = info->crtime;
inode->i_atime = info->atime;
diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c
index e0ff9d156f6f..1b9f587f6cca 100644
--- a/fs/exfat/namei.c
+++ b/fs/exfat/namei.c
@@ -569,7 +569,7 @@ static int exfat_create(struct mnt_idmap *idmap, struct inode *dir,
goto unlock;
inode_inc_iversion(dir);
- dir->i_ctime = dir->i_mtime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
if (IS_DIRSYNC(dir))
exfat_sync_inode(dir);
else
@@ -582,8 +582,7 @@ static int exfat_create(struct mnt_idmap *idmap, struct inode *dir,
goto unlock;
inode_inc_iversion(inode);
- inode->i_mtime = inode->i_atime = inode->i_ctime =
- EXFAT_I(inode)->i_crtime = current_time(inode);
+ inode->i_mtime = inode->i_atime = EXFAT_I(inode)->i_crtime = inode_set_ctime_current(inode);
exfat_truncate_atime(&inode->i_atime);
/* timestamp is already written, so mark_inode_dirty() is unneeded. */
@@ -817,7 +816,7 @@ static int exfat_unlink(struct inode *dir, struct dentry *dentry)
ei->dir.dir = DIR_DELETED;
inode_inc_iversion(dir);
- dir->i_mtime = dir->i_atime = current_time(dir);
+ dir->i_mtime = dir->i_atime = inode_set_ctime_current(dir);
exfat_truncate_atime(&dir->i_atime);
if (IS_DIRSYNC(dir))
exfat_sync_inode(dir);
@@ -825,7 +824,7 @@ static int exfat_unlink(struct inode *dir, struct dentry *dentry)
mark_inode_dirty(dir);
clear_nlink(inode);
- inode->i_mtime = inode->i_atime = current_time(inode);
+ inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
exfat_truncate_atime(&inode->i_atime);
exfat_unhash_inode(inode);
exfat_d_version_set(dentry, inode_query_iversion(dir));
@@ -852,7 +851,7 @@ static int exfat_mkdir(struct mnt_idmap *idmap, struct inode *dir,
goto unlock;
inode_inc_iversion(dir);
- dir->i_ctime = dir->i_mtime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
if (IS_DIRSYNC(dir))
exfat_sync_inode(dir);
else
@@ -866,8 +865,7 @@ static int exfat_mkdir(struct mnt_idmap *idmap, struct inode *dir,
goto unlock;
inode_inc_iversion(inode);
- inode->i_mtime = inode->i_atime = inode->i_ctime =
- EXFAT_I(inode)->i_crtime = current_time(inode);
+ inode->i_mtime = inode->i_atime = EXFAT_I(inode)->i_crtime = inode_set_ctime_current(inode);
exfat_truncate_atime(&inode->i_atime);
/* timestamp is already written, so mark_inode_dirty() is unneeded. */
@@ -979,7 +977,7 @@ static int exfat_rmdir(struct inode *dir, struct dentry *dentry)
ei->dir.dir = DIR_DELETED;
inode_inc_iversion(dir);
- dir->i_mtime = dir->i_atime = current_time(dir);
+ dir->i_mtime = dir->i_atime = inode_set_ctime_current(dir);
exfat_truncate_atime(&dir->i_atime);
if (IS_DIRSYNC(dir))
exfat_sync_inode(dir);
@@ -988,7 +986,7 @@ static int exfat_rmdir(struct inode *dir, struct dentry *dentry)
drop_nlink(dir);
clear_nlink(inode);
- inode->i_mtime = inode->i_atime = current_time(inode);
+ inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
exfat_truncate_atime(&inode->i_atime);
exfat_unhash_inode(inode);
exfat_d_version_set(dentry, inode_query_iversion(dir));
@@ -1312,8 +1310,8 @@ static int exfat_rename(struct mnt_idmap *idmap,
goto unlock;
inode_inc_iversion(new_dir);
- new_dir->i_ctime = new_dir->i_mtime = new_dir->i_atime =
- EXFAT_I(new_dir)->i_crtime = current_time(new_dir);
+ simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
+ EXFAT_I(new_dir)->i_crtime = current_time(new_dir);
exfat_truncate_atime(&new_dir->i_atime);
if (IS_DIRSYNC(new_dir))
exfat_sync_inode(new_dir);
@@ -1336,7 +1334,6 @@ static int exfat_rename(struct mnt_idmap *idmap,
}
inode_inc_iversion(old_dir);
- old_dir->i_ctime = old_dir->i_mtime = current_time(old_dir);
if (IS_DIRSYNC(old_dir))
exfat_sync_inode(old_dir);
else
@@ -1354,8 +1351,7 @@ static int exfat_rename(struct mnt_idmap *idmap,
exfat_warn(sb, "abnormal access to an inode dropped");
WARN_ON(new_inode->i_nlink == 0);
}
- new_inode->i_ctime = EXFAT_I(new_inode)->i_crtime =
- current_time(new_inode);
+ EXFAT_I(new_inode)->i_crtime = current_time(new_inode);
}
unlock:
diff --git a/fs/exfat/super.c b/fs/exfat/super.c
index 8c32460e031e..2778bd9b631e 100644
--- a/fs/exfat/super.c
+++ b/fs/exfat/super.c
@@ -31,16 +31,6 @@ static void exfat_free_iocharset(struct exfat_sb_info *sbi)
kfree(sbi->options.iocharset);
}
-static void exfat_delayed_free(struct rcu_head *p)
-{
- struct exfat_sb_info *sbi = container_of(p, struct exfat_sb_info, rcu);
-
- unload_nls(sbi->nls_io);
- exfat_free_iocharset(sbi);
- exfat_free_upcase_table(sbi);
- kfree(sbi);
-}
-
static void exfat_put_super(struct super_block *sb)
{
struct exfat_sb_info *sbi = EXFAT_SB(sb);
@@ -50,7 +40,8 @@ static void exfat_put_super(struct super_block *sb)
brelse(sbi->boot_bh);
mutex_unlock(&sbi->s_lock);
- call_rcu(&sbi->rcu, exfat_delayed_free);
+ unload_nls(sbi->nls_io);
+ exfat_free_upcase_table(sbi);
}
static int exfat_sync_fs(struct super_block *sb, int wait)
@@ -379,8 +370,7 @@ static int exfat_read_root(struct inode *inode)
ei->i_size_ondisk = i_size_read(inode);
exfat_save_attr(inode, ATTR_SUBDIR);
- inode->i_mtime = inode->i_atime = inode->i_ctime = ei->i_crtime =
- current_time(inode);
+ inode->i_mtime = inode->i_atime = ei->i_crtime = inode_set_ctime_current(inode);
exfat_truncate_atime(&inode->i_atime);
return 0;
}
@@ -710,9 +700,6 @@ free_table:
check_nls_io:
unload_nls(sbi->nls_io);
- exfat_free_iocharset(sbi);
- sb->s_fs_info = NULL;
- kfree(sbi);
return err;
}
@@ -721,14 +708,18 @@ static int exfat_get_tree(struct fs_context *fc)
return get_tree_bdev(fc, exfat_fill_super);
}
+static void exfat_free_sbi(struct exfat_sb_info *sbi)
+{
+ exfat_free_iocharset(sbi);
+ kfree(sbi);
+}
+
static void exfat_free(struct fs_context *fc)
{
struct exfat_sb_info *sbi = fc->s_fs_info;
- if (sbi) {
- exfat_free_iocharset(sbi);
- kfree(sbi);
- }
+ if (sbi)
+ exfat_free_sbi(sbi);
}
static int exfat_reconfigure(struct fs_context *fc)
@@ -773,12 +764,21 @@ static int exfat_init_fs_context(struct fs_context *fc)
return 0;
}
+static void exfat_kill_sb(struct super_block *sb)
+{
+ struct exfat_sb_info *sbi = sb->s_fs_info;
+
+ kill_block_super(sb);
+ if (sbi)
+ exfat_free_sbi(sbi);
+}
+
static struct file_system_type exfat_fs_type = {
.owner = THIS_MODULE,
.name = "exfat",
.init_fs_context = exfat_init_fs_context,
.parameters = exfat_parameters,
- .kill_sb = kill_block_super,
+ .kill_sb = exfat_kill_sb,
.fs_flags = FS_REQUIRES_DEV,
};
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index d1dbe47c7975..c20704aa21b3 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -386,6 +386,7 @@ static int export_encode_fh(struct inode *inode, struct fid *fid,
* @inode: the object to encode
* @fid: where to store the file handle fragment
* @max_len: maximum length to store there
+ * @parent: parent directory inode, if wanted
* @flags: properties of the requested file handle
*
* Returns an enum fid_type or a negative errno.
diff --git a/fs/ext2/Kconfig b/fs/ext2/Kconfig
index 77393fda99af..74d98965902e 100644
--- a/fs/ext2/Kconfig
+++ b/fs/ext2/Kconfig
@@ -1,6 +1,7 @@
# SPDX-License-Identifier: GPL-2.0-only
config EXT2_FS
tristate "Second extended fs support"
+ select BUFFER_HEAD
select FS_IOMAP
select LEGACY_DIRECT_IO
help
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index 82b17d7fc93f..7e54c31589c7 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -237,7 +237,7 @@ ext2_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
error = __ext2_set_acl(inode, acl, type);
if (!error && update_mode) {
inode->i_mode = mode;
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
mark_inode_dirty(inode);
}
return error;
diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c
index eca60b747c6b..e124f3d709b2 100644
--- a/fs/ext2/balloc.c
+++ b/fs/ext2/balloc.c
@@ -36,8 +36,6 @@
*/
-#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
-
struct ext2_group_desc * ext2_get_group_desc(struct super_block * sb,
unsigned int block_group,
struct buffer_head ** bh)
@@ -474,8 +472,8 @@ void ext2_discard_reservation(struct inode *inode)
* @block: start physical block to free
* @count: number of blocks to free
*/
-void ext2_free_blocks (struct inode * inode, unsigned long block,
- unsigned long count)
+void ext2_free_blocks(struct inode * inode, ext2_fsblk_t block,
+ unsigned long count)
{
struct buffer_head *bitmap_bh = NULL;
struct buffer_head * bh2;
@@ -718,36 +716,34 @@ fail_access:
}
/**
- * find_next_reservable_window():
- * find a reservable space within the given range.
- * It does not allocate the reservation window for now:
- * alloc_new_reservation() will do the work later.
- *
- * @search_head: the head of the searching list;
- * This is not necessarily the list head of the whole filesystem
+ * find_next_reservable_window - Find a reservable space within the given range.
+ * @search_head: The list to search.
+ * @my_rsv: The reservation we're currently using.
+ * @sb: The super block.
+ * @start_block: The first block we consider to start the real search from
+ * @last_block: The maximum block number that our goal reservable space
+ * could start from.
*
- * We have both head and start_block to assist the search
- * for the reservable space. The list starts from head,
- * but we will shift to the place where start_block is,
- * then start from there, when looking for a reservable space.
+ * It does not allocate the reservation window: alloc_new_reservation()
+ * will do the work later.
*
- * @sb: the super block.
+ * We search the given range, rather than the whole reservation double
+ * linked list, (start_block, last_block) to find a free region that is
+ * of my size and has not been reserved.
*
- * @start_block: the first block we consider to start the real search from
+ * @search_head is not necessarily the list head of the whole filesystem.
+ * We have both head and @start_block to assist the search for the
+ * reservable space. The list starts from head, but we will shift to
+ * the place where start_block is, then start from there, when looking
+ * for a reservable space.
*
- * @last_block:
- * the maximum block number that our goal reservable space
- * could start from. This is normally the last block in this
- * group. The search will end when we found the start of next
- * possible reservable space is out of this boundary.
- * This could handle the cross boundary reservation window
- * request.
- *
- * basically we search from the given range, rather than the whole
- * reservation double linked list, (start_block, last_block)
- * to find a free region that is of my size and has not
- * been reserved.
+ * @last_block is normally the last block in this group. The search will end
+ * when we found the start of next possible reservable space is out
+ * of this boundary. This could handle the cross boundary reservation
+ * window request.
*
+ * Return: -1 if we could not find a range of sufficient size. If we could,
+ * return 0 and fill in @my_rsv with the range information.
*/
static int find_next_reservable_window(
struct ext2_reserve_window_node *search_head,
@@ -835,41 +831,34 @@ static int find_next_reservable_window(
}
/**
- * alloc_new_reservation()--allocate a new reservation window
- *
- * To make a new reservation, we search part of the filesystem
- * reservation list (the list that inside the group). We try to
- * allocate a new reservation window near the allocation goal,
- * or the beginning of the group, if there is no goal.
+ * alloc_new_reservation - Allocate a new reservation window.
+ * @my_rsv: The reservation we're currently using.
+ * @grp_goal: The goal block relative to the start of the group.
+ * @sb: The super block.
+ * @group: The group we are trying to allocate in.
+ * @bitmap_bh: The block group block bitmap.
*
- * We first find a reservable space after the goal, then from
- * there, we check the bitmap for the first free block after
- * it. If there is no free block until the end of group, then the
- * whole group is full, we failed. Otherwise, check if the free
- * block is inside the expected reservable space, if so, we
- * succeed.
- * If the first free block is outside the reservable space, then
- * start from the first free block, we search for next available
- * space, and go on.
+ * To make a new reservation, we search part of the filesystem reservation
+ * list (the list inside the group). We try to allocate a new
+ * reservation window near @grp_goal, or the beginning of the
+ * group, if @grp_goal is negative.
*
- * on succeed, a new reservation will be found and inserted into the list
- * It contains at least one free block, and it does not overlap with other
- * reservation windows.
+ * We first find a reservable space after the goal, then from there,
+ * we check the bitmap for the first free block after it. If there is
+ * no free block until the end of group, then the whole group is full,
+ * we failed. Otherwise, check if the free block is inside the expected
+ * reservable space, if so, we succeed.
*
- * failed: we failed to find a reservation window in this group
+ * If the first free block is outside the reservable space, then start
+ * from the first free block, we search for next available space, and
+ * go on.
*
- * @my_rsv: the reservation
- *
- * @grp_goal: The goal (group-relative). It is where the search for a
- * free reservable space should start from.
- * if we have a goal(goal >0 ), then start from there,
- * no goal(goal = -1), we start from the first block
- * of the group.
- *
- * @sb: the super block
- * @group: the group we are trying to allocate in
- * @bitmap_bh: the block group block bitmap
+ * on succeed, a new reservation will be found and inserted into the
+ * list. It contains at least one free block, and it does not overlap
+ * with other reservation windows.
*
+ * Return: 0 on success, -1 if we failed to find a reservation window
+ * in this group
*/
static int alloc_new_reservation(struct ext2_reserve_window_node *my_rsv,
ext2_grpblk_t grp_goal, struct super_block *sb,
@@ -1133,8 +1122,13 @@ ext2_try_to_allocate_with_rsv(struct super_block *sb, unsigned int group,
if ((my_rsv->rsv_start > group_last_block) ||
(my_rsv->rsv_end < group_first_block)) {
+ ext2_error(sb, __func__,
+ "Reservation out of group %u range goal %d fsb[%lu,%lu] rsv[%lu, %lu]",
+ group, grp_goal, group_first_block,
+ group_last_block, my_rsv->rsv_start,
+ my_rsv->rsv_end);
rsv_window_dump(&EXT2_SB(sb)->s_rsv_window_root, 1);
- BUG();
+ return -1;
}
ret = ext2_try_to_allocate(sb, group, bitmap_bh, grp_goal,
&num, &my_rsv->rsv_window);
@@ -1195,6 +1189,7 @@ int ext2_data_block_valid(struct ext2_sb_info *sbi, ext2_fsblk_t start_blk,
* @goal: given target block(filesystem wide)
* @count: target number of blocks to allocate
* @errp: error code
+ * @flags: allocate flags
*
* ext2_new_blocks uses a goal block to assist allocation. If the goal is
* free, or there is a free block within 32 blocks of the goal, that block
@@ -1204,7 +1199,7 @@ int ext2_data_block_valid(struct ext2_sb_info *sbi, ext2_fsblk_t start_blk,
* This function also updates quota and i_blocks field.
*/
ext2_fsblk_t ext2_new_blocks(struct inode *inode, ext2_fsblk_t goal,
- unsigned long *count, int *errp)
+ unsigned long *count, int *errp, unsigned int flags)
{
struct buffer_head *bitmap_bh = NULL;
struct buffer_head *gdp_bh;
@@ -1243,15 +1238,15 @@ ext2_fsblk_t ext2_new_blocks(struct inode *inode, ext2_fsblk_t goal,
es = EXT2_SB(sb)->s_es;
ext2_debug("goal=%lu.\n", goal);
/*
- * Allocate a block from reservation only when
- * filesystem is mounted with reservation(default,-o reservation), and
- * it's a regular file, and
- * the desired window size is greater than 0 (One could use ioctl
- * command EXT2_IOC_SETRSVSZ to set the window size to 0 to turn off
- * reservation on that particular file)
+ * Allocate a block from reservation only when the filesystem is
+ * mounted with reservation(default,-o reservation), and it's a regular
+ * file, and the desired window size is greater than 0 (One could use
+ * ioctl command EXT2_IOC_SETRSVSZ to set the window size to 0 to turn
+ * off reservation on that particular file). Also do not use the
+ * reservation window if the caller asked us not to do it.
*/
block_i = EXT2_I(inode)->i_block_alloc_info;
- if (block_i) {
+ if (!(flags & EXT2_ALLOC_NORESERVE) && block_i) {
windowsz = block_i->rsv_window_node.rsv_goal_size;
if (windowsz > 0)
my_rsv = &block_i->rsv_window_node;
@@ -1431,13 +1426,6 @@ out:
return 0;
}
-ext2_fsblk_t ext2_new_block(struct inode *inode, unsigned long goal, int *errp)
-{
- unsigned long count = 1;
-
- return ext2_new_blocks(inode, goal, &count, errp);
-}
-
#ifdef EXT2FS_DEBUG
unsigned long ext2_count_free(struct buffer_head *map, unsigned int numchars)
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 42db804794bd..b335f17f682f 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -468,7 +468,7 @@ int ext2_set_link(struct inode *dir, struct ext2_dir_entry_2 *de,
ext2_set_de_type(de, inode);
ext2_commit_chunk(page, pos, len);
if (update_times)
- dir->i_mtime = dir->i_ctime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
EXT2_I(dir)->i_flags &= ~EXT2_BTREE_FL;
mark_inode_dirty(dir);
return ext2_handle_dirsync(dir);
@@ -555,7 +555,7 @@ got_it:
de->inode = cpu_to_le32(inode->i_ino);
ext2_set_de_type (de, inode);
ext2_commit_chunk(page, pos, rec_len);
- dir->i_mtime = dir->i_ctime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
EXT2_I(dir)->i_flags &= ~EXT2_BTREE_FL;
mark_inode_dirty(dir);
err = ext2_handle_dirsync(dir);
@@ -606,7 +606,7 @@ int ext2_delete_entry(struct ext2_dir_entry_2 *dir, struct page *page)
pde->rec_len = ext2_rec_len_to_disk(to - from);
dir->inode = 0;
ext2_commit_chunk(page, pos, to - from);
- inode->i_ctime = inode->i_mtime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
EXT2_I(inode)->i_flags &= ~EXT2_BTREE_FL;
mark_inode_dirty(inode);
return ext2_handle_dirsync(inode);
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 35a041c47c38..7fdd685c384d 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -399,6 +399,12 @@ struct ext2_inode {
#define EXT2_ERRORS_DEFAULT EXT2_ERRORS_CONTINUE
/*
+ * Allocation flags
+ */
+#define EXT2_ALLOC_NORESERVE 0x1 /* Do not use reservation
+ * window for allocation */
+
+/*
* Structure of the super block
*/
struct ext2_super_block {
@@ -695,13 +701,11 @@ static inline struct ext2_inode_info *EXT2_I(struct inode *inode)
/* balloc.c */
extern int ext2_bg_has_super(struct super_block *sb, int group);
extern unsigned long ext2_bg_num_gdb(struct super_block *sb, int group);
-extern ext2_fsblk_t ext2_new_block(struct inode *, unsigned long, int *);
-extern ext2_fsblk_t ext2_new_blocks(struct inode *, unsigned long,
- unsigned long *, int *);
+extern ext2_fsblk_t ext2_new_blocks(struct inode *, ext2_fsblk_t,
+ unsigned long *, int *, unsigned int);
extern int ext2_data_block_valid(struct ext2_sb_info *sbi, ext2_fsblk_t start_blk,
unsigned int count);
-extern void ext2_free_blocks (struct inode *, unsigned long,
- unsigned long);
+extern void ext2_free_blocks(struct inode *, ext2_fsblk_t, unsigned long);
extern unsigned long ext2_count_free_blocks (struct super_block *);
extern unsigned long ext2_count_dirs (struct super_block *);
extern struct ext2_group_desc * ext2_get_group_desc(struct super_block * sb,
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 0b4c91c62e1f..1039e5bf90af 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -103,7 +103,7 @@ static vm_fault_t ext2_dax_fault(struct vm_fault *vmf)
}
filemap_invalidate_lock_shared(inode->i_mapping);
- ret = dax_iomap_fault(vmf, PE_SIZE_PTE, NULL, NULL, &ext2_iomap_ops);
+ ret = dax_iomap_fault(vmf, 0, NULL, NULL, &ext2_iomap_ops);
filemap_invalidate_unlock_shared(inode->i_mapping);
if (write)
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index a4e1d7a9c544..c24d0de95a83 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -273,7 +273,6 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent)
if ((parent == d_inode(sb->s_root)) ||
(EXT2_I(parent)->i_flags & EXT2_TOPDIR_FL)) {
- struct ext2_group_desc *best_desc = NULL;
int best_ndir = inodes_per_group;
int best_group = -1;
@@ -291,10 +290,8 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent)
continue;
best_group = group;
best_ndir = le16_to_cpu(desc->bg_used_dirs_count);
- best_desc = desc;
}
if (best_group >= 0) {
- desc = best_desc;
group = best_group;
goto found;
}
@@ -549,7 +546,7 @@ got:
inode->i_ino = ino;
inode->i_blocks = 0;
- inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
memset(ei->i_data, 0, sizeof(ei->i_data));
ei->i_flags =
ext2_mask_flags(mode, EXT2_I(dir)->i_flags & EXT2_FL_INHERITED);
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 75983215c7a1..314b415ee518 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -385,12 +385,16 @@ ext2_blks_to_allocate(Indirect * branch, int k, unsigned long blks,
}
/**
- * ext2_alloc_blocks: multiple allocate blocks needed for a branch
- * @indirect_blks: the number of blocks need to allocate for indirect
- * blocks
- * @blks: the number of blocks need to allocate for direct blocks
- * @new_blocks: on return it will store the new block numbers for
- * the indirect blocks(if needed) and the first direct block,
+ * ext2_alloc_blocks: Allocate multiple blocks needed for a branch.
+ * @inode: Owner.
+ * @goal: Preferred place for allocation.
+ * @indirect_blks: The number of blocks needed to allocate for indirect blocks.
+ * @blks: The number of blocks need to allocate for direct blocks.
+ * @new_blocks: On return it will store the new block numbers for
+ * the indirect blocks(if needed) and the first direct block.
+ * @err: Error pointer.
+ *
+ * Return: Number of blocks allocated.
*/
static int ext2_alloc_blocks(struct inode *inode,
ext2_fsblk_t goal, int indirect_blks, int blks,
@@ -415,7 +419,7 @@ static int ext2_alloc_blocks(struct inode *inode,
while (1) {
count = target;
/* allocating blocks for indirect blocks and direct blocks */
- current_block = ext2_new_blocks(inode,goal,&count,err);
+ current_block = ext2_new_blocks(inode, goal, &count, err, 0);
if (*err)
goto failed_out;
@@ -595,7 +599,7 @@ static void ext2_splice_branch(struct inode *inode,
if (where->bh)
mark_buffer_dirty_inode(where->bh, inode);
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
mark_inode_dirty(inode);
}
@@ -1082,8 +1086,8 @@ no_top:
*/
static inline void ext2_free_data(struct inode *inode, __le32 *p, __le32 *q)
{
- unsigned long block_to_free = 0, count = 0;
- unsigned long nr;
+ ext2_fsblk_t block_to_free = 0, count = 0;
+ ext2_fsblk_t nr;
for ( ; p < q ; p++) {
nr = le32_to_cpu(*p);
@@ -1123,7 +1127,7 @@ static inline void ext2_free_data(struct inode *inode, __le32 *p, __le32 *q)
static void ext2_free_branches(struct inode *inode, __le32 *p, __le32 *q, int depth)
{
struct buffer_head * bh;
- unsigned long nr;
+ ext2_fsblk_t nr;
if (depth--) {
int addr_per_block = EXT2_ADDR_PER_BLOCK(inode->i_sb);
@@ -1287,7 +1291,7 @@ static int ext2_setsize(struct inode *inode, loff_t newsize)
__ext2_truncate_blocks(inode, newsize);
filemap_invalidate_unlock(inode->i_mapping);
- inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
if (inode_needs_sync(inode)) {
sync_mapping_buffers(inode->i_mapping);
sync_inode_metadata(inode, 1);
@@ -1409,9 +1413,9 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino)
set_nlink(inode, le16_to_cpu(raw_inode->i_links_count));
inode->i_size = le32_to_cpu(raw_inode->i_size);
inode->i_atime.tv_sec = (signed)le32_to_cpu(raw_inode->i_atime);
- inode->i_ctime.tv_sec = (signed)le32_to_cpu(raw_inode->i_ctime);
+ inode_set_ctime(inode, (signed)le32_to_cpu(raw_inode->i_ctime), 0);
inode->i_mtime.tv_sec = (signed)le32_to_cpu(raw_inode->i_mtime);
- inode->i_atime.tv_nsec = inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec = 0;
+ inode->i_atime.tv_nsec = inode->i_mtime.tv_nsec = 0;
ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
/* We now have enough fields to check if the inode was active or not.
* This is needed because nfsd might try to access dead inodes
@@ -1541,7 +1545,7 @@ static int __ext2_write_inode(struct inode *inode, int do_sync)
raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
raw_inode->i_size = cpu_to_le32(inode->i_size);
raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
- raw_inode->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
+ raw_inode->i_ctime = cpu_to_le32(inode_get_ctime(inode).tv_sec);
raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
raw_inode->i_blocks = cpu_to_le32(inode->i_blocks);
@@ -1628,7 +1632,7 @@ int ext2_getattr(struct mnt_idmap *idmap, const struct path *path,
STATX_ATTR_IMMUTABLE |
STATX_ATTR_NODUMP);
- generic_fillattr(&nop_mnt_idmap, inode, stat);
+ generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
return 0;
}
diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c
index cc87d413eb43..44e04484e570 100644
--- a/fs/ext2/ioctl.c
+++ b/fs/ext2/ioctl.c
@@ -44,7 +44,7 @@ int ext2_fileattr_set(struct mnt_idmap *idmap,
(fa->flags & EXT2_FL_USER_MODIFIABLE);
ext2_set_inode_flags(inode);
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
mark_inode_dirty(inode);
return 0;
@@ -77,7 +77,7 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
}
inode_lock(inode);
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
inode->i_generation = generation;
inode_unlock(inode);
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 937dd8f60f96..059517068adc 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -211,7 +211,7 @@ static int ext2_link (struct dentry * old_dentry, struct inode * dir,
if (err)
return err;
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
inode_inc_link_count(inode);
ihold(inode);
@@ -291,7 +291,7 @@ static int ext2_unlink(struct inode *dir, struct dentry *dentry)
if (err)
goto out;
- inode->i_ctime = dir->i_ctime;
+ inode_set_ctime_to_ts(inode, inode_get_ctime(dir));
inode_dec_link_count(inode);
err = 0;
out:
@@ -367,7 +367,7 @@ static int ext2_rename (struct mnt_idmap * idmap,
ext2_put_page(new_page, new_de);
if (err)
goto out_dir;
- new_inode->i_ctime = current_time(new_inode);
+ inode_set_ctime_current(new_inode);
if (dir_de)
drop_nlink(new_inode);
inode_dec_link_count(new_inode);
@@ -383,7 +383,7 @@ static int ext2_rename (struct mnt_idmap * idmap,
* Like most other Unix systems, set the ctime for inodes on a
* rename.
*/
- old_inode->i_ctime = current_time(old_inode);
+ inode_set_ctime_current(old_inode);
mark_inode_dirty(old_inode);
err = ext2_delete_entry(old_de, old_page);
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 2959afc7541c..aaf3e3e88cb2 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -1572,7 +1572,7 @@ out:
if (inode->i_size < off+len-towrite)
i_size_write(inode, off+len-towrite);
inode_inc_iversion(inode);
- inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
mark_inode_dirty(inode);
return len - towrite;
}
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index 8906ba479aaf..20f741184673 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -742,10 +742,13 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
/* We need to allocate a new block */
ext2_fsblk_t goal = ext2_group_first_block_no(sb,
EXT2_I(inode)->i_block_group);
- int block = ext2_new_block(inode, goal, &error);
+ unsigned long count = 1;
+ ext2_fsblk_t block = ext2_new_blocks(inode, goal,
+ &count, &error,
+ EXT2_ALLOC_NORESERVE);
if (error)
goto cleanup;
- ea_idebug(inode, "creating block %d", block);
+ ea_idebug(inode, "creating block %lu", block);
new_bh = sb_getblk(sb, block);
if (unlikely(!new_bh)) {
@@ -773,7 +776,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
/* Update the inode. */
EXT2_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0;
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
if (IS_SYNC(inode)) {
error = sync_inode_metadata(inode, 1);
/* In case sync failed due to ENOSPC the inode was actually
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index 86699c8cab28..e20d59221fc0 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -28,6 +28,7 @@ config EXT3_FS_SECURITY
config EXT4_FS
tristate "The Extended 4 (ext4) filesystem"
+ select BUFFER_HEAD
select JBD2
select CRC16
select CRYPTO
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index 27fcbddfb148..3bffe862f954 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -259,7 +259,7 @@ retry:
error = __ext4_set_acl(handle, inode, type, acl, 0 /* xattr_flags */);
if (!error && update_mode) {
inode->i_mode = mode;
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
error = ext4_mark_inode_dirty(handle, inode);
}
out_stop:
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 1f72f977c6db..79b20d6ae39e 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -913,11 +913,11 @@ unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group)
}
/*
- * This function returns the number of file system metadata clusters at
+ * This function returns the number of file system metadata blocks at
* the beginning of a block group, including the reserved gdt blocks.
*/
-static unsigned ext4_num_base_meta_clusters(struct super_block *sb,
- ext4_group_t block_group)
+unsigned int ext4_num_base_meta_blocks(struct super_block *sb,
+ ext4_group_t block_group)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
unsigned num;
@@ -935,8 +935,15 @@ static unsigned ext4_num_base_meta_clusters(struct super_block *sb,
} else { /* For META_BG_BLOCK_GROUPS */
num += ext4_bg_num_gdb_meta(sb, block_group);
}
- return EXT4_NUM_B2C(sbi, num);
+ return num;
}
+
+static unsigned int ext4_num_base_meta_clusters(struct super_block *sb,
+ ext4_group_t block_group)
+{
+ return EXT4_NUM_B2C(EXT4_SB(sb), ext4_num_base_meta_blocks(sb, block_group));
+}
+
/**
* ext4_inode_to_goal_block - return a hint for block allocation
* @inode: inode for block allocation
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 5504f72bbbbe..6fe3c941b565 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -215,7 +215,6 @@ int ext4_setup_system_zone(struct super_block *sb)
struct ext4_system_blocks *system_blks;
struct ext4_group_desc *gdp;
ext4_group_t i;
- int flex_size = ext4_flex_bg_size(sbi);
int ret;
system_blks = kzalloc(sizeof(*system_blks), GFP_KERNEL);
@@ -223,12 +222,13 @@ int ext4_setup_system_zone(struct super_block *sb)
return -ENOMEM;
for (i=0; i < ngroups; i++) {
+ unsigned int meta_blks = ext4_num_base_meta_blocks(sb, i);
+
cond_resched();
- if (ext4_bg_has_super(sb, i) &&
- ((i < 5) || ((i % flex_size) == 0))) {
+ if (meta_blks != 0) {
ret = add_system_zone(system_blks,
ext4_group_first_block_no(sb, i),
- ext4_bg_num_gdb(sb, i) + 1, 0);
+ meta_blks, 0);
if (ret)
goto err;
}
diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c
index e20ac0654b3f..453d4da5de52 100644
--- a/fs/ext4/crypto.c
+++ b/fs/ext4/crypto.c
@@ -33,6 +33,8 @@ int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname,
#if IS_ENABLED(CONFIG_UNICODE)
err = ext4_fname_setup_ci_filename(dir, iname, fname);
+ if (err)
+ ext4_fname_free_filename(fname);
#endif
return err;
}
@@ -51,6 +53,8 @@ int ext4_fname_prepare_lookup(struct inode *dir, struct dentry *dentry,
#if IS_ENABLED(CONFIG_UNICODE)
err = ext4_fname_setup_ci_filename(dir, &dentry->d_name, fname);
+ if (err)
+ ext4_fname_free_filename(fname);
#endif
return err;
}
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 0a2d55faa095..9418359b1d9d 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -176,9 +176,6 @@ enum criteria {
EXT4_MB_NUM_CRS
};
-/* criteria below which we use fast block scanning and avoid unnecessary IO */
-#define CR_FAST CR_GOAL_LEN_SLOW
-
/*
* Flags used in mballoc's allocation_context flags field.
*
@@ -868,64 +865,70 @@ struct ext4_inode {
* affected filesystem before 2242.
*/
-static inline __le32 ext4_encode_extra_time(struct timespec64 *time)
+static inline __le32 ext4_encode_extra_time(struct timespec64 ts)
{
- u32 extra =((time->tv_sec - (s32)time->tv_sec) >> 32) & EXT4_EPOCH_MASK;
- return cpu_to_le32(extra | (time->tv_nsec << EXT4_EPOCH_BITS));
+ u32 extra = ((ts.tv_sec - (s32)ts.tv_sec) >> 32) & EXT4_EPOCH_MASK;
+ return cpu_to_le32(extra | (ts.tv_nsec << EXT4_EPOCH_BITS));
}
-static inline void ext4_decode_extra_time(struct timespec64 *time,
- __le32 extra)
+static inline struct timespec64 ext4_decode_extra_time(__le32 base,
+ __le32 extra)
{
+ struct timespec64 ts = { .tv_sec = (signed)le32_to_cpu(base) };
+
if (unlikely(extra & cpu_to_le32(EXT4_EPOCH_MASK)))
- time->tv_sec += (u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK) << 32;
- time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS;
+ ts.tv_sec += (u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK) << 32;
+ ts.tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS;
+ return ts;
}
-#define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode) \
+#define EXT4_INODE_SET_XTIME_VAL(xtime, inode, raw_inode, ts) \
do { \
- if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) {\
- (raw_inode)->xtime = cpu_to_le32((inode)->xtime.tv_sec); \
- (raw_inode)->xtime ## _extra = \
- ext4_encode_extra_time(&(inode)->xtime); \
- } \
- else \
- (raw_inode)->xtime = cpu_to_le32(clamp_t(int32_t, (inode)->xtime.tv_sec, S32_MIN, S32_MAX)); \
+ if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) { \
+ (raw_inode)->xtime = cpu_to_le32((ts).tv_sec); \
+ (raw_inode)->xtime ## _extra = ext4_encode_extra_time(ts); \
+ } else \
+ (raw_inode)->xtime = cpu_to_le32(clamp_t(int32_t, (ts).tv_sec, S32_MIN, S32_MAX)); \
} while (0)
-#define EXT4_EINODE_SET_XTIME(xtime, einode, raw_inode) \
-do { \
- if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \
- (raw_inode)->xtime = cpu_to_le32((einode)->xtime.tv_sec); \
- if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra)) \
- (raw_inode)->xtime ## _extra = \
- ext4_encode_extra_time(&(einode)->xtime); \
-} while (0)
+#define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode) \
+ EXT4_INODE_SET_XTIME_VAL(xtime, inode, raw_inode, (inode)->xtime)
+
+#define EXT4_INODE_SET_CTIME(inode, raw_inode) \
+ EXT4_INODE_SET_XTIME_VAL(i_ctime, inode, raw_inode, inode_get_ctime(inode))
+
+#define EXT4_EINODE_SET_XTIME(xtime, einode, raw_inode) \
+ if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \
+ EXT4_INODE_SET_XTIME_VAL(xtime, &((einode)->vfs_inode), \
+ raw_inode, (einode)->xtime)
+
+#define EXT4_INODE_GET_XTIME_VAL(xtime, inode, raw_inode) \
+ (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra) ? \
+ ext4_decode_extra_time((raw_inode)->xtime, \
+ (raw_inode)->xtime ## _extra) : \
+ (struct timespec64) { \
+ .tv_sec = (signed)le32_to_cpu((raw_inode)->xtime) \
+ })
#define EXT4_INODE_GET_XTIME(xtime, inode, raw_inode) \
do { \
- (inode)->xtime.tv_sec = (signed)le32_to_cpu((raw_inode)->xtime); \
- if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) { \
- ext4_decode_extra_time(&(inode)->xtime, \
- raw_inode->xtime ## _extra); \
- } \
- else \
- (inode)->xtime.tv_nsec = 0; \
+ (inode)->xtime = EXT4_INODE_GET_XTIME_VAL(xtime, inode, raw_inode); \
} while (0)
+#define EXT4_INODE_GET_CTIME(inode, raw_inode) \
+do { \
+ inode_set_ctime_to_ts(inode, \
+ EXT4_INODE_GET_XTIME_VAL(i_ctime, inode, raw_inode)); \
+} while (0)
-#define EXT4_EINODE_GET_XTIME(xtime, einode, raw_inode) \
-do { \
- if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \
- (einode)->xtime.tv_sec = \
- (signed)le32_to_cpu((raw_inode)->xtime); \
- else \
- (einode)->xtime.tv_sec = 0; \
- if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra)) \
- ext4_decode_extra_time(&(einode)->xtime, \
- raw_inode->xtime ## _extra); \
- else \
- (einode)->xtime.tv_nsec = 0; \
+#define EXT4_EINODE_GET_XTIME(xtime, einode, raw_inode) \
+do { \
+ if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \
+ (einode)->xtime = \
+ EXT4_INODE_GET_XTIME_VAL(xtime, &(einode->vfs_inode), \
+ raw_inode); \
+ else \
+ (einode)->xtime = (struct timespec64){0, 0}; \
} while (0)
#define i_disk_version osd1.linux1.l_i_version
@@ -1235,6 +1238,7 @@ struct ext4_inode_info {
#define EXT4_MOUNT2_MB_OPTIMIZE_SCAN 0x00000080 /* Optimize group
* scanning in mballoc
*/
+#define EXT4_MOUNT2_ABORT 0x00000100 /* Abort filesystem */
#define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \
~EXT4_MOUNT_##opt
@@ -1252,10 +1256,8 @@ struct ext4_inode_info {
#define ext4_test_and_set_bit __test_and_set_bit_le
#define ext4_set_bit __set_bit_le
-#define ext4_set_bit_atomic ext2_set_bit_atomic
#define ext4_test_and_clear_bit __test_and_clear_bit_le
#define ext4_clear_bit __clear_bit_le
-#define ext4_clear_bit_atomic ext2_clear_bit_atomic
#define ext4_test_bit test_bit_le
#define ext4_find_next_zero_bit find_next_zero_bit_le
#define ext4_find_next_bit find_next_bit_le
@@ -1702,10 +1704,13 @@ struct ext4_sb_info {
const char *s_last_error_func;
time64_t s_last_error_time;
/*
- * If we are in a context where we cannot update error information in
- * the on-disk superblock, we queue this work to do it.
+ * If we are in a context where we cannot update the on-disk
+ * superblock, we queue the work here. This is used to update
+ * the error information in the superblock, and for periodic
+ * updates of the superblock called from the commit callback
+ * function.
*/
- struct work_struct s_error_work;
+ struct work_struct s_sb_upd_work;
/* Ext4 fast commit sub transaction ID */
atomic_t s_fc_subtid;
@@ -1798,7 +1803,6 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
*/
enum {
EXT4_MF_MNTDIR_SAMPLED,
- EXT4_MF_FS_ABORTED, /* Fatal error detected */
EXT4_MF_FC_INELIGIBLE /* Fast commit ineligible */
};
@@ -2222,9 +2226,9 @@ extern int ext4_feature_set_ok(struct super_block *sb, int readonly);
#define EXT4_FLAGS_SHUTDOWN 1
#define EXT4_FLAGS_BDEV_IS_DAX 2
-static inline int ext4_forced_shutdown(struct ext4_sb_info *sbi)
+static inline int ext4_forced_shutdown(struct super_block *sb)
{
- return test_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags);
+ return test_bit(EXT4_FLAGS_SHUTDOWN, &EXT4_SB(sb)->s_ext4_flags);
}
/*
@@ -2702,7 +2706,6 @@ extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
extern int ext4_claim_free_clusters(struct ext4_sb_info *sbi,
s64 nclusters, unsigned int flags);
extern ext4_fsblk_t ext4_count_free_clusters(struct super_block *);
-extern void ext4_check_blocks_bitmap(struct super_block *);
extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
ext4_group_t block_group,
struct buffer_head ** bh);
@@ -2858,7 +2861,6 @@ extern void ext4_free_inode(handle_t *, struct inode *);
extern struct inode * ext4_orphan_get(struct super_block *, unsigned long);
extern unsigned long ext4_count_free_inodes(struct super_block *);
extern unsigned long ext4_count_dirs(struct super_block *);
-extern void ext4_check_inodes_bitmap(struct super_block *);
extern void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap);
extern int ext4_init_inode_table(struct super_block *sb,
ext4_group_t group, int barrier);
@@ -2901,7 +2903,6 @@ extern int ext4_mb_init(struct super_block *);
extern int ext4_mb_release(struct super_block *);
extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *,
struct ext4_allocation_request *, int *);
-extern int ext4_mb_reserve_blocks(struct super_block *, int);
extern void ext4_discard_preallocations(struct inode *, unsigned int);
extern int __init ext4_init_mballoc(void);
extern void ext4_exit_mballoc(void);
@@ -2924,6 +2925,10 @@ extern int ext4_trim_fs(struct super_block *, struct fstrim_range *);
extern void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid);
extern void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block,
int len, int state);
+static inline bool ext4_mb_cr_expensive(enum criteria cr)
+{
+ return cr >= CR_GOAL_LEN_SLOW;
+}
/* inode.c */
void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw,
@@ -2977,7 +2982,6 @@ extern void ext4_evict_inode(struct inode *);
extern void ext4_clear_inode(struct inode *);
extern int ext4_file_getattr(struct mnt_idmap *, const struct path *,
struct kstat *, u32, unsigned int);
-extern int ext4_sync_inode(handle_t *, struct inode *);
extern void ext4_dirty_inode(struct inode *, int);
extern int ext4_change_inode_journal_flag(struct inode *, int);
extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
@@ -3084,6 +3088,8 @@ extern const char *ext4_decode_error(struct super_block *sb, int errno,
extern void ext4_mark_group_bitmap_corrupted(struct super_block *sb,
ext4_group_t block_group,
unsigned int flags);
+extern unsigned int ext4_num_base_meta_blocks(struct super_block *sb,
+ ext4_group_t block_group);
extern __printf(7, 8)
void __ext4_error(struct super_block *, const char *, unsigned int, bool,
@@ -3525,8 +3531,6 @@ extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin);
/* inline.c */
extern int ext4_get_max_inline_size(struct inode *inode);
extern int ext4_find_inline_data_nolock(struct inode *inode);
-extern int ext4_init_inline_data(handle_t *handle, struct inode *inode,
- unsigned int len);
extern int ext4_destroy_inline_data(handle_t *handle, struct inode *inode);
int ext4_readpage_inline(struct inode *inode, struct folio *folio);
@@ -3774,8 +3778,6 @@ static inline void set_bitmap_uptodate(struct buffer_head *bh)
set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state);
}
-#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
-
/* For ioend & aio unwritten conversion wait queues */
#define EXT4_WQ_HASH_SZ 37
#define ext4_ioend_wq(v) (&ext4__ioend_wq[((unsigned long)(v)) %\
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 77f318ec8abb..d1a2e6624401 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -67,11 +67,12 @@ static int ext4_journal_check_start(struct super_block *sb)
might_sleep();
- if (unlikely(ext4_forced_shutdown(EXT4_SB(sb))))
+ if (unlikely(ext4_forced_shutdown(sb)))
return -EIO;
- if (sb_rdonly(sb))
+ if (WARN_ON_ONCE(sb_rdonly(sb)))
return -EROFS;
+
WARN_ON(sb->s_writers.frozen == SB_FREEZE_COMPLETE);
journal = EXT4_SB(sb)->s_journal;
/*
@@ -234,8 +235,7 @@ int __ext4_journal_get_write_access(const char *where, unsigned int line,
might_sleep();
- if (bh->b_bdev->bd_super)
- ext4_check_bdev_write_error(bh->b_bdev->bd_super);
+ ext4_check_bdev_write_error(sb);
if (ext4_handle_valid(handle)) {
err = jbd2_journal_get_write_access(handle, bh);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index e4115d338f10..202c76996b62 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4476,12 +4476,12 @@ retry:
map.m_lblk += ret;
map.m_len = len = len - ret;
epos = (loff_t)map.m_lblk << inode->i_blkbits;
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
if (new_size) {
if (epos > new_size)
epos = new_size;
if (ext4_update_inode_size(inode, epos) & 0x1)
- inode->i_mtime = inode->i_ctime;
+ inode->i_mtime = inode_get_ctime(inode);
}
ret2 = ext4_mark_inode_dirty(handle, inode);
ext4_update_inode_fsync_trans(handle, inode, 1);
@@ -4617,7 +4617,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,
/* Now release the pages and zero block aligned part of pages */
truncate_pagecache_range(inode, start, end - 1);
- inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
flags);
@@ -4642,7 +4642,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,
goto out_mutex;
}
- inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
if (new_size)
ext4_update_inode_size(inode, new_size);
ret = ext4_mark_inode_dirty(handle, inode);
@@ -5378,7 +5378,7 @@ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len)
up_write(&EXT4_I(inode)->i_data_sem);
if (IS_SYNC(inode))
ext4_handle_sync(handle);
- inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
ret = ext4_mark_inode_dirty(handle, inode);
ext4_update_inode_fsync_trans(handle, inode, 1);
@@ -5488,7 +5488,7 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len)
/* Expand file to avoid data loss if there is error while shifting */
inode->i_size += len;
EXT4_I(inode)->i_disksize += len;
- inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
ret = ext4_mark_inode_dirty(handle, inode);
if (ret)
goto out_stop;
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 9b5b8951afb4..6f7de14c0fa8 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -878,23 +878,29 @@ retry:
err1 = __es_remove_extent(inode, lblk, end, NULL, es1);
if (err1 != 0)
goto error;
+ /* Free preallocated extent if it didn't get used. */
+ if (es1) {
+ if (!es1->es_len)
+ __es_free_extent(es1);
+ es1 = NULL;
+ }
err2 = __es_insert_extent(inode, &newes, es2);
if (err2 == -ENOMEM && !ext4_es_must_keep(&newes))
err2 = 0;
if (err2 != 0)
goto error;
+ /* Free preallocated extent if it didn't get used. */
+ if (es2) {
+ if (!es2->es_len)
+ __es_free_extent(es2);
+ es2 = NULL;
+ }
if (sbi->s_cluster_ratio > 1 && test_opt(inode->i_sb, DELALLOC) &&
(status & EXTENT_STATUS_WRITTEN ||
status & EXTENT_STATUS_UNWRITTEN))
__revise_pending(inode, lblk, len);
-
- /* es is pre-allocated but not used, free it. */
- if (es1 && !es1->es_len)
- __es_free_extent(es1);
- if (es2 && !es2->es_len)
- __es_free_extent(es2);
error:
write_unlock(&EXT4_I(inode)->i_es_lock);
if (err1 || err2)
@@ -1491,8 +1497,12 @@ retry:
*/
write_lock(&EXT4_I(inode)->i_es_lock);
err = __es_remove_extent(inode, lblk, end, &reserved, es);
- if (es && !es->es_len)
- __es_free_extent(es);
+ /* Free preallocated extent if it didn't get used. */
+ if (es) {
+ if (!es->es_len)
+ __es_free_extent(es);
+ es = NULL;
+ }
write_unlock(&EXT4_I(inode)->i_es_lock);
if (err)
goto retry;
@@ -2047,19 +2057,25 @@ retry:
err1 = __es_remove_extent(inode, lblk, lblk, NULL, es1);
if (err1 != 0)
goto error;
+ /* Free preallocated extent if it didn't get used. */
+ if (es1) {
+ if (!es1->es_len)
+ __es_free_extent(es1);
+ es1 = NULL;
+ }
err2 = __es_insert_extent(inode, &newes, es2);
if (err2 != 0)
goto error;
+ /* Free preallocated extent if it didn't get used. */
+ if (es2) {
+ if (!es2->es_len)
+ __es_free_extent(es2);
+ es2 = NULL;
+ }
if (allocated)
__insert_pending(inode, lblk);
-
- /* es is pre-allocated but not used, free it. */
- if (es1 && !es1->es_len)
- __es_free_extent(es1);
- if (es2 && !es2->es_len)
- __es_free_extent(es2);
error:
write_unlock(&EXT4_I(inode)->i_es_lock);
if (err1 || err2)
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index c457c8517f0f..6830ea3a6c59 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -131,7 +131,7 @@ static ssize_t ext4_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
struct inode *inode = file_inode(iocb->ki_filp);
- if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
+ if (unlikely(ext4_forced_shutdown(inode->i_sb)))
return -EIO;
if (!iov_iter_count(to))
@@ -153,7 +153,7 @@ static ssize_t ext4_file_splice_read(struct file *in, loff_t *ppos,
{
struct inode *inode = file_inode(in);
- if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
+ if (unlikely(ext4_forced_shutdown(inode->i_sb)))
return -EIO;
return filemap_splice_read(in, ppos, pipe, len, flags);
}
@@ -476,6 +476,11 @@ restart:
* required to change security info in file_modified(), for extending
* I/O, any form of non-overwrite I/O, and unaligned I/O to unwritten
* extents (as partial block zeroing may be required).
+ *
+ * Note that unaligned writes are allowed under shared lock so long as
+ * they are pure overwrites. Otherwise, concurrent unaligned writes risk
+ * data corruption due to partial block zeroing in the dio layer, and so
+ * the I/O must occur exclusively.
*/
if (*ilock_shared &&
((!IS_NOSEC(inode) || *extend || !overwrite ||
@@ -492,21 +497,12 @@ restart:
/*
* Now that locking is settled, determine dio flags and exclusivity
- * requirements. Unaligned writes are allowed under shared lock so long
- * as they are pure overwrites. Set the iomap overwrite only flag as an
- * added precaution in this case. Even though this is unnecessary, we
- * can detect and warn on unexpected -EAGAIN if an unsafe unaligned
- * write is ever submitted.
- *
- * Otherwise, concurrent unaligned writes risk data corruption due to
- * partial block zeroing in the dio layer, and so the I/O must occur
- * exclusively. The inode lock is already held exclusive if the write is
- * non-overwrite or extending, so drain all outstanding dio and set the
- * force wait dio flag.
+ * requirements. We don't use DIO_OVERWRITE_ONLY because we enforce
+ * behavior already. The inode lock is already held exclusive if the
+ * write is non-overwrite or extending, so drain all outstanding dio and
+ * set the force wait dio flag.
*/
- if (*ilock_shared && unaligned_io) {
- *dio_flags = IOMAP_DIO_OVERWRITE_ONLY;
- } else if (!*ilock_shared && (unaligned_io || *extend)) {
+ if (!*ilock_shared && (unaligned_io || *extend)) {
if (iocb->ki_flags & IOCB_NOWAIT) {
ret = -EAGAIN;
goto out;
@@ -608,7 +604,6 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
iomap_ops = &ext4_iomap_overwrite_ops;
ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops,
dio_flags, NULL, 0);
- WARN_ON_ONCE(ret == -EAGAIN && !(iocb->ki_flags & IOCB_NOWAIT));
if (ret == -ENOTBLK)
ret = 0;
@@ -709,7 +704,7 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct inode *inode = file_inode(iocb->ki_filp);
- if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
+ if (unlikely(ext4_forced_shutdown(inode->i_sb)))
return -EIO;
#ifdef CONFIG_FS_DAX
@@ -723,8 +718,7 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
}
#ifdef CONFIG_FS_DAX
-static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf,
- enum page_entry_size pe_size)
+static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf, unsigned int order)
{
int error = 0;
vm_fault_t result;
@@ -740,7 +734,7 @@ static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf,
* read-only.
*
* We check for VM_SHARED rather than vmf->cow_page since the latter is
- * unset for pe_size != PE_SIZE_PTE (i.e. only in do_cow_fault); for
+ * unset for order != 0 (i.e. only in do_cow_fault); for
* other sizes, dax_iomap_fault will handle splitting / fallback so that
* we eventually come back with a COW page.
*/
@@ -764,7 +758,7 @@ retry:
} else {
filemap_invalidate_lock_shared(mapping);
}
- result = dax_iomap_fault(vmf, pe_size, &pfn, &error, &ext4_iomap_ops);
+ result = dax_iomap_fault(vmf, order, &pfn, &error, &ext4_iomap_ops);
if (write) {
ext4_journal_stop(handle);
@@ -773,7 +767,7 @@ retry:
goto retry;
/* Handling synchronous page fault? */
if (result & VM_FAULT_NEEDDSYNC)
- result = dax_finish_sync_fault(vmf, pe_size, pfn);
+ result = dax_finish_sync_fault(vmf, order, pfn);
filemap_invalidate_unlock_shared(mapping);
sb_end_pagefault(sb);
} else {
@@ -785,7 +779,7 @@ retry:
static vm_fault_t ext4_dax_fault(struct vm_fault *vmf)
{
- return ext4_dax_huge_fault(vmf, PE_SIZE_PTE);
+ return ext4_dax_huge_fault(vmf, 0);
}
static const struct vm_operations_struct ext4_dax_vm_ops = {
@@ -807,10 +801,9 @@ static const struct vm_operations_struct ext4_file_vm_ops = {
static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
{
struct inode *inode = file->f_mapping->host;
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- struct dax_device *dax_dev = sbi->s_daxdev;
+ struct dax_device *dax_dev = EXT4_SB(inode->i_sb)->s_daxdev;
- if (unlikely(ext4_forced_shutdown(sbi)))
+ if (unlikely(ext4_forced_shutdown(inode->i_sb)))
return -EIO;
/*
@@ -886,7 +879,7 @@ static int ext4_file_open(struct inode *inode, struct file *filp)
{
int ret;
- if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
+ if (unlikely(ext4_forced_shutdown(inode->i_sb)))
return -EIO;
ret = ext4_sample_last_mounted(inode->i_sb, filp->f_path.mnt);
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 0c56f3a011a1..b40d3b29f7e5 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -131,9 +131,8 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
int ret = 0, err;
bool needs_barrier = false;
struct inode *inode = file->f_mapping->host;
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- if (unlikely(ext4_forced_shutdown(sbi)))
+ if (unlikely(ext4_forced_shutdown(inode->i_sb)))
return -EIO;
ASSERT(ext4_journal_current_handle() == NULL);
@@ -141,14 +140,14 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
trace_ext4_sync_file_enter(file, datasync);
if (sb_rdonly(inode->i_sb)) {
- /* Make sure that we read updated s_mount_flags value */
+ /* Make sure that we read updated s_ext4_flags value */
smp_rmb();
- if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FS_ABORTED))
+ if (ext4_forced_shutdown(inode->i_sb))
ret = -EROFS;
goto out;
}
- if (!sbi->s_journal) {
+ if (!EXT4_SB(inode->i_sb)->s_journal) {
ret = ext4_fsync_nojournal(file, start, end, datasync,
&needs_barrier);
if (needs_barrier)
diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c
index 46c3423ddfa1..deabe29da7fb 100644
--- a/fs/ext4/hash.c
+++ b/fs/ext4/hash.c
@@ -300,7 +300,7 @@ int ext4fs_dirhash(const struct inode *dir, const char *name, int len,
unsigned char *buff;
struct qstr qstr = {.name = name, .len = len };
- if (len && IS_CASEFOLDED(dir) && um &&
+ if (len && IS_CASEFOLDED(dir) &&
(!IS_ENCRYPTED(dir) || fscrypt_has_encryption_key(dir))) {
buff = kzalloc(sizeof(char) * PATH_MAX, GFP_KERNEL);
if (!buff)
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 754f961cd9fd..b65058d972f9 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -950,7 +950,7 @@ struct inode *__ext4_new_inode(struct mnt_idmap *idmap,
sb = dir->i_sb;
sbi = EXT4_SB(sb);
- if (unlikely(ext4_forced_shutdown(sbi)))
+ if (unlikely(ext4_forced_shutdown(sb)))
return ERR_PTR(-EIO);
ngroups = ext4_get_groups_count(sb);
@@ -1250,7 +1250,7 @@ got:
inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb);
/* This is the optimal IO size (for stat), not the fs block size */
inode->i_blocks = 0;
- inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
ei->i_crtime = inode->i_mtime;
memset(ei->i_data, 0, sizeof(ei->i_data));
@@ -1523,12 +1523,6 @@ int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,
int num, ret = 0, used_blks = 0;
unsigned long used_inos = 0;
- /* This should not happen, but just to be sure check this */
- if (sb_rdonly(sb)) {
- ret = 1;
- goto out;
- }
-
gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
if (!gdp || !grp)
goto out;
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index a4b7e4bc32d4..012d9259ff53 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -228,7 +228,7 @@ static void ext4_write_inline_data(struct inode *inode, struct ext4_iloc *iloc,
struct ext4_inode *raw_inode;
int cp_len = 0;
- if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
+ if (unlikely(ext4_forced_shutdown(inode->i_sb)))
return;
BUG_ON(!EXT4_I(inode)->i_inline_off);
@@ -1037,7 +1037,7 @@ static int ext4_add_dirent_to_inline(handle_t *handle,
* happen is that the times are slightly out of date
* and/or different from the directory change time.
*/
- dir->i_mtime = dir->i_ctime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
ext4_update_dx_flag(dir);
inode_inc_iversion(dir);
return 1;
@@ -1991,7 +1991,7 @@ out:
ext4_orphan_del(handle, inode);
if (err == 0) {
- inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
err = ext4_mark_inode_dirty(handle, inode);
if (IS_SYNC(inode))
ext4_handle_sync(handle);
diff --git a/fs/ext4/inode-test.c b/fs/ext4/inode-test.c
index 7935ea6cf92c..f0c0fd507fbc 100644
--- a/fs/ext4/inode-test.c
+++ b/fs/ext4/inode-test.c
@@ -245,9 +245,9 @@ static void inode_test_xtimestamp_decoding(struct kunit *test)
struct timestamp_expectation *test_param =
(struct timestamp_expectation *)(test->param_value);
- timestamp.tv_sec = get_32bit_time(test_param);
- ext4_decode_extra_time(&timestamp,
- cpu_to_le32(test_param->extra_bits));
+ timestamp = ext4_decode_extra_time(
+ cpu_to_le32(get_32bit_time(test_param)),
+ cpu_to_le32(test_param->extra_bits));
KUNIT_EXPECT_EQ_MSG(test,
test_param->expected.tv_sec,
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 43775a6ca505..4ce35f1c8b0a 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1114,7 +1114,7 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
pgoff_t index;
unsigned from, to;
- if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
+ if (unlikely(ext4_forced_shutdown(inode->i_sb)))
return -EIO;
trace_ext4_write_begin(inode, pos, len);
@@ -1569,7 +1569,7 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd,
if (folio->index < mpd->first_page)
continue;
- if (folio->index + folio_nr_pages(folio) - 1 > end)
+ if (folio_next_index(folio) - 1 > end)
continue;
BUG_ON(!folio_test_locked(folio));
BUG_ON(folio_test_writeback(folio));
@@ -2213,8 +2213,7 @@ static int mpage_map_and_submit_extent(handle_t *handle,
if (err < 0) {
struct super_block *sb = inode->i_sb;
- if (ext4_forced_shutdown(EXT4_SB(sb)) ||
- ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED))
+ if (ext4_forced_shutdown(sb))
goto invalidate_dirty_pages;
/*
* Let the uper layers retry transient errors.
@@ -2455,7 +2454,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
if (mpd->map.m_len == 0)
mpd->first_page = folio->index;
- mpd->next_page = folio->index + folio_nr_pages(folio);
+ mpd->next_page = folio_next_index(folio);
/*
* Writeout when we cannot modify metadata is simple.
* Just submit the page. For data=journal mode we
@@ -2534,14 +2533,13 @@ static int ext4_do_writepages(struct mpage_da_data *mpd)
* If the filesystem has aborted, it is read-only, so return
* right away instead of dumping stack traces later on that
* will obscure the real source of the problem. We test
- * EXT4_MF_FS_ABORTED instead of sb->s_flag's SB_RDONLY because
+ * fs shutdown state instead of sb->s_flag's SB_RDONLY because
* the latter could be true if the filesystem is mounted
* read-only, and in that case, ext4_writepages should
* *never* be called, so if that ever happens, we would want
* the stack trace.
*/
- if (unlikely(ext4_forced_shutdown(EXT4_SB(mapping->host->i_sb)) ||
- ext4_test_mount_flag(inode->i_sb, EXT4_MF_FS_ABORTED))) {
+ if (unlikely(ext4_forced_shutdown(mapping->host->i_sb))) {
ret = -EROFS;
goto out_writepages;
}
@@ -2759,7 +2757,7 @@ static int ext4_writepages(struct address_space *mapping,
int ret;
int alloc_ctx;
- if (unlikely(ext4_forced_shutdown(EXT4_SB(sb))))
+ if (unlikely(ext4_forced_shutdown(sb)))
return -EIO;
alloc_ctx = ext4_writepages_down_read(sb);
@@ -2798,16 +2796,16 @@ static int ext4_dax_writepages(struct address_space *mapping,
int ret;
long nr_to_write = wbc->nr_to_write;
struct inode *inode = mapping->host;
- struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
int alloc_ctx;
- if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
+ if (unlikely(ext4_forced_shutdown(inode->i_sb)))
return -EIO;
alloc_ctx = ext4_writepages_down_read(inode->i_sb);
trace_ext4_writepages(inode, wbc);
- ret = dax_writeback_mapping_range(mapping, sbi->s_daxdev, wbc);
+ ret = dax_writeback_mapping_range(mapping,
+ EXT4_SB(inode->i_sb)->s_daxdev, wbc);
trace_ext4_writepages_result(inode, wbc, ret,
nr_to_write - wbc->nr_to_write);
ext4_writepages_up_read(inode->i_sb, alloc_ctx);
@@ -2857,7 +2855,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
pgoff_t index;
struct inode *inode = mapping->host;
- if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
+ if (unlikely(ext4_forced_shutdown(inode->i_sb)))
return -EIO;
index = pos >> PAGE_SHIFT;
@@ -2937,14 +2935,73 @@ static int ext4_da_should_update_i_disksize(struct folio *folio,
return 1;
}
+static int ext4_da_do_write_end(struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page)
+{
+ struct inode *inode = mapping->host;
+ loff_t old_size = inode->i_size;
+ bool disksize_changed = false;
+ loff_t new_i_size;
+
+ /*
+ * block_write_end() will mark the inode as dirty with I_DIRTY_PAGES
+ * flag, which all that's needed to trigger page writeback.
+ */
+ copied = block_write_end(NULL, mapping, pos, len, copied, page, NULL);
+ new_i_size = pos + copied;
+
+ /*
+ * It's important to update i_size while still holding page lock,
+ * because page writeout could otherwise come in and zero beyond
+ * i_size.
+ *
+ * Since we are holding inode lock, we are sure i_disksize <=
+ * i_size. We also know that if i_disksize < i_size, there are
+ * delalloc writes pending in the range up to i_size. If the end of
+ * the current write is <= i_size, there's no need to touch
+ * i_disksize since writeback will push i_disksize up to i_size
+ * eventually. If the end of the current write is > i_size and
+ * inside an allocated block which ext4_da_should_update_i_disksize()
+ * checked, we need to update i_disksize here as certain
+ * ext4_writepages() paths not allocating blocks and update i_disksize.
+ */
+ if (new_i_size > inode->i_size) {
+ unsigned long end;
+
+ i_size_write(inode, new_i_size);
+ end = (new_i_size - 1) & (PAGE_SIZE - 1);
+ if (copied && ext4_da_should_update_i_disksize(page_folio(page), end)) {
+ ext4_update_i_disksize(inode, new_i_size);
+ disksize_changed = true;
+ }
+ }
+
+ unlock_page(page);
+ put_page(page);
+
+ if (old_size < pos)
+ pagecache_isize_extended(inode, old_size, pos);
+
+ if (disksize_changed) {
+ handle_t *handle;
+
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+ ext4_mark_inode_dirty(handle, inode);
+ ext4_journal_stop(handle);
+ }
+
+ return copied;
+}
+
static int ext4_da_write_end(struct file *file,
struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata)
{
struct inode *inode = mapping->host;
- loff_t new_i_size;
- unsigned long start, end;
int write_mode = (int)(unsigned long)fsdata;
struct folio *folio = page_folio(page);
@@ -2963,30 +3020,7 @@ static int ext4_da_write_end(struct file *file,
if (unlikely(copied < len) && !PageUptodate(page))
copied = 0;
- start = pos & (PAGE_SIZE - 1);
- end = start + copied - 1;
-
- /*
- * Since we are holding inode lock, we are sure i_disksize <=
- * i_size. We also know that if i_disksize < i_size, there are
- * delalloc writes pending in the range upto i_size. If the end of
- * the current write is <= i_size, there's no need to touch
- * i_disksize since writeback will push i_disksize upto i_size
- * eventually. If the end of the current write is > i_size and
- * inside an allocated block (ext4_da_should_update_i_disksize()
- * check), we need to update i_disksize here as certain
- * ext4_writepages() paths not allocating blocks update i_disksize.
- *
- * Note that we defer inode dirtying to generic_write_end() /
- * ext4_da_write_inline_data_end().
- */
- new_i_size = pos + copied;
- if (copied && new_i_size > inode->i_size &&
- ext4_da_should_update_i_disksize(folio, end))
- ext4_update_i_disksize(inode, new_i_size);
-
- return generic_write_end(file, mapping, pos, len, copied, &folio->page,
- fsdata);
+ return ext4_da_do_write_end(mapping, pos, len, copied, &folio->page);
}
/*
@@ -3986,7 +4020,7 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
if (IS_SYNC(inode))
ext4_handle_sync(handle);
- inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
ret2 = ext4_mark_inode_dirty(handle, inode);
if (unlikely(ret2))
ret = ret2;
@@ -4146,7 +4180,7 @@ out_stop:
if (inode->i_nlink)
ext4_orphan_del(handle, inode);
- inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
err2 = ext4_mark_inode_dirty(handle, inode);
if (unlikely(err2 && !err))
err = err2;
@@ -4249,7 +4283,7 @@ static int ext4_fill_raw_inode(struct inode *inode, struct ext4_inode *raw_inode
}
raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
- EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode);
+ EXT4_INODE_SET_CTIME(inode, raw_inode);
EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode);
EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode);
EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode);
@@ -4858,7 +4892,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
}
}
- EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode);
+ EXT4_INODE_GET_CTIME(inode, raw_inode);
EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode);
EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode);
EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode);
@@ -4940,9 +4974,12 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
"iget: bogus i_mode (%o)", inode->i_mode);
goto bad_inode;
}
- if (IS_CASEFOLDED(inode) && !ext4_has_feature_casefold(inode->i_sb))
+ if (IS_CASEFOLDED(inode) && !ext4_has_feature_casefold(inode->i_sb)) {
ext4_error_inode(inode, function, line, 0,
"casefold flag without casefold feature");
+ ret = -EFSCORRUPTED;
+ goto bad_inode;
+ }
if ((err_str = check_igot_inode(inode, flags)) != NULL) {
ext4_error_inode(inode, function, line, 0, err_str);
ret = -EFSCORRUPTED;
@@ -4981,7 +5018,7 @@ static void __ext4_update_other_inode_time(struct super_block *sb,
spin_unlock(&inode->i_lock);
spin_lock(&ei->i_raw_lock);
- EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode);
+ EXT4_INODE_SET_CTIME(inode, raw_inode);
EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode);
EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode);
ext4_inode_csum_set(inode, raw_inode, ei);
@@ -5131,11 +5168,10 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
{
int err;
- if (WARN_ON_ONCE(current->flags & PF_MEMALLOC) ||
- sb_rdonly(inode->i_sb))
+ if (WARN_ON_ONCE(current->flags & PF_MEMALLOC))
return 0;
- if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
+ if (unlikely(ext4_forced_shutdown(inode->i_sb)))
return -EIO;
if (EXT4_SB(inode->i_sb)->s_journal) {
@@ -5255,7 +5291,7 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
const unsigned int ia_valid = attr->ia_valid;
bool inc_ivers = true;
- if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
+ if (unlikely(ext4_forced_shutdown(inode->i_sb)))
return -EIO;
if (unlikely(IS_IMMUTABLE(inode)))
@@ -5376,10 +5412,8 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
* Update c/mtime on truncate up, ext4_truncate() will
* update c/mtime in shrink case below
*/
- if (!shrink) {
- inode->i_mtime = current_time(inode);
- inode->i_ctime = inode->i_mtime;
- }
+ if (!shrink)
+ inode->i_mtime = inode_set_ctime_current(inode);
if (shrink)
ext4_fc_track_range(handle, inode,
@@ -5537,7 +5571,7 @@ int ext4_getattr(struct mnt_idmap *idmap, const struct path *path,
STATX_ATTR_NODUMP |
STATX_ATTR_VERITY);
- generic_fillattr(idmap, inode, stat);
+ generic_fillattr(idmap, request_mask, inode, stat);
return 0;
}
@@ -5676,7 +5710,7 @@ int ext4_mark_iloc_dirty(handle_t *handle,
{
int err = 0;
- if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) {
+ if (unlikely(ext4_forced_shutdown(inode->i_sb))) {
put_bh(iloc->bh);
return -EIO;
}
@@ -5702,7 +5736,7 @@ ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
{
int err;
- if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
+ if (unlikely(ext4_forced_shutdown(inode->i_sb)))
return -EIO;
err = ext4_get_inode_loc(inode, iloc);
@@ -6140,7 +6174,7 @@ retry_alloc:
if (err == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
goto retry_alloc;
out_ret:
- ret = block_page_mkwrite_return(err);
+ ret = vmf_fs_error(err);
out:
filemap_invalidate_unlock_shared(mapping);
sb_end_pagefault(inode->i_sb);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 331859511f80..0bfe2ce589e2 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -449,7 +449,8 @@ static long swap_inode_boot_loader(struct super_block *sb,
diff = size - size_bl;
swap_inode_data(inode, inode_bl);
- inode->i_ctime = inode_bl->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
+ inode_set_ctime_current(inode_bl);
inode_inc_iversion(inode);
inode->i_generation = get_random_u32();
@@ -663,7 +664,7 @@ static int ext4_ioctl_setflags(struct inode *inode,
ext4_set_inode_flags(inode, false);
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
inode_inc_iversion(inode);
err = ext4_mark_iloc_dirty(handle, inode, &iloc);
@@ -774,7 +775,7 @@ static int ext4_ioctl_setproject(struct inode *inode, __u32 projid)
}
EXT4_I(inode)->i_projid = kprojid;
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
inode_inc_iversion(inode);
out_dirty:
rc = ext4_mark_iloc_dirty(handle, inode, &iloc);
@@ -801,7 +802,7 @@ int ext4_force_shutdown(struct super_block *sb, u32 flags)
if (flags > EXT4_GOING_FLAGS_NOLOGFLUSH)
return -EINVAL;
- if (ext4_forced_shutdown(sbi))
+ if (ext4_forced_shutdown(sb))
return 0;
ext4_msg(sb, KERN_ALERT, "shut down requested (%d)", flags);
@@ -1266,7 +1267,7 @@ static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
}
err = ext4_reserve_inode_write(handle, inode, &iloc);
if (err == 0) {
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
inode_inc_iversion(inode);
inode->i_generation = generation;
err = ext4_mark_iloc_dirty(handle, inode, &iloc);
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 21b903fe546e..1e599305d85f 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -16,6 +16,7 @@
#include <linux/slab.h>
#include <linux/nospec.h>
#include <linux/backing-dev.h>
+#include <linux/freezer.h>
#include <trace/events/ext4.h>
/*
@@ -874,7 +875,7 @@ static void ext4_mb_choose_next_group_p2_aligned(struct ext4_allocation_context
enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups)
{
struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
- struct ext4_group_info *iter, *grp;
+ struct ext4_group_info *iter;
int i;
if (ac->ac_status == AC_STATUS_FOUND)
@@ -883,7 +884,6 @@ static void ext4_mb_choose_next_group_p2_aligned(struct ext4_allocation_context
if (unlikely(sbi->s_mb_stats && ac->ac_flags & EXT4_MB_CR_POWER2_ALIGNED_OPTIMIZED))
atomic_inc(&sbi->s_bal_p2_aligned_bad_suggestions);
- grp = NULL;
for (i = ac->ac_2order; i < MB_NUM_ORDERS(ac->ac_sb); i++) {
if (list_empty(&sbi->s_mb_largest_free_orders[i]))
continue;
@@ -892,28 +892,22 @@ static void ext4_mb_choose_next_group_p2_aligned(struct ext4_allocation_context
read_unlock(&sbi->s_mb_largest_free_orders_locks[i]);
continue;
}
- grp = NULL;
list_for_each_entry(iter, &sbi->s_mb_largest_free_orders[i],
bb_largest_free_order_node) {
if (sbi->s_mb_stats)
atomic64_inc(&sbi->s_bal_cX_groups_considered[CR_POWER2_ALIGNED]);
if (likely(ext4_mb_good_group(ac, iter->bb_group, CR_POWER2_ALIGNED))) {
- grp = iter;
- break;
+ *group = iter->bb_group;
+ ac->ac_flags |= EXT4_MB_CR_POWER2_ALIGNED_OPTIMIZED;
+ read_unlock(&sbi->s_mb_largest_free_orders_locks[i]);
+ return;
}
}
read_unlock(&sbi->s_mb_largest_free_orders_locks[i]);
- if (grp)
- break;
}
- if (!grp) {
- /* Increment cr and search again */
- *new_cr = CR_GOAL_LEN_FAST;
- } else {
- *group = grp->bb_group;
- ac->ac_flags |= EXT4_MB_CR_POWER2_ALIGNED_OPTIMIZED;
- }
+ /* Increment cr and search again if no group is found */
+ *new_cr = CR_GOAL_LEN_FAST;
}
/*
@@ -966,16 +960,25 @@ static void ext4_mb_choose_next_group_goal_fast(struct ext4_allocation_context *
for (i = mb_avg_fragment_size_order(ac->ac_sb, ac->ac_g_ex.fe_len);
i < MB_NUM_ORDERS(ac->ac_sb); i++) {
grp = ext4_mb_find_good_group_avg_frag_lists(ac, i);
- if (grp)
- break;
+ if (grp) {
+ *group = grp->bb_group;
+ ac->ac_flags |= EXT4_MB_CR_GOAL_LEN_FAST_OPTIMIZED;
+ return;
+ }
}
- if (grp) {
- *group = grp->bb_group;
- ac->ac_flags |= EXT4_MB_CR_GOAL_LEN_FAST_OPTIMIZED;
- } else {
+ /*
+ * CR_BEST_AVAIL_LEN works based on the concept that we have
+ * a larger normalized goal len request which can be trimmed to
+ * a smaller goal len such that it can still satisfy original
+ * request len. However, allocation request for non-regular
+ * files never gets normalized.
+ * See function ext4_mb_normalize_request() (EXT4_MB_HINT_DATA).
+ */
+ if (ac->ac_flags & EXT4_MB_HINT_DATA)
*new_cr = CR_BEST_AVAIL_LEN;
- }
+ else
+ *new_cr = CR_GOAL_LEN_SLOW;
}
/*
@@ -1051,18 +1054,16 @@ static void ext4_mb_choose_next_group_best_avail(struct ext4_allocation_context
ac->ac_g_ex.fe_len);
grp = ext4_mb_find_good_group_avg_frag_lists(ac, frag_order);
- if (grp)
- break;
+ if (grp) {
+ *group = grp->bb_group;
+ ac->ac_flags |= EXT4_MB_CR_BEST_AVAIL_LEN_OPTIMIZED;
+ return;
+ }
}
- if (grp) {
- *group = grp->bb_group;
- ac->ac_flags |= EXT4_MB_CR_BEST_AVAIL_LEN_OPTIMIZED;
- } else {
- /* Reset goal length to original goal length before falling into CR_GOAL_LEN_SLOW */
- ac->ac_g_ex.fe_len = ac->ac_orig_goal_len;
- *new_cr = CR_GOAL_LEN_SLOW;
- }
+ /* Reset goal length to original goal length before falling into CR_GOAL_LEN_SLOW */
+ ac->ac_g_ex.fe_len = ac->ac_orig_goal_len;
+ *new_cr = CR_GOAL_LEN_SLOW;
}
static inline int should_optimize_scan(struct ext4_allocation_context *ac)
@@ -1080,8 +1081,9 @@ static inline int should_optimize_scan(struct ext4_allocation_context *ac)
* Return next linear group for allocation. If linear traversal should not be
* performed, this function just returns the same group
*/
-static int
-next_linear_group(struct ext4_allocation_context *ac, int group, int ngroups)
+static ext4_group_t
+next_linear_group(struct ext4_allocation_context *ac, ext4_group_t group,
+ ext4_group_t ngroups)
{
if (!should_optimize_scan(ac))
goto inc_and_return;
@@ -1255,7 +1257,7 @@ void ext4_mb_generate_buddy(struct super_block *sb,
static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
{
ext4_group_t ngroups;
- int blocksize;
+ unsigned int blocksize;
int blocks_per_page;
int groups_per_page;
int err = 0;
@@ -2450,7 +2452,7 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
break;
}
- if (ac->ac_criteria < CR_FAST) {
+ if (!ext4_mb_cr_expensive(ac->ac_criteria)) {
/*
* In CR_GOAL_LEN_FAST and CR_BEST_AVAIL_LEN, we are
* sure that this group will have a large enough
@@ -2553,7 +2555,7 @@ static bool ext4_mb_good_group(struct ext4_allocation_context *ac,
BUG_ON(cr < CR_POWER2_ALIGNED || cr >= EXT4_MB_NUM_CRS);
- if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp) || !grp))
+ if (unlikely(!grp || EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
return false;
free = grp->bb_free;
@@ -2634,7 +2636,12 @@ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac,
free = grp->bb_free;
if (free == 0)
goto out;
- if (cr <= CR_FAST && free < ac->ac_g_ex.fe_len)
+ /*
+ * In all criterias except CR_ANY_FREE we try to avoid groups that
+ * can't possibly satisfy the full goal request due to insufficient
+ * free blocks.
+ */
+ if (cr < CR_ANY_FREE && free < ac->ac_g_ex.fe_len)
goto out;
if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
goto out;
@@ -2658,7 +2665,7 @@ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac,
* sure we locate metadata blocks in the first block group in
* the flex_bg if possible.
*/
- if (cr < CR_FAST &&
+ if (!ext4_mb_cr_expensive(cr) &&
(!sbi->s_log_groups_per_flex ||
((group & ((1 << sbi->s_log_groups_per_flex) - 1)) != 0)) &&
!(ext4_has_group_desc_csum(sb) &&
@@ -2787,8 +2794,8 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
/*
* ac->ac_2order is set only if the fe_len is a power of 2
- * if ac->ac_2order is set we also set criteria to 0 so that we
- * try exact allocation using buddy.
+ * if ac->ac_2order is set we also set criteria to CR_POWER2_ALIGNED
+ * so that we try exact allocation using buddy.
*/
i = fls(ac->ac_g_ex.fe_len);
ac->ac_2order = 0;
@@ -2800,10 +2807,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
* requests upto maximum buddy size we have constructed.
*/
if (i >= sbi->s_mb_order2_reqs && i <= MB_NUM_ORDERS(sb)) {
- /*
- * This should tell if fe_len is exactly power of 2
- */
- if ((ac->ac_g_ex.fe_len & (~(1 << (i - 1)))) == 0)
+ if (is_power_of_2(ac->ac_g_ex.fe_len))
ac->ac_2order = array_index_nospec(i - 1,
MB_NUM_ORDERS(sb));
}
@@ -2848,11 +2852,11 @@ repeat:
/*
* Batch reads of the block allocation bitmaps
* to get multiple READs in flight; limit
- * prefetching at cr=0/1, otherwise mballoc can
- * spend a lot of time loading imperfect groups
+ * prefetching at inexpensive CR, otherwise mballoc
+ * can spend a lot of time loading imperfect groups
*/
if ((prefetch_grp == group) &&
- (cr >= CR_FAST ||
+ (ext4_mb_cr_expensive(cr) ||
prefetch_ios < sbi->s_mb_prefetch_limit)) {
nr = sbi->s_mb_prefetch;
if (ext4_has_feature_flex_bg(sb)) {
@@ -3501,11 +3505,10 @@ static void ext4_discard_work(struct work_struct *work)
struct super_block *sb = sbi->s_sb;
struct ext4_free_data *fd, *nfd;
struct ext4_buddy e4b;
- struct list_head discard_list;
+ LIST_HEAD(discard_list);
ext4_group_t grp, load_grp;
int err = 0;
- INIT_LIST_HEAD(&discard_list);
spin_lock(&sbi->s_md_lock);
list_splice_init(&sbi->s_discard_list, &discard_list);
spin_unlock(&sbi->s_md_lock);
@@ -3879,12 +3882,10 @@ void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_free_data *entry, *tmp;
- struct list_head freed_data_list;
+ LIST_HEAD(freed_data_list);
struct list_head *cut_pos = NULL;
bool wake;
- INIT_LIST_HEAD(&freed_data_list);
-
spin_lock(&sbi->s_md_lock);
list_for_each_entry(entry, &sbi->s_freed_data_list, efd_list) {
if (entry->efd_tid != commit_tid)
@@ -4084,7 +4085,7 @@ void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block,
struct ext4_sb_info *sbi = EXT4_SB(sb);
ext4_group_t group;
ext4_grpblk_t blkoff;
- int i, err;
+ int i, err = 0;
int already;
unsigned int clen, clen_changed, thisgrp_len;
@@ -4222,12 +4223,13 @@ ext4_mb_pa_rb_next_iter(ext4_lblk_t new_start, ext4_lblk_t cur_start, struct rb_
static inline void
ext4_mb_pa_assert_overlap(struct ext4_allocation_context *ac,
- ext4_lblk_t start, ext4_lblk_t end)
+ ext4_lblk_t start, loff_t end)
{
struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
struct ext4_prealloc_space *tmp_pa;
- ext4_lblk_t tmp_pa_start, tmp_pa_end;
+ ext4_lblk_t tmp_pa_start;
+ loff_t tmp_pa_end;
struct rb_node *iter;
read_lock(&ei->i_prealloc_lock);
@@ -4236,7 +4238,7 @@ ext4_mb_pa_assert_overlap(struct ext4_allocation_context *ac,
tmp_pa = rb_entry(iter, struct ext4_prealloc_space,
pa_node.inode_node);
tmp_pa_start = tmp_pa->pa_lstart;
- tmp_pa_end = tmp_pa->pa_lstart + EXT4_C2B(sbi, tmp_pa->pa_len);
+ tmp_pa_end = pa_logical_end(sbi, tmp_pa);
spin_lock(&tmp_pa->pa_lock);
if (tmp_pa->pa_deleted == 0)
@@ -4258,14 +4260,14 @@ ext4_mb_pa_assert_overlap(struct ext4_allocation_context *ac,
*/
static inline void
ext4_mb_pa_adjust_overlap(struct ext4_allocation_context *ac,
- ext4_lblk_t *start, ext4_lblk_t *end)
+ ext4_lblk_t *start, loff_t *end)
{
struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
struct ext4_prealloc_space *tmp_pa = NULL, *left_pa = NULL, *right_pa = NULL;
struct rb_node *iter;
- ext4_lblk_t new_start, new_end;
- ext4_lblk_t tmp_pa_start, tmp_pa_end, left_pa_end = -1, right_pa_start = -1;
+ ext4_lblk_t new_start, tmp_pa_start, right_pa_start = -1;
+ loff_t new_end, tmp_pa_end, left_pa_end = -1;
new_start = *start;
new_end = *end;
@@ -4284,7 +4286,7 @@ ext4_mb_pa_adjust_overlap(struct ext4_allocation_context *ac,
tmp_pa = rb_entry(iter, struct ext4_prealloc_space,
pa_node.inode_node);
tmp_pa_start = tmp_pa->pa_lstart;
- tmp_pa_end = tmp_pa->pa_lstart + EXT4_C2B(sbi, tmp_pa->pa_len);
+ tmp_pa_end = pa_logical_end(sbi, tmp_pa);
/* PA must not overlap original request */
spin_lock(&tmp_pa->pa_lock);
@@ -4364,8 +4366,7 @@ ext4_mb_pa_adjust_overlap(struct ext4_allocation_context *ac,
}
if (left_pa) {
- left_pa_end =
- left_pa->pa_lstart + EXT4_C2B(sbi, left_pa->pa_len);
+ left_pa_end = pa_logical_end(sbi, left_pa);
BUG_ON(left_pa_end > ac->ac_o_ex.fe_logical);
}
@@ -4404,8 +4405,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
struct ext4_super_block *es = sbi->s_es;
int bsbits, max;
- ext4_lblk_t end;
- loff_t size, start_off;
+ loff_t size, start_off, end;
loff_t orig_size __maybe_unused;
ext4_lblk_t start;
@@ -4432,7 +4432,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
/* first, let's learn actual file size
* given current request is allocated */
- size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len);
+ size = extent_logical_end(sbi, &ac->ac_o_ex);
size = size << bsbits;
if (size < i_size_read(ac->ac_inode))
size = i_size_read(ac->ac_inode);
@@ -4766,7 +4766,6 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
struct ext4_locality_group *lg;
struct ext4_prealloc_space *tmp_pa = NULL, *cpa = NULL;
- loff_t tmp_pa_end;
struct rb_node *iter;
ext4_fsblk_t goal_block;
@@ -4862,9 +4861,7 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
* pa can possibly satisfy the request hence check if it overlaps
* original logical start and stop searching if it doesn't.
*/
- tmp_pa_end = (loff_t)tmp_pa->pa_lstart + EXT4_C2B(sbi, tmp_pa->pa_len);
-
- if (ac->ac_o_ex.fe_logical >= tmp_pa_end) {
+ if (ac->ac_o_ex.fe_logical >= pa_logical_end(sbi, tmp_pa)) {
spin_unlock(&tmp_pa->pa_lock);
goto try_group_pa;
}
@@ -4984,7 +4981,6 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
mb_set_bits(bitmap, entry->efd_start_cluster, entry->efd_count);
n = rb_next(n);
}
- return;
}
/*
@@ -5180,8 +5176,11 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
pa = ac->ac_pa;
if (ac->ac_b_ex.fe_len < ac->ac_orig_goal_len) {
- int new_bex_start;
- int new_bex_end;
+ struct ext4_free_extent ex = {
+ .fe_logical = ac->ac_g_ex.fe_logical,
+ .fe_len = ac->ac_orig_goal_len,
+ };
+ loff_t orig_goal_end = extent_logical_end(sbi, &ex);
/* we can't allocate as much as normalizer wants.
* so, found space must get proper lstart
@@ -5200,29 +5199,23 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
* still cover original start
* 3. Else, keep the best ex at start of original request.
*/
- new_bex_end = ac->ac_g_ex.fe_logical +
- EXT4_C2B(sbi, ac->ac_orig_goal_len);
- new_bex_start = new_bex_end - EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
- if (ac->ac_o_ex.fe_logical >= new_bex_start)
- goto adjust_bex;
+ ex.fe_len = ac->ac_b_ex.fe_len;
- new_bex_start = ac->ac_g_ex.fe_logical;
- new_bex_end =
- new_bex_start + EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
- if (ac->ac_o_ex.fe_logical < new_bex_end)
+ ex.fe_logical = orig_goal_end - EXT4_C2B(sbi, ex.fe_len);
+ if (ac->ac_o_ex.fe_logical >= ex.fe_logical)
goto adjust_bex;
- new_bex_start = ac->ac_o_ex.fe_logical;
- new_bex_end =
- new_bex_start + EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
+ ex.fe_logical = ac->ac_g_ex.fe_logical;
+ if (ac->ac_o_ex.fe_logical < extent_logical_end(sbi, &ex))
+ goto adjust_bex;
+ ex.fe_logical = ac->ac_o_ex.fe_logical;
adjust_bex:
- ac->ac_b_ex.fe_logical = new_bex_start;
+ ac->ac_b_ex.fe_logical = ex.fe_logical;
BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical);
BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len);
- BUG_ON(new_bex_end > (ac->ac_g_ex.fe_logical +
- EXT4_C2B(sbi, ac->ac_orig_goal_len)));
+ BUG_ON(extent_logical_end(sbi, &ex) > orig_goal_end);
}
pa->pa_lstart = ac->ac_b_ex.fe_logical;
@@ -5419,7 +5412,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
struct ext4_group_info *grp = ext4_get_group_info(sb, group);
struct buffer_head *bitmap_bh = NULL;
struct ext4_prealloc_space *pa, *tmp;
- struct list_head list;
+ LIST_HEAD(list);
struct ext4_buddy e4b;
struct ext4_inode_info *ei;
int err;
@@ -5448,7 +5441,6 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
goto out_dbg;
}
- INIT_LIST_HEAD(&list);
ext4_lock_group(sb, group);
list_for_each_entry_safe(pa, tmp,
&grp->bb_prealloc_list, pa_group_list) {
@@ -5529,7 +5521,7 @@ void ext4_discard_preallocations(struct inode *inode, unsigned int needed)
struct buffer_head *bitmap_bh = NULL;
struct ext4_prealloc_space *pa, *tmp;
ext4_group_t group = 0;
- struct list_head list;
+ LIST_HEAD(list);
struct ext4_buddy e4b;
struct rb_node *iter;
int err;
@@ -5546,8 +5538,6 @@ void ext4_discard_preallocations(struct inode *inode, unsigned int needed)
trace_ext4_discard_preallocations(inode,
atomic_read(&ei->i_prealloc_active), needed);
- INIT_LIST_HEAD(&list);
-
if (needed == 0)
needed = UINT_MAX;
@@ -5671,7 +5661,7 @@ static inline void ext4_mb_show_pa(struct super_block *sb)
{
ext4_group_t i, ngroups;
- if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED))
+ if (ext4_forced_shutdown(sb))
return;
ngroups = ext4_get_groups_count(sb);
@@ -5705,7 +5695,7 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
{
struct super_block *sb = ac->ac_sb;
- if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED))
+ if (ext4_forced_shutdown(sb))
return;
mb_debug(sb, "Can't allocate:"
@@ -5738,12 +5728,10 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
#else
static inline void ext4_mb_show_pa(struct super_block *sb)
{
- return;
}
static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac)
{
ext4_mb_show_pa(ac->ac_sb);
- return;
}
#endif
@@ -5769,7 +5757,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
group_pa_eligible = sbi->s_mb_group_prealloc > 0;
inode_pa_eligible = true;
- size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len);
+ size = extent_logical_end(sbi, &ac->ac_o_ex);
isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1)
>> bsbits;
@@ -5865,13 +5853,11 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
{
ext4_group_t group = 0;
struct ext4_buddy e4b;
- struct list_head discard_list;
+ LIST_HEAD(discard_list);
struct ext4_prealloc_space *pa, *tmp;
mb_debug(sb, "discard locality group preallocation\n");
- INIT_LIST_HEAD(&discard_list);
-
spin_lock(&lg->lg_prealloc_lock);
list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order],
pa_node.lg_list,
@@ -5984,12 +5970,9 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
spin_unlock(&lg->lg_prealloc_lock);
/* Now trim the list to be not more than 8 elements */
- if (lg_prealloc_count > 8) {
+ if (lg_prealloc_count > 8)
ext4_mb_discard_lg_preallocations(sb, lg,
order, lg_prealloc_count);
- return;
- }
- return ;
}
/*
@@ -6102,7 +6085,7 @@ ext4_mb_new_blocks_simple(struct ext4_allocation_request *ar, int *errp)
ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb);
ext4_grpblk_t i = 0;
ext4_fsblk_t goal, block;
- struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+ struct ext4_super_block *es = sbi->s_es;
goal = ar->goal;
if (goal < le32_to_cpu(es->s_first_data_block) ||
@@ -6643,7 +6626,6 @@ do_more:
error_return:
brelse(bitmap_bh);
ext4_std_error(sb, err);
- return;
}
/**
@@ -6746,7 +6728,6 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
}
ext4_mb_clear_bb(handle, inode, block, count, flags);
- return;
}
/**
@@ -6926,6 +6907,21 @@ __acquires(bitlock)
return ret;
}
+static ext4_grpblk_t ext4_last_grp_cluster(struct super_block *sb,
+ ext4_group_t grp)
+{
+ if (grp < ext4_get_groups_count(sb))
+ return EXT4_CLUSTERS_PER_GROUP(sb) - 1;
+ return (ext4_blocks_count(EXT4_SB(sb)->s_es) -
+ ext4_group_first_block_no(sb, grp) - 1) >>
+ EXT4_CLUSTER_BITS(sb);
+}
+
+static bool ext4_trim_interrupted(void)
+{
+ return fatal_signal_pending(current) || freezing(current);
+}
+
static int ext4_try_to_trim_range(struct super_block *sb,
struct ext4_buddy *e4b, ext4_grpblk_t start,
ext4_grpblk_t max, ext4_grpblk_t minblocks)
@@ -6933,11 +6929,13 @@ __acquires(ext4_group_lock_ptr(sb, e4b->bd_group))
__releases(ext4_group_lock_ptr(sb, e4b->bd_group))
{
ext4_grpblk_t next, count, free_count;
+ bool set_trimmed = false;
void *bitmap;
bitmap = e4b->bd_bitmap;
- start = (e4b->bd_info->bb_first_free > start) ?
- e4b->bd_info->bb_first_free : start;
+ if (start == 0 && max >= ext4_last_grp_cluster(sb, e4b->bd_group))
+ set_trimmed = true;
+ start = max(e4b->bd_info->bb_first_free, start);
count = 0;
free_count = 0;
@@ -6951,16 +6949,14 @@ __releases(ext4_group_lock_ptr(sb, e4b->bd_group))
int ret = ext4_trim_extent(sb, start, next - start, e4b);
if (ret && ret != -EOPNOTSUPP)
- break;
+ return count;
count += next - start;
}
free_count += next - start;
start = next + 1;
- if (fatal_signal_pending(current)) {
- count = -ERESTARTSYS;
- break;
- }
+ if (ext4_trim_interrupted())
+ return count;
if (need_resched()) {
ext4_unlock_group(sb, e4b->bd_group);
@@ -6972,6 +6968,9 @@ __releases(ext4_group_lock_ptr(sb, e4b->bd_group))
break;
}
+ if (set_trimmed)
+ EXT4_MB_GRP_SET_TRIMMED(e4b->bd_info);
+
return count;
}
@@ -6982,7 +6981,6 @@ __releases(ext4_group_lock_ptr(sb, e4b->bd_group))
* @start: first group block to examine
* @max: last group block to examine
* @minblocks: minimum extent block count
- * @set_trimmed: set the trimmed flag if at least one block is trimmed
*
* ext4_trim_all_free walks through group's block bitmap searching for free
* extents. When the free extent is found, mark it as used in group buddy
@@ -6992,7 +6990,7 @@ __releases(ext4_group_lock_ptr(sb, e4b->bd_group))
static ext4_grpblk_t
ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
ext4_grpblk_t start, ext4_grpblk_t max,
- ext4_grpblk_t minblocks, bool set_trimmed)
+ ext4_grpblk_t minblocks)
{
struct ext4_buddy e4b;
int ret;
@@ -7009,13 +7007,10 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
ext4_lock_group(sb, group);
if (!EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) ||
- minblocks < EXT4_SB(sb)->s_last_trim_minblks) {
+ minblocks < EXT4_SB(sb)->s_last_trim_minblks)
ret = ext4_try_to_trim_range(sb, &e4b, start, max, minblocks);
- if (ret >= 0 && set_trimmed)
- EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info);
- } else {
+ else
ret = 0;
- }
ext4_unlock_group(sb, group);
ext4_mb_unload_buddy(&e4b);
@@ -7048,7 +7043,6 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
ext4_fsblk_t first_data_blk =
le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
ext4_fsblk_t max_blks = ext4_blocks_count(EXT4_SB(sb)->s_es);
- bool whole_group, eof = false;
int ret = 0;
start = range->start >> sb->s_blocksize_bits;
@@ -7067,10 +7061,8 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
if (minlen > EXT4_CLUSTERS_PER_GROUP(sb))
goto out;
}
- if (end >= max_blks - 1) {
+ if (end >= max_blks - 1)
end = max_blks - 1;
- eof = true;
- }
if (end <= first_data_blk)
goto out;
if (start < first_data_blk)
@@ -7084,9 +7076,10 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
/* end now represents the last cluster to discard in this group */
end = EXT4_CLUSTERS_PER_GROUP(sb) - 1;
- whole_group = true;
for (group = first_group; group <= last_group; group++) {
+ if (ext4_trim_interrupted())
+ break;
grp = ext4_get_group_info(sb, group);
if (!grp)
continue;
@@ -7103,13 +7096,11 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
* change it for the last group, note that last_cluster is
* already computed earlier by ext4_get_group_no_and_offset()
*/
- if (group == last_group) {
+ if (group == last_group)
end = last_cluster;
- whole_group = eof ? true : end == EXT4_CLUSTERS_PER_GROUP(sb) - 1;
- }
if (grp->bb_free >= minlen) {
cnt = ext4_trim_all_free(sb, group, first_cluster,
- end, minlen, whole_group);
+ end, minlen);
if (cnt < 0) {
ret = cnt;
break;
@@ -7154,8 +7145,7 @@ ext4_mballoc_query_range(
ext4_lock_group(sb, group);
- start = (e4b.bd_info->bb_first_free > start) ?
- e4b.bd_info->bb_first_free : start;
+ start = max(e4b.bd_info->bb_first_free, start);
if (end >= EXT4_CLUSTERS_PER_GROUP(sb))
end = EXT4_CLUSTERS_PER_GROUP(sb) - 1;
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index df6b5e7c2274..d7aeb5da7d86 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -233,6 +233,20 @@ static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
(fex->fe_start << EXT4_SB(sb)->s_cluster_bits);
}
+static inline loff_t extent_logical_end(struct ext4_sb_info *sbi,
+ struct ext4_free_extent *fex)
+{
+ /* Use loff_t to avoid end exceeding ext4_lblk_t max. */
+ return (loff_t)fex->fe_logical + EXT4_C2B(sbi, fex->fe_len);
+}
+
+static inline loff_t pa_logical_end(struct ext4_sb_info *sbi,
+ struct ext4_prealloc_space *pa)
+{
+ /* Use loff_t to avoid end exceeding ext4_lblk_t max. */
+ return (loff_t)pa->pa_lstart + EXT4_C2B(sbi, pa->pa_len);
+}
+
typedef int (*ext4_mballoc_query_range_fn)(
struct super_block *sb,
ext4_group_t agno,
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index 0aaf38ffcb6e..bd946d0c71b7 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -162,7 +162,7 @@ static int kmmpd(void *data)
memcpy(mmp->mmp_nodename, init_utsname()->nodename,
sizeof(mmp->mmp_nodename));
- while (!kthread_should_stop() && !sb_rdonly(sb)) {
+ while (!kthread_should_stop() && !ext4_forced_shutdown(sb)) {
if (!ext4_has_feature_mmp(sb)) {
ext4_warning(sb, "kmmpd being stopped since MMP feature"
" has been disabled.");
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index b5af2fc03b2f..18a9e7c47975 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -340,10 +340,8 @@ again:
ext4_double_up_write_data_sem(orig_inode, donor_inode);
goto data_copy;
}
- if ((folio_has_private(folio[0]) &&
- !filemap_release_folio(folio[0], 0)) ||
- (folio_has_private(folio[1]) &&
- !filemap_release_folio(folio[1], 0))) {
+ if (!filemap_release_folio(folio[0], 0) ||
+ !filemap_release_folio(folio[1], 0)) {
*err = -EBUSY;
goto drop_data_sem;
}
@@ -362,10 +360,8 @@ data_copy:
/* At this point all buffers in range are uptodate, old mapping layout
* is no longer required, try to drop it now. */
- if ((folio_has_private(folio[0]) &&
- !filemap_release_folio(folio[0], 0)) ||
- (folio_has_private(folio[1]) &&
- !filemap_release_folio(folio[1], 0))) {
+ if (!filemap_release_folio(folio[0], 0) ||
+ !filemap_release_folio(folio[1], 0)) {
*err = -EBUSY;
goto unlock_folios;
}
@@ -392,14 +388,11 @@ data_copy:
for (i = 0; i < block_len_in_page; i++) {
*err = ext4_get_block(orig_inode, orig_blk_offset + i, bh, 0);
if (*err < 0)
- break;
+ goto repair_branches;
bh = bh->b_this_page;
}
- if (!*err)
- *err = block_commit_write(&folio[0]->page, from, from + replaced_size);
- if (unlikely(*err < 0))
- goto repair_branches;
+ block_commit_write(&folio[0]->page, from, from + replaced_size);
/* Even in case of data=writeback it is reasonable to pin
* inode to transaction, to prevent unexpected data loss */
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 0caf6c730ce3..bbda587f76b8 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -343,17 +343,17 @@ static struct ext4_dir_entry_tail *get_dirent_tail(struct inode *inode,
struct buffer_head *bh)
{
struct ext4_dir_entry_tail *t;
+ int blocksize = EXT4_BLOCK_SIZE(inode->i_sb);
#ifdef PARANOID
struct ext4_dir_entry *d, *top;
d = (struct ext4_dir_entry *)bh->b_data;
top = (struct ext4_dir_entry *)(bh->b_data +
- (EXT4_BLOCK_SIZE(inode->i_sb) -
- sizeof(struct ext4_dir_entry_tail)));
- while (d < top && d->rec_len)
+ (blocksize - sizeof(struct ext4_dir_entry_tail)));
+ while (d < top && ext4_rec_len_from_disk(d->rec_len, blocksize))
d = (struct ext4_dir_entry *)(((void *)d) +
- le16_to_cpu(d->rec_len));
+ ext4_rec_len_from_disk(d->rec_len, blocksize));
if (d != top)
return NULL;
@@ -364,7 +364,8 @@ static struct ext4_dir_entry_tail *get_dirent_tail(struct inode *inode,
#endif
if (t->det_reserved_zero1 ||
- le16_to_cpu(t->det_rec_len) != sizeof(struct ext4_dir_entry_tail) ||
+ (ext4_rec_len_from_disk(t->det_rec_len, blocksize) !=
+ sizeof(struct ext4_dir_entry_tail)) ||
t->det_reserved_zero2 ||
t->det_reserved_ft != EXT4_FT_DIR_CSUM)
return NULL;
@@ -445,13 +446,14 @@ static struct dx_countlimit *get_dx_countlimit(struct inode *inode,
struct ext4_dir_entry *dp;
struct dx_root_info *root;
int count_offset;
+ int blocksize = EXT4_BLOCK_SIZE(inode->i_sb);
+ unsigned int rlen = ext4_rec_len_from_disk(dirent->rec_len, blocksize);
- if (le16_to_cpu(dirent->rec_len) == EXT4_BLOCK_SIZE(inode->i_sb))
+ if (rlen == blocksize)
count_offset = 8;
- else if (le16_to_cpu(dirent->rec_len) == 12) {
+ else if (rlen == 12) {
dp = (struct ext4_dir_entry *)(((void *)dirent) + 12);
- if (le16_to_cpu(dp->rec_len) !=
- EXT4_BLOCK_SIZE(inode->i_sb) - 12)
+ if (ext4_rec_len_from_disk(dp->rec_len, blocksize) != blocksize - 12)
return NULL;
root = (struct dx_root_info *)(((void *)dp + 12));
if (root->reserved_zero ||
@@ -1315,6 +1317,7 @@ static int dx_make_map(struct inode *dir, struct buffer_head *bh,
unsigned int buflen = bh->b_size;
char *base = bh->b_data;
struct dx_hash_info h = *hinfo;
+ int blocksize = EXT4_BLOCK_SIZE(dir->i_sb);
if (ext4_has_metadata_csum(dir->i_sb))
buflen -= sizeof(struct ext4_dir_entry_tail);
@@ -1335,11 +1338,12 @@ static int dx_make_map(struct inode *dir, struct buffer_head *bh,
map_tail--;
map_tail->hash = h.hash;
map_tail->offs = ((char *) de - base)>>2;
- map_tail->size = le16_to_cpu(de->rec_len);
+ map_tail->size = ext4_rec_len_from_disk(de->rec_len,
+ blocksize);
count++;
cond_resched();
}
- de = ext4_next_entry(de, dir->i_sb->s_blocksize);
+ de = ext4_next_entry(de, blocksize);
}
return count;
}
@@ -1445,7 +1449,7 @@ int ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname,
struct dx_hash_info *hinfo = &name->hinfo;
int len;
- if (!IS_CASEFOLDED(dir) || !dir->i_sb->s_encoding ||
+ if (!IS_CASEFOLDED(dir) ||
(IS_ENCRYPTED(dir) && !fscrypt_has_encryption_key(dir))) {
cf_name->name = NULL;
return 0;
@@ -1496,7 +1500,7 @@ static bool ext4_match(struct inode *parent,
#endif
#if IS_ENABLED(CONFIG_UNICODE)
- if (parent->i_sb->s_encoding && IS_CASEFOLDED(parent) &&
+ if (IS_CASEFOLDED(parent) &&
(!IS_ENCRYPTED(parent) || fscrypt_has_encryption_key(parent))) {
if (fname->cf_name.name) {
struct qstr cf = {.name = fname->cf_name.name,
@@ -2203,7 +2207,7 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname,
* happen is that the times are slightly out of date
* and/or different from the directory change time.
*/
- dir->i_mtime = dir->i_ctime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
ext4_update_dx_flag(dir);
inode_inc_iversion(dir);
err2 = ext4_mark_inode_dirty(handle, dir);
@@ -2393,7 +2397,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
#if IS_ENABLED(CONFIG_UNICODE)
if (sb_has_strict_encoding(sb) && IS_CASEFOLDED(dir) &&
- sb->s_encoding && utf8_validate(sb->s_encoding, &dentry->d_name))
+ utf8_validate(sb->s_encoding, &dentry->d_name))
return -EINVAL;
#endif
@@ -2799,6 +2803,7 @@ static int ext4_add_nondir(handle_t *handle,
return err;
}
drop_nlink(inode);
+ ext4_mark_inode_dirty(handle, inode);
ext4_orphan_add(handle, inode);
unlock_new_inode(inode);
return err;
@@ -3142,7 +3147,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
struct ext4_dir_entry_2 *de;
handle_t *handle = NULL;
- if (unlikely(ext4_forced_shutdown(EXT4_SB(dir->i_sb))))
+ if (unlikely(ext4_forced_shutdown(dir->i_sb)))
return -EIO;
/* Initialize quotas before so that eventual writes go in
@@ -3197,7 +3202,8 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
* recovery. */
inode->i_size = 0;
ext4_orphan_add(handle, inode);
- inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
+ dir->i_mtime = inode_set_ctime_current(dir);
+ inode_set_ctime_current(inode);
retval = ext4_mark_inode_dirty(handle, inode);
if (retval)
goto end_rmdir;
@@ -3271,7 +3277,7 @@ int __ext4_unlink(struct inode *dir, const struct qstr *d_name,
retval = ext4_delete_entry(handle, dir, de, bh);
if (retval)
goto out_handle;
- dir->i_ctime = dir->i_mtime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
ext4_update_dx_flag(dir);
retval = ext4_mark_inode_dirty(handle, dir);
if (retval)
@@ -3286,7 +3292,7 @@ int __ext4_unlink(struct inode *dir, const struct qstr *d_name,
drop_nlink(inode);
if (!inode->i_nlink)
ext4_orphan_add(handle, inode);
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
retval = ext4_mark_inode_dirty(handle, inode);
if (dentry && !retval)
ext4_fc_track_unlink(handle, dentry);
@@ -3301,7 +3307,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
{
int retval;
- if (unlikely(ext4_forced_shutdown(EXT4_SB(dir->i_sb))))
+ if (unlikely(ext4_forced_shutdown(dir->i_sb)))
return -EIO;
trace_ext4_unlink_enter(dir, dentry);
@@ -3369,7 +3375,7 @@ static int ext4_symlink(struct mnt_idmap *idmap, struct inode *dir,
struct fscrypt_str disk_link;
int retries = 0;
- if (unlikely(ext4_forced_shutdown(EXT4_SB(dir->i_sb))))
+ if (unlikely(ext4_forced_shutdown(dir->i_sb)))
return -EIO;
err = fscrypt_prepare_symlink(dir, symname, len, dir->i_sb->s_blocksize,
@@ -3436,6 +3442,7 @@ retry:
err_drop_inode:
clear_nlink(inode);
+ ext4_mark_inode_dirty(handle, inode);
ext4_orphan_add(handle, inode);
unlock_new_inode(inode);
if (handle)
@@ -3463,7 +3470,7 @@ retry:
if (IS_DIRSYNC(dir))
ext4_handle_sync(handle);
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
ext4_inc_count(inode);
ihold(inode);
@@ -3641,8 +3648,7 @@ static int ext4_setent(handle_t *handle, struct ext4_renament *ent,
if (ext4_has_feature_filetype(ent->dir->i_sb))
ent->de->file_type = file_type;
inode_inc_iversion(ent->dir);
- ent->dir->i_ctime = ent->dir->i_mtime =
- current_time(ent->dir);
+ ent->dir->i_mtime = inode_set_ctime_current(ent->dir);
retval = ext4_mark_inode_dirty(handle, ent->dir);
BUFFER_TRACE(ent->bh, "call ext4_handle_dirty_metadata");
if (!ent->inlined) {
@@ -3941,7 +3947,7 @@ static int ext4_rename(struct mnt_idmap *idmap, struct inode *old_dir,
* Like most other Unix systems, set the ctime for inodes on a
* rename.
*/
- old.inode->i_ctime = current_time(old.inode);
+ inode_set_ctime_current(old.inode);
retval = ext4_mark_inode_dirty(handle, old.inode);
if (unlikely(retval))
goto end_rename;
@@ -3955,9 +3961,9 @@ static int ext4_rename(struct mnt_idmap *idmap, struct inode *old_dir,
if (new.inode) {
ext4_dec_count(new.inode);
- new.inode->i_ctime = current_time(new.inode);
+ inode_set_ctime_current(new.inode);
}
- old.dir->i_ctime = old.dir->i_mtime = current_time(old.dir);
+ old.dir->i_mtime = inode_set_ctime_current(old.dir);
ext4_update_dx_flag(old.dir);
if (old.dir_bh) {
retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino);
@@ -4021,6 +4027,7 @@ end_rename:
ext4_resetent(handle, &old,
old.inode->i_ino, old_file_type);
drop_nlink(whiteout);
+ ext4_mark_inode_dirty(handle, whiteout);
ext4_orphan_add(handle, whiteout);
}
unlock_new_inode(whiteout);
@@ -4053,7 +4060,6 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
};
u8 new_file_type;
int retval;
- struct timespec64 ctime;
if ((ext4_test_inode_flag(new_dir, EXT4_INODE_PROJINHERIT) &&
!projid_eq(EXT4_I(new_dir)->i_projid,
@@ -4147,9 +4153,8 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
* Like most other Unix systems, set the ctime for inodes on a
* rename.
*/
- ctime = current_time(old.inode);
- old.inode->i_ctime = ctime;
- new.inode->i_ctime = ctime;
+ inode_set_ctime_current(old.inode);
+ inode_set_ctime_current(new.inode);
retval = ext4_mark_inode_dirty(handle, old.inode);
if (unlikely(retval))
goto end_rename;
@@ -4189,7 +4194,7 @@ static int ext4_rename2(struct mnt_idmap *idmap,
{
int err;
- if (unlikely(ext4_forced_shutdown(EXT4_SB(old_dir->i_sb))))
+ if (unlikely(ext4_forced_shutdown(old_dir->i_sb)))
return -EIO;
if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 3621f29ec671..dfdd7e5cf038 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -184,7 +184,7 @@ static int ext4_end_io_end(ext4_io_end_t *io_end)
io_end->handle = NULL; /* Following call will use up the handle */
ret = ext4_convert_unwritten_io_end_vec(handle, io_end);
- if (ret < 0 && !ext4_forced_shutdown(EXT4_SB(inode->i_sb))) {
+ if (ret < 0 && !ext4_forced_shutdown(inode->i_sb)) {
ext4_msg(inode->i_sb, KERN_EMERG,
"failed to convert unwritten extents to written "
"extents -- potential data loss! "
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index c94ebf704616..dbebd8b3127e 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -93,6 +93,7 @@ static int ext4_get_tree(struct fs_context *fc);
static int ext4_reconfigure(struct fs_context *fc);
static void ext4_fc_free(struct fs_context *fc);
static int ext4_init_fs_context(struct fs_context *fc);
+static void ext4_kill_sb(struct super_block *sb);
static const struct fs_parameter_spec ext4_param_specs[];
/*
@@ -135,12 +136,12 @@ static struct file_system_type ext2_fs_type = {
.name = "ext2",
.init_fs_context = ext4_init_fs_context,
.parameters = ext4_param_specs,
- .kill_sb = kill_block_super,
+ .kill_sb = ext4_kill_sb,
.fs_flags = FS_REQUIRES_DEV,
};
MODULE_ALIAS_FS("ext2");
MODULE_ALIAS("ext2");
-#define IS_EXT2_SB(sb) ((sb)->s_bdev->bd_holder == &ext2_fs_type)
+#define IS_EXT2_SB(sb) ((sb)->s_type == &ext2_fs_type)
#else
#define IS_EXT2_SB(sb) (0)
#endif
@@ -151,12 +152,12 @@ static struct file_system_type ext3_fs_type = {
.name = "ext3",
.init_fs_context = ext4_init_fs_context,
.parameters = ext4_param_specs,
- .kill_sb = kill_block_super,
+ .kill_sb = ext4_kill_sb,
.fs_flags = FS_REQUIRES_DEV,
};
MODULE_ALIAS_FS("ext3");
MODULE_ALIAS("ext3");
-#define IS_EXT3_SB(sb) ((sb)->s_bdev->bd_holder == &ext3_fs_type)
+#define IS_EXT3_SB(sb) ((sb)->s_type == &ext3_fs_type)
static inline void __ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags,
@@ -433,6 +434,57 @@ static time64_t __ext4_get_tstamp(__le32 *lo, __u8 *hi)
#define ext4_get_tstamp(es, tstamp) \
__ext4_get_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi)
+#define EXT4_SB_REFRESH_INTERVAL_SEC (3600) /* seconds (1 hour) */
+#define EXT4_SB_REFRESH_INTERVAL_KB (16384) /* kilobytes (16MB) */
+
+/*
+ * The ext4_maybe_update_superblock() function checks and updates the
+ * superblock if needed.
+ *
+ * This function is designed to update the on-disk superblock only under
+ * certain conditions to prevent excessive disk writes and unnecessary
+ * waking of the disk from sleep. The superblock will be updated if:
+ * 1. More than an hour has passed since the last superblock update, and
+ * 2. More than 16MB have been written since the last superblock update.
+ *
+ * @sb: The superblock
+ */
+static void ext4_maybe_update_superblock(struct super_block *sb)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_super_block *es = sbi->s_es;
+ journal_t *journal = sbi->s_journal;
+ time64_t now;
+ __u64 last_update;
+ __u64 lifetime_write_kbytes;
+ __u64 diff_size;
+
+ if (sb_rdonly(sb) || !(sb->s_flags & SB_ACTIVE) ||
+ !journal || (journal->j_flags & JBD2_UNMOUNT))
+ return;
+
+ now = ktime_get_real_seconds();
+ last_update = ext4_get_tstamp(es, s_wtime);
+
+ if (likely(now - last_update < EXT4_SB_REFRESH_INTERVAL_SEC))
+ return;
+
+ lifetime_write_kbytes = sbi->s_kbytes_written +
+ ((part_stat_read(sb->s_bdev, sectors[STAT_WRITE]) -
+ sbi->s_sectors_written_start) >> 1);
+
+ /* Get the number of kilobytes not written to disk to account
+ * for statistics and compare with a multiple of 16 MB. This
+ * is used to determine when the next superblock commit should
+ * occur (i.e. not more often than once per 16MB if there was
+ * less written in an hour).
+ */
+ diff_size = lifetime_write_kbytes - le64_to_cpu(es->s_kbytes_written);
+
+ if (diff_size > EXT4_SB_REFRESH_INTERVAL_KB)
+ schedule_work(&EXT4_SB(sb)->s_sb_upd_work);
+}
+
/*
* The del_gendisk() function uninitializes the disk-specific data
* structures, including the bdi structure, without telling anyone
@@ -459,6 +511,7 @@ static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn)
BUG_ON(txn->t_state == T_FINISHED);
ext4_process_freed_data(sb, txn->t_tid);
+ ext4_maybe_update_superblock(sb);
spin_lock(&sbi->s_md_lock);
while (!list_empty(&txn->t_private_list)) {
@@ -657,7 +710,7 @@ static void ext4_handle_error(struct super_block *sb, bool force_ro, int error,
WARN_ON_ONCE(1);
if (!continue_fs && !sb_rdonly(sb)) {
- ext4_set_mount_flag(sb, EXT4_MF_FS_ABORTED);
+ set_bit(EXT4_FLAGS_SHUTDOWN, &EXT4_SB(sb)->s_ext4_flags);
if (journal)
jbd2_journal_abort(journal, -EIO);
}
@@ -671,7 +724,7 @@ static void ext4_handle_error(struct super_block *sb, bool force_ro, int error,
* defer superblock flushing to a workqueue.
*/
if (continue_fs && journal)
- schedule_work(&EXT4_SB(sb)->s_error_work);
+ schedule_work(&EXT4_SB(sb)->s_sb_upd_work);
else
ext4_commit_super(sb);
}
@@ -698,10 +751,10 @@ static void ext4_handle_error(struct super_block *sb, bool force_ro, int error,
sb->s_flags |= SB_RDONLY;
}
-static void flush_stashed_error_work(struct work_struct *work)
+static void update_super_work(struct work_struct *work)
{
struct ext4_sb_info *sbi = container_of(work, struct ext4_sb_info,
- s_error_work);
+ s_sb_upd_work);
journal_t *journal = sbi->s_journal;
handle_t *handle;
@@ -715,6 +768,7 @@ static void flush_stashed_error_work(struct work_struct *work)
*/
if (!sb_rdonly(sbi->s_sb) && journal) {
struct buffer_head *sbh = sbi->s_sbh;
+ bool call_notify_err;
handle = jbd2_journal_start(journal, 1);
if (IS_ERR(handle))
goto write_directly;
@@ -722,6 +776,10 @@ static void flush_stashed_error_work(struct work_struct *work)
jbd2_journal_stop(handle);
goto write_directly;
}
+
+ if (sbi->s_add_error_count > 0)
+ call_notify_err = true;
+
ext4_update_super(sbi->s_sb);
if (buffer_write_io_error(sbh) || !buffer_uptodate(sbh)) {
ext4_msg(sbi->s_sb, KERN_ERR, "previous I/O error to "
@@ -735,7 +793,10 @@ static void flush_stashed_error_work(struct work_struct *work)
goto write_directly;
}
jbd2_journal_stop(handle);
- ext4_notify_error_sysfs(sbi);
+
+ if (call_notify_err)
+ ext4_notify_error_sysfs(sbi);
+
return;
}
write_directly:
@@ -758,7 +819,7 @@ void __ext4_error(struct super_block *sb, const char *function,
struct va_format vaf;
va_list args;
- if (unlikely(ext4_forced_shutdown(EXT4_SB(sb))))
+ if (unlikely(ext4_forced_shutdown(sb)))
return;
trace_ext4_error(sb, function, line);
@@ -783,7 +844,7 @@ void __ext4_error_inode(struct inode *inode, const char *function,
va_list args;
struct va_format vaf;
- if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
+ if (unlikely(ext4_forced_shutdown(inode->i_sb)))
return;
trace_ext4_error(inode->i_sb, function, line);
@@ -818,7 +879,7 @@ void __ext4_error_file(struct file *file, const char *function,
struct inode *inode = file_inode(file);
char pathname[80], *path;
- if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
+ if (unlikely(ext4_forced_shutdown(inode->i_sb)))
return;
trace_ext4_error(inode->i_sb, function, line);
@@ -898,7 +959,7 @@ void __ext4_std_error(struct super_block *sb, const char *function,
char nbuf[16];
const char *errstr;
- if (unlikely(ext4_forced_shutdown(EXT4_SB(sb))))
+ if (unlikely(ext4_forced_shutdown(sb)))
return;
/* Special case: if the error is EROFS, and we're not already
@@ -992,7 +1053,7 @@ __acquires(bitlock)
struct va_format vaf;
va_list args;
- if (unlikely(ext4_forced_shutdown(EXT4_SB(sb))))
+ if (unlikely(ext4_forced_shutdown(sb)))
return;
trace_ext4_error(sb, function, line);
@@ -1018,7 +1079,7 @@ __acquires(bitlock)
if (!bdev_read_only(sb->s_bdev)) {
save_error_info(sb, EFSCORRUPTED, ino, block, function,
line);
- schedule_work(&EXT4_SB(sb)->s_error_work);
+ schedule_work(&EXT4_SB(sb)->s_sb_upd_work);
}
return;
}
@@ -1096,54 +1157,6 @@ void ext4_update_dynamic_rev(struct super_block *sb)
*/
}
-static void ext4_bdev_mark_dead(struct block_device *bdev)
-{
- ext4_force_shutdown(bdev->bd_holder, EXT4_GOING_FLAGS_NOLOGFLUSH);
-}
-
-static const struct blk_holder_ops ext4_holder_ops = {
- .mark_dead = ext4_bdev_mark_dead,
-};
-
-/*
- * Open the external journal device
- */
-static struct block_device *ext4_blkdev_get(dev_t dev, struct super_block *sb)
-{
- struct block_device *bdev;
-
- bdev = blkdev_get_by_dev(dev, BLK_OPEN_READ | BLK_OPEN_WRITE, sb,
- &ext4_holder_ops);
- if (IS_ERR(bdev))
- goto fail;
- return bdev;
-
-fail:
- ext4_msg(sb, KERN_ERR,
- "failed to open journal device unknown-block(%u,%u) %ld",
- MAJOR(dev), MINOR(dev), PTR_ERR(bdev));
- return NULL;
-}
-
-/*
- * Release the journal device
- */
-static void ext4_blkdev_remove(struct ext4_sb_info *sbi)
-{
- struct block_device *bdev;
- bdev = sbi->s_journal_bdev;
- if (bdev) {
- /*
- * Invalidate the journal device's buffers. We don't want them
- * floating about in memory - the physical journal device may
- * hotswapped, and it breaks the `ro-after' testing code.
- */
- invalidate_bdev(bdev);
- blkdev_put(bdev, sbi->s_sb);
- sbi->s_journal_bdev = NULL;
- }
-}
-
static inline struct inode *orphan_list_entry(struct list_head *l)
{
return &list_entry(l, struct ext4_inode_info, i_orphan)->vfs_inode;
@@ -1278,10 +1291,10 @@ static void ext4_put_super(struct super_block *sb)
* Unregister sysfs before destroying jbd2 journal.
* Since we could still access attr_journal_task attribute via sysfs
* path which could have sbi->s_journal->j_task as NULL
- * Unregister sysfs before flush sbi->s_error_work.
+ * Unregister sysfs before flush sbi->s_sb_upd_work.
* Since user may read /proc/fs/ext4/xx/mb_groups during umount, If
* read metadata verify failed then will queue error work.
- * flush_stashed_error_work will call start_this_handle may trigger
+ * update_super_work will call start_this_handle may trigger
* BUG_ON.
*/
ext4_unregister_sysfs(sb);
@@ -1293,7 +1306,7 @@ static void ext4_put_super(struct super_block *sb)
ext4_unregister_li_request(sb);
ext4_quotas_off(sb, EXT4_MAXQUOTAS);
- flush_work(&sbi->s_error_work);
+ flush_work(&sbi->s_sb_upd_work);
destroy_workqueue(sbi->rsv_conversion_wq);
ext4_release_orphan_info(sb);
@@ -1339,8 +1352,13 @@ static void ext4_put_super(struct super_block *sb)
sync_blockdev(sb->s_bdev);
invalidate_bdev(sb->s_bdev);
if (sbi->s_journal_bdev) {
+ /*
+ * Invalidate the journal device's buffers. We don't want them
+ * floating about in memory - the physical journal device may
+ * hotswapped, and it breaks the `ro-after' testing code.
+ */
sync_blockdev(sbi->s_journal_bdev);
- ext4_blkdev_remove(sbi);
+ invalidate_bdev(sbi->s_journal_bdev);
}
ext4_xattr_destroy_cache(sbi->s_ea_inode_cache);
@@ -1897,6 +1915,7 @@ static const struct mount_opts {
{Opt_fc_debug_force, EXT4_MOUNT2_JOURNAL_FAST_COMMIT,
MOPT_SET | MOPT_2 | MOPT_EXT4_ONLY},
#endif
+ {Opt_abort, EXT4_MOUNT2_ABORT, MOPT_SET | MOPT_2},
{Opt_err, 0, 0}
};
@@ -1965,8 +1984,6 @@ struct ext4_fs_context {
unsigned int mask_s_mount_opt;
unsigned int vals_s_mount_opt2;
unsigned int mask_s_mount_opt2;
- unsigned long vals_s_mount_flags;
- unsigned long mask_s_mount_flags;
unsigned int opt_flags; /* MOPT flags */
unsigned int spec;
u32 s_max_batch_time;
@@ -2117,12 +2134,6 @@ EXT4_SET_CTX(mount_opt2);
EXT4_CLEAR_CTX(mount_opt2);
EXT4_TEST_CTX(mount_opt2);
-static inline void ctx_set_mount_flag(struct ext4_fs_context *ctx, int bit)
-{
- set_bit(bit, &ctx->mask_s_mount_flags);
- set_bit(bit, &ctx->vals_s_mount_flags);
-}
-
static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
struct ext4_fs_context *ctx = fc->fs_private;
@@ -2186,9 +2197,6 @@ static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param)
ext4_msg(NULL, KERN_WARNING, "Ignoring removed %s option",
param->key);
return 0;
- case Opt_abort:
- ctx_set_mount_flag(ctx, EXT4_MF_FS_ABORTED);
- return 0;
case Opt_inlinecrypt:
#ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT
ctx_set_flags(ctx, SB_INLINECRYPT);
@@ -2842,8 +2850,6 @@ static void ext4_apply_options(struct fs_context *fc, struct super_block *sb)
sbi->s_mount_opt |= ctx->vals_s_mount_opt;
sbi->s_mount_opt2 &= ~ctx->mask_s_mount_opt2;
sbi->s_mount_opt2 |= ctx->vals_s_mount_opt2;
- sbi->s_mount_flags &= ~ctx->mask_s_mount_flags;
- sbi->s_mount_flags |= ctx->vals_s_mount_flags;
sb->s_flags &= ~ctx->mask_s_flags;
sb->s_flags |= ctx->vals_s_flags;
@@ -4232,7 +4238,7 @@ int ext4_calculate_overhead(struct super_block *sb)
else if (ext4_has_feature_journal(sb) && !sbi->s_journal && j_inum) {
/* j_inum for internal journal is non-zero */
j_inode = ext4_get_journal_inode(sb, j_inum);
- if (j_inode) {
+ if (!IS_ERR(j_inode)) {
j_blocks = j_inode->i_size >> sb->s_blocksize_bits;
overhead += EXT4_NUM_B2C(sbi, j_blocks);
iput(j_inode);
@@ -4970,8 +4976,8 @@ static int ext4_load_and_init_journal(struct super_block *sb,
return 0;
out:
- /* flush s_error_work before journal destroy. */
- flush_work(&sbi->s_error_work);
+ /* flush s_sb_upd_work before destroying the journal. */
+ flush_work(&sbi->s_sb_upd_work);
jbd2_journal_destroy(sbi->s_journal);
sbi->s_journal = NULL;
return -EINVAL;
@@ -5294,7 +5300,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
timer_setup(&sbi->s_err_report, print_daily_error_info, 0);
spin_lock_init(&sbi->s_error_lock);
- INIT_WORK(&sbi->s_error_work, flush_stashed_error_work);
+ INIT_WORK(&sbi->s_sb_upd_work, update_super_work);
err = ext4_group_desc_init(sb, es, logical_sb_block, &first_not_zeroed);
if (err)
@@ -5572,7 +5578,6 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
spin_lock_init(&sbi->s_bdev_wb_lock);
errseq_check_and_advance(&sb->s_bdev->bd_inode->i_mapping->wb_err,
&sbi->s_bdev_wb_err);
- sb->s_bdev->bd_super = sb;
EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS;
ext4_orphan_cleanup(sb, es);
EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS;
@@ -5638,16 +5643,16 @@ failed_mount_wq:
sbi->s_ea_block_cache = NULL;
if (sbi->s_journal) {
- /* flush s_error_work before journal destroy. */
- flush_work(&sbi->s_error_work);
+ /* flush s_sb_upd_work before journal destroy. */
+ flush_work(&sbi->s_sb_upd_work);
jbd2_journal_destroy(sbi->s_journal);
sbi->s_journal = NULL;
}
failed_mount3a:
ext4_es_unregister_shrinker(sbi);
failed_mount3:
- /* flush s_error_work before sbi destroy */
- flush_work(&sbi->s_error_work);
+ /* flush s_sb_upd_work before sbi destroy */
+ flush_work(&sbi->s_sb_upd_work);
del_timer_sync(&sbi->s_err_report);
ext4_stop_mmpd(sbi);
ext4_group_desc_free(sbi);
@@ -5664,9 +5669,11 @@ failed_mount:
kfree(get_qf_name(sb, sbi, i));
#endif
fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy);
- /* ext4_blkdev_remove() calls kill_bdev(), release bh before it. */
brelse(sbi->s_sbh);
- ext4_blkdev_remove(sbi);
+ if (sbi->s_journal_bdev) {
+ invalidate_bdev(sbi->s_journal_bdev);
+ blkdev_put(sbi->s_journal_bdev, sb);
+ }
out_fail:
invalidate_bdev(sb->s_bdev);
sb->s_fs_info = NULL;
@@ -5772,22 +5779,22 @@ static struct inode *ext4_get_journal_inode(struct super_block *sb,
journal_inode = ext4_iget(sb, journal_inum, EXT4_IGET_SPECIAL);
if (IS_ERR(journal_inode)) {
ext4_msg(sb, KERN_ERR, "no journal found");
- return NULL;
+ return ERR_CAST(journal_inode);
}
if (!journal_inode->i_nlink) {
make_bad_inode(journal_inode);
iput(journal_inode);
ext4_msg(sb, KERN_ERR, "journal inode is deleted");
- return NULL;
+ return ERR_PTR(-EFSCORRUPTED);
}
-
- ext4_debug("Journal inode found at %p: %lld bytes\n",
- journal_inode, journal_inode->i_size);
if (!S_ISREG(journal_inode->i_mode) || IS_ENCRYPTED(journal_inode)) {
ext4_msg(sb, KERN_ERR, "invalid journal inode");
iput(journal_inode);
- return NULL;
+ return ERR_PTR(-EFSCORRUPTED);
}
+
+ ext4_debug("Journal inode found at %p: %lld bytes\n",
+ journal_inode, journal_inode->i_size);
return journal_inode;
}
@@ -5813,24 +5820,21 @@ static int ext4_journal_bmap(journal_t *journal, sector_t *block)
return 0;
}
-static journal_t *ext4_get_journal(struct super_block *sb,
- unsigned int journal_inum)
+static journal_t *ext4_open_inode_journal(struct super_block *sb,
+ unsigned int journal_inum)
{
struct inode *journal_inode;
journal_t *journal;
- if (WARN_ON_ONCE(!ext4_has_feature_journal(sb)))
- return NULL;
-
journal_inode = ext4_get_journal_inode(sb, journal_inum);
- if (!journal_inode)
- return NULL;
+ if (IS_ERR(journal_inode))
+ return ERR_CAST(journal_inode);
journal = jbd2_journal_init_inode(journal_inode);
- if (!journal) {
+ if (IS_ERR(journal)) {
ext4_msg(sb, KERN_ERR, "Could not load journal inode");
iput(journal_inode);
- return NULL;
+ return ERR_CAST(journal);
}
journal->j_private = sb;
journal->j_bmap = ext4_journal_bmap;
@@ -5838,40 +5842,47 @@ static journal_t *ext4_get_journal(struct super_block *sb,
return journal;
}
-static journal_t *ext4_get_dev_journal(struct super_block *sb,
- dev_t j_dev)
+static struct block_device *ext4_get_journal_blkdev(struct super_block *sb,
+ dev_t j_dev, ext4_fsblk_t *j_start,
+ ext4_fsblk_t *j_len)
{
struct buffer_head *bh;
- journal_t *journal;
- ext4_fsblk_t start;
- ext4_fsblk_t len;
+ struct block_device *bdev;
int hblock, blocksize;
ext4_fsblk_t sb_block;
unsigned long offset;
struct ext4_super_block *es;
- struct block_device *bdev;
-
- if (WARN_ON_ONCE(!ext4_has_feature_journal(sb)))
- return NULL;
+ int errno;
- bdev = ext4_blkdev_get(j_dev, sb);
- if (bdev == NULL)
- return NULL;
+ /* see get_tree_bdev why this is needed and safe */
+ up_write(&sb->s_umount);
+ bdev = blkdev_get_by_dev(j_dev, BLK_OPEN_READ | BLK_OPEN_WRITE, sb,
+ &fs_holder_ops);
+ down_write(&sb->s_umount);
+ if (IS_ERR(bdev)) {
+ ext4_msg(sb, KERN_ERR,
+ "failed to open journal device unknown-block(%u,%u) %ld",
+ MAJOR(j_dev), MINOR(j_dev), PTR_ERR(bdev));
+ return ERR_CAST(bdev);
+ }
blocksize = sb->s_blocksize;
hblock = bdev_logical_block_size(bdev);
if (blocksize < hblock) {
ext4_msg(sb, KERN_ERR,
"blocksize too small for journal device");
+ errno = -EINVAL;
goto out_bdev;
}
sb_block = EXT4_MIN_BLOCK_SIZE / blocksize;
offset = EXT4_MIN_BLOCK_SIZE % blocksize;
set_blocksize(bdev, blocksize);
- if (!(bh = __bread(bdev, sb_block, blocksize))) {
+ bh = __bread(bdev, sb_block, blocksize);
+ if (!bh) {
ext4_msg(sb, KERN_ERR, "couldn't read superblock of "
"external journal");
+ errno = -EINVAL;
goto out_bdev;
}
@@ -5879,57 +5890,74 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
if ((le16_to_cpu(es->s_magic) != EXT4_SUPER_MAGIC) ||
!(le32_to_cpu(es->s_feature_incompat) &
EXT4_FEATURE_INCOMPAT_JOURNAL_DEV)) {
- ext4_msg(sb, KERN_ERR, "external journal has "
- "bad superblock");
- brelse(bh);
- goto out_bdev;
+ ext4_msg(sb, KERN_ERR, "external journal has bad superblock");
+ errno = -EFSCORRUPTED;
+ goto out_bh;
}
if ((le32_to_cpu(es->s_feature_ro_compat) &
EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) &&
es->s_checksum != ext4_superblock_csum(sb, es)) {
- ext4_msg(sb, KERN_ERR, "external journal has "
- "corrupt superblock");
- brelse(bh);
- goto out_bdev;
+ ext4_msg(sb, KERN_ERR, "external journal has corrupt superblock");
+ errno = -EFSCORRUPTED;
+ goto out_bh;
}
if (memcmp(EXT4_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) {
ext4_msg(sb, KERN_ERR, "journal UUID does not match");
- brelse(bh);
- goto out_bdev;
+ errno = -EFSCORRUPTED;
+ goto out_bh;
}
- len = ext4_blocks_count(es);
- start = sb_block + 1;
- brelse(bh); /* we're done with the superblock */
+ *j_start = sb_block + 1;
+ *j_len = ext4_blocks_count(es);
+ brelse(bh);
+ return bdev;
+
+out_bh:
+ brelse(bh);
+out_bdev:
+ blkdev_put(bdev, sb);
+ return ERR_PTR(errno);
+}
- journal = jbd2_journal_init_dev(bdev, sb->s_bdev,
- start, len, blocksize);
- if (!journal) {
+static journal_t *ext4_open_dev_journal(struct super_block *sb,
+ dev_t j_dev)
+{
+ journal_t *journal;
+ ext4_fsblk_t j_start;
+ ext4_fsblk_t j_len;
+ struct block_device *journal_bdev;
+ int errno = 0;
+
+ journal_bdev = ext4_get_journal_blkdev(sb, j_dev, &j_start, &j_len);
+ if (IS_ERR(journal_bdev))
+ return ERR_CAST(journal_bdev);
+
+ journal = jbd2_journal_init_dev(journal_bdev, sb->s_bdev, j_start,
+ j_len, sb->s_blocksize);
+ if (IS_ERR(journal)) {
ext4_msg(sb, KERN_ERR, "failed to create device journal");
+ errno = PTR_ERR(journal);
goto out_bdev;
}
- journal->j_private = sb;
- if (ext4_read_bh_lock(journal->j_sb_buffer, REQ_META | REQ_PRIO, true)) {
- ext4_msg(sb, KERN_ERR, "I/O error on journal device");
- goto out_journal;
- }
if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) {
ext4_msg(sb, KERN_ERR, "External journal has more than one "
"user (unsupported) - %d",
be32_to_cpu(journal->j_superblock->s_nr_users));
+ errno = -EINVAL;
goto out_journal;
}
- EXT4_SB(sb)->s_journal_bdev = bdev;
+ journal->j_private = sb;
+ EXT4_SB(sb)->s_journal_bdev = journal_bdev;
ext4_init_journal_params(sb, journal);
return journal;
out_journal:
jbd2_journal_destroy(journal);
out_bdev:
- blkdev_put(bdev, sb);
- return NULL;
+ blkdev_put(journal_bdev, sb);
+ return ERR_PTR(errno);
}
static int ext4_load_journal(struct super_block *sb,
@@ -5961,13 +5989,13 @@ static int ext4_load_journal(struct super_block *sb,
}
if (journal_inum) {
- journal = ext4_get_journal(sb, journal_inum);
- if (!journal)
- return -EINVAL;
+ journal = ext4_open_inode_journal(sb, journal_inum);
+ if (IS_ERR(journal))
+ return PTR_ERR(journal);
} else {
- journal = ext4_get_dev_journal(sb, journal_dev);
- if (!journal)
- return -EINVAL;
+ journal = ext4_open_dev_journal(sb, journal_dev);
+ if (IS_ERR(journal))
+ return PTR_ERR(journal);
}
journal_dev_ro = bdev_read_only(journal->j_dev);
@@ -6084,7 +6112,7 @@ static void ext4_update_super(struct super_block *sb)
* the clock is set in the future, and this will cause e2fsck
* to complain and force a full file system check.
*/
- if (!(sb->s_flags & SB_RDONLY))
+ if (!sb_rdonly(sb))
ext4_update_tstamp(es, s_wtime);
es->s_kbytes_written =
cpu_to_le64(sbi->s_kbytes_written +
@@ -6282,13 +6310,7 @@ static int ext4_clear_journal_err(struct super_block *sb,
*/
int ext4_force_commit(struct super_block *sb)
{
- journal_t *journal;
-
- if (sb_rdonly(sb))
- return 0;
-
- journal = EXT4_SB(sb)->s_journal;
- return ext4_journal_force_commit(journal);
+ return ext4_journal_force_commit(EXT4_SB(sb)->s_journal);
}
static int ext4_sync_fs(struct super_block *sb, int wait)
@@ -6298,7 +6320,7 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
bool needs_barrier = false;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- if (unlikely(ext4_forced_shutdown(sbi)))
+ if (unlikely(ext4_forced_shutdown(sb)))
return 0;
trace_ext4_sync_fs(sb, wait);
@@ -6347,12 +6369,7 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
static int ext4_freeze(struct super_block *sb)
{
int error = 0;
- journal_t *journal;
-
- if (sb_rdonly(sb))
- return 0;
-
- journal = EXT4_SB(sb)->s_journal;
+ journal_t *journal = EXT4_SB(sb)->s_journal;
if (journal) {
/* Now we set up the journal barrier. */
@@ -6386,7 +6403,7 @@ out:
*/
static int ext4_unfreeze(struct super_block *sb)
{
- if (sb_rdonly(sb) || ext4_forced_shutdown(EXT4_SB(sb)))
+ if (ext4_forced_shutdown(sb))
return 0;
if (EXT4_SB(sb)->s_journal) {
@@ -6502,7 +6519,7 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb)
goto restore_opts;
}
- if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED))
+ if (test_opt2(sb, ABORT))
ext4_abort(sb, ESHUTDOWN, "Abort forced by user");
sb->s_flags = (sb->s_flags & ~SB_POSIXACL) |
@@ -6516,10 +6533,10 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb)
}
/* Flush outstanding errors before changing fs state */
- flush_work(&sbi->s_error_work);
+ flush_work(&sbi->s_sb_upd_work);
if ((bool)(fc->sb_flags & SB_RDONLY) != sb_rdonly(sb)) {
- if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED)) {
+ if (ext4_forced_shutdown(sb)) {
err = -EROFS;
goto restore_opts;
}
@@ -6680,7 +6697,7 @@ restore_opts:
* If there was a failing r/w to ro transition, we may need to
* re-enable quota
*/
- if ((sb->s_flags & SB_RDONLY) && !(old_sb_flags & SB_RDONLY) &&
+ if (sb_rdonly(sb) && !(old_sb_flags & SB_RDONLY) &&
sb_any_quota_suspended(sb))
dquot_resume(sb, -1);
sb->s_flags = old_sb_flags;
@@ -7089,6 +7106,13 @@ static int ext4_quota_off(struct super_block *sb, int type)
err = dquot_quota_off(sb, type);
if (err || ext4_has_feature_quota(sb))
goto out_put;
+ /*
+ * When the filesystem was remounted read-only first, we cannot cleanup
+ * inode flags here. Bad luck but people should be using QUOTA feature
+ * these days anyway.
+ */
+ if (sb_rdonly(sb))
+ goto out_put;
inode_lock(inode);
/*
@@ -7103,7 +7127,7 @@ static int ext4_quota_off(struct super_block *sb, int type)
}
EXT4_I(inode)->i_flags &= ~(EXT4_NOATIME_FL | EXT4_IMMUTABLE_FL);
inode_set_flags(inode, 0, S_NOATIME | S_IMMUTABLE);
- inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
err = ext4_mark_inode_dirty(handle, inode);
ext4_journal_stop(handle);
out_unlock:
@@ -7273,12 +7297,23 @@ static inline int ext3_feature_set_ok(struct super_block *sb)
return 1;
}
+static void ext4_kill_sb(struct super_block *sb)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct block_device *journal_bdev = sbi ? sbi->s_journal_bdev : NULL;
+
+ kill_block_super(sb);
+
+ if (journal_bdev)
+ blkdev_put(journal_bdev, sb);
+}
+
static struct file_system_type ext4_fs_type = {
.owner = THIS_MODULE,
.name = "ext4",
.init_fs_context = ext4_init_fs_context,
.parameters = ext4_param_specs,
- .kill_sb = kill_block_super,
+ .kill_sb = ext4_kill_sb,
.fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
};
MODULE_ALIAS_FS("ext4");
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 05151d61b00b..92ba28cebac6 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -356,13 +356,13 @@ ext4_xattr_inode_hash(struct ext4_sb_info *sbi, const void *buffer, size_t size)
static u64 ext4_xattr_inode_get_ref(struct inode *ea_inode)
{
- return ((u64)ea_inode->i_ctime.tv_sec << 32) |
+ return ((u64) inode_get_ctime(ea_inode).tv_sec << 32) |
(u32) inode_peek_iversion_raw(ea_inode);
}
static void ext4_xattr_inode_set_ref(struct inode *ea_inode, u64 ref_count)
{
- ea_inode->i_ctime.tv_sec = (u32)(ref_count >> 32);
+ inode_set_ctime(ea_inode, (u32)(ref_count >> 32), 0);
inode_set_iversion_raw(ea_inode, ref_count & 0xffffffff);
}
@@ -701,7 +701,7 @@ ext4_xattr_get(struct inode *inode, int name_index, const char *name,
{
int error;
- if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
+ if (unlikely(ext4_forced_shutdown(inode->i_sb)))
return -EIO;
if (strlen(name) > 255)
@@ -2473,7 +2473,7 @@ retry_inode:
}
if (!error) {
ext4_xattr_update_super_block(handle, inode->i_sb);
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
inode_inc_iversion(inode);
if (!value)
no_expand = 0;
diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig
index 03ef087537c7..68a1e23e1557 100644
--- a/fs/f2fs/Kconfig
+++ b/fs/f2fs/Kconfig
@@ -2,6 +2,7 @@
config F2FS_FS
tristate "F2FS filesystem support"
depends on BLOCK
+ select BUFFER_HEAD
select NLS
select CRYPTO
select CRYPTO_CRC32
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 8fd3b7f9fb88..b0597a539fc5 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -1701,9 +1701,9 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
}
f2fs_restore_inmem_curseg(sbi);
+ stat_inc_cp_count(sbi);
stop:
unblock_operations(sbi);
- stat_inc_cp_count(sbi->stat_info);
if (cpc->reason & CP_RECOVERY)
f2fs_notice(sbi, "checkpoint: version = %llx", ckpt_ver);
diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c
index 236d890f560b..d820801f473e 100644
--- a/fs/f2fs/compress.c
+++ b/fs/f2fs/compress.c
@@ -649,13 +649,8 @@ static int f2fs_compress_pages(struct compress_ctx *cc)
goto destroy_compress_ctx;
}
- for (i = 0; i < cc->nr_cpages; i++) {
+ for (i = 0; i < cc->nr_cpages; i++)
cc->cpages[i] = f2fs_compress_alloc_page();
- if (!cc->cpages[i]) {
- ret = -ENOMEM;
- goto out_free_cpages;
- }
- }
cc->rbuf = f2fs_vmap(cc->rpages, cc->cluster_size);
if (!cc->rbuf) {
@@ -1045,7 +1040,7 @@ static int prepare_compress_overwrite(struct compress_ctx *cc,
struct address_space *mapping = cc->inode->i_mapping;
struct page *page;
sector_t last_block_in_bio;
- unsigned fgp_flag = FGP_LOCK | FGP_WRITE | FGP_CREAT;
+ fgf_t fgp_flag = FGP_LOCK | FGP_WRITE | FGP_CREAT;
pgoff_t start_idx = start_idx_of_cluster(cc);
int i, ret;
@@ -1574,8 +1569,6 @@ static int f2fs_prepare_decomp_mem(struct decompress_io_ctx *dic,
}
dic->tpages[i] = f2fs_compress_alloc_page();
- if (!dic->tpages[i])
- return -ENOMEM;
}
dic->rbuf = f2fs_vmap(dic->tpages, dic->cluster_size);
@@ -1656,11 +1649,6 @@ struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc)
struct page *page;
page = f2fs_compress_alloc_page();
- if (!page) {
- ret = -ENOMEM;
- goto out_free;
- }
-
f2fs_set_compressed_page(page, cc->inode,
start_idx + i + 1, dic);
dic->cpages[i] = page;
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 5882afe71d82..916e317ac925 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -1167,6 +1167,9 @@ static int f2fs_submit_page_read(struct inode *inode, struct page *page,
f2fs_wait_on_block_writeback(inode, blkaddr);
if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
+ iostat_update_and_unbind_ctx(bio);
+ if (bio->bi_private)
+ mempool_free(bio->bi_private, bio_post_read_ctx_pool);
bio_put(bio);
return -EFAULT;
}
@@ -1389,18 +1392,14 @@ struct page *f2fs_get_lock_data_page(struct inode *inode, pgoff_t index,
{
struct address_space *mapping = inode->i_mapping;
struct page *page;
-repeat:
+
page = f2fs_get_read_data_page(inode, index, 0, for_write, NULL);
if (IS_ERR(page))
return page;
/* wait for read completion */
lock_page(page);
- if (unlikely(page->mapping != mapping)) {
- f2fs_put_page(page, 1);
- goto repeat;
- }
- if (unlikely(!PageUptodate(page))) {
+ if (unlikely(page->mapping != mapping || !PageUptodate(page))) {
f2fs_put_page(page, 1);
return ERR_PTR(-EIO);
}
@@ -3236,8 +3235,7 @@ result:
}
goto next;
}
- done_index = folio->index +
- folio_nr_pages(folio);
+ done_index = folio_next_index(folio);
done = 1;
break;
}
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index 61c35b59126e..fdbf994f1271 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -215,6 +215,9 @@ static void update_general_status(struct f2fs_sb_info *sbi)
si->valid_blks[type] += blks;
}
+ for (i = 0; i < MAX_CALL_TYPE; i++)
+ si->cp_call_count[i] = atomic_read(&sbi->cp_call_count[i]);
+
for (i = 0; i < 2; i++) {
si->segment_count[i] = sbi->segment_count[i];
si->block_count[i] = sbi->block_count[i];
@@ -497,7 +500,9 @@ static int stat_show(struct seq_file *s, void *v)
seq_printf(s, " - Prefree: %d\n - Free: %d (%d)\n\n",
si->prefree_count, si->free_segs, si->free_secs);
seq_printf(s, "CP calls: %d (BG: %d)\n",
- si->cp_count, si->bg_cp_count);
+ si->cp_call_count[TOTAL_CALL],
+ si->cp_call_count[BACKGROUND]);
+ seq_printf(s, "CP count: %d\n", si->cp_count);
seq_printf(s, " - cp blocks : %u\n", si->meta_count[META_CP]);
seq_printf(s, " - sit blocks : %u\n",
si->meta_count[META_SIT]);
@@ -511,12 +516,24 @@ static int stat_show(struct seq_file *s, void *v)
seq_printf(s, " - Total : %4d\n", si->nr_total_ckpt);
seq_printf(s, " - Cur time : %4d(ms)\n", si->cur_ckpt_time);
seq_printf(s, " - Peak time : %4d(ms)\n", si->peak_ckpt_time);
- seq_printf(s, "GC calls: %d (BG: %d)\n",
- si->call_count, si->bg_gc);
- seq_printf(s, " - data segments : %d (%d)\n",
- si->data_segs, si->bg_data_segs);
- seq_printf(s, " - node segments : %d (%d)\n",
- si->node_segs, si->bg_node_segs);
+ seq_printf(s, "GC calls: %d (gc_thread: %d)\n",
+ si->gc_call_count[BACKGROUND] +
+ si->gc_call_count[FOREGROUND],
+ si->gc_call_count[BACKGROUND]);
+ if (__is_large_section(sbi)) {
+ seq_printf(s, " - data sections : %d (BG: %d)\n",
+ si->gc_secs[DATA][BG_GC] + si->gc_secs[DATA][FG_GC],
+ si->gc_secs[DATA][BG_GC]);
+ seq_printf(s, " - node sections : %d (BG: %d)\n",
+ si->gc_secs[NODE][BG_GC] + si->gc_secs[NODE][FG_GC],
+ si->gc_secs[NODE][BG_GC]);
+ }
+ seq_printf(s, " - data segments : %d (BG: %d)\n",
+ si->gc_segs[DATA][BG_GC] + si->gc_segs[DATA][FG_GC],
+ si->gc_segs[DATA][BG_GC]);
+ seq_printf(s, " - node segments : %d (BG: %d)\n",
+ si->gc_segs[NODE][BG_GC] + si->gc_segs[NODE][FG_GC],
+ si->gc_segs[NODE][BG_GC]);
seq_puts(s, " - Reclaimed segs :\n");
seq_printf(s, " - Normal : %d\n", sbi->gc_reclaimed_segs[GC_NORMAL]);
seq_printf(s, " - Idle CB : %d\n", sbi->gc_reclaimed_segs[GC_IDLE_CB]);
@@ -687,6 +704,8 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi)
atomic_set(&sbi->inplace_count, 0);
for (i = META_CP; i < META_MAX; i++)
atomic_set(&sbi->meta_count[i], 0);
+ for (i = 0; i < MAX_CALL_TYPE; i++)
+ atomic_set(&sbi->cp_call_count[i], 0);
atomic_set(&sbi->max_aw_cnt, 0);
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index d635c58cf5a3..8aa29fe2e87b 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -455,7 +455,7 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de,
de->file_type = fs_umode_to_ftype(inode->i_mode);
set_page_dirty(page);
- dir->i_mtime = dir->i_ctime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
f2fs_mark_inode_dirty_sync(dir, false);
f2fs_put_page(page, 1);
}
@@ -609,7 +609,7 @@ void f2fs_update_parent_metadata(struct inode *dir, struct inode *inode,
f2fs_i_links_write(dir, true);
clear_inode_flag(inode, FI_NEW_INODE);
}
- dir->i_mtime = dir->i_ctime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
f2fs_mark_inode_dirty_sync(dir, false);
if (F2FS_I(dir)->i_current_depth != current_depth)
@@ -858,7 +858,7 @@ void f2fs_drop_nlink(struct inode *dir, struct inode *inode)
if (S_ISDIR(inode->i_mode))
f2fs_i_links_write(dir, false);
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
f2fs_i_links_write(inode, false);
if (S_ISDIR(inode->i_mode)) {
@@ -919,7 +919,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
}
f2fs_put_page(page, 1);
- dir->i_ctime = dir->i_mtime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
f2fs_mark_inode_dirty_sync(dir, false);
if (inode)
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index c7cb2177b252..6d688e42d89c 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -1383,6 +1383,13 @@ enum errors_option {
MOUNT_ERRORS_PANIC, /* panic on errors */
};
+enum {
+ BACKGROUND,
+ FOREGROUND,
+ MAX_CALL_TYPE,
+ TOTAL_CALL = FOREGROUND,
+};
+
static inline int f2fs_test_bit(unsigned int nr, char *addr);
static inline void f2fs_set_bit(unsigned int nr, char *addr);
static inline void f2fs_clear_bit(unsigned int nr, char *addr);
@@ -1695,6 +1702,7 @@ struct f2fs_sb_info {
unsigned int io_skip_bggc; /* skip background gc for in-flight IO */
unsigned int other_skip_bggc; /* skip background gc for other reasons */
unsigned int ndirty_inode[NR_INODE_TYPE]; /* # of dirty inodes */
+ atomic_t cp_call_count[MAX_CALL_TYPE]; /* # of cp call */
#endif
spinlock_t stat_lock; /* lock for stat operations */
@@ -2114,15 +2122,6 @@ static inline int f2fs_down_read_trylock(struct f2fs_rwsem *sem)
return down_read_trylock(&sem->internal_rwsem);
}
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-static inline void f2fs_down_read_nested(struct f2fs_rwsem *sem, int subclass)
-{
- down_read_nested(&sem->internal_rwsem, subclass);
-}
-#else
-#define f2fs_down_read_nested(sem, subclass) f2fs_down_read(sem)
-#endif
-
static inline void f2fs_up_read(struct f2fs_rwsem *sem)
{
up_read(&sem->internal_rwsem);
@@ -2133,6 +2132,21 @@ static inline void f2fs_down_write(struct f2fs_rwsem *sem)
down_write(&sem->internal_rwsem);
}
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+static inline void f2fs_down_read_nested(struct f2fs_rwsem *sem, int subclass)
+{
+ down_read_nested(&sem->internal_rwsem, subclass);
+}
+
+static inline void f2fs_down_write_nested(struct f2fs_rwsem *sem, int subclass)
+{
+ down_write_nested(&sem->internal_rwsem, subclass);
+}
+#else
+#define f2fs_down_read_nested(sem, subclass) f2fs_down_read(sem)
+#define f2fs_down_write_nested(sem, subclass) f2fs_down_write(sem)
+#endif
+
static inline int f2fs_down_write_trylock(struct f2fs_rwsem *sem)
{
return down_write_trylock(&sem->internal_rwsem);
@@ -2736,7 +2750,7 @@ static inline struct page *f2fs_grab_cache_page(struct address_space *mapping,
static inline struct page *f2fs_pagecache_get_page(
struct address_space *mapping, pgoff_t index,
- int fgp_flags, gfp_t gfp_mask)
+ fgf_t fgp_flags, gfp_t gfp_mask)
{
if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_GET))
return NULL;
@@ -3303,9 +3317,11 @@ static inline void clear_file(struct inode *inode, int type)
static inline bool f2fs_is_time_consistent(struct inode *inode)
{
+ struct timespec64 ctime = inode_get_ctime(inode);
+
if (!timespec64_equal(F2FS_I(inode)->i_disk_time, &inode->i_atime))
return false;
- if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 1, &inode->i_ctime))
+ if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 1, &ctime))
return false;
if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 2, &inode->i_mtime))
return false;
@@ -3885,7 +3901,7 @@ struct f2fs_stat_info {
int nats, dirty_nats, sits, dirty_sits;
int free_nids, avail_nids, alloc_nids;
int total_count, utilization;
- int bg_gc, nr_wb_cp_data, nr_wb_data;
+ int nr_wb_cp_data, nr_wb_data;
int nr_rd_data, nr_rd_node, nr_rd_meta;
int nr_dio_read, nr_dio_write;
unsigned int io_skip_bggc, other_skip_bggc;
@@ -3905,9 +3921,11 @@ struct f2fs_stat_info {
int rsvd_segs, overp_segs;
int dirty_count, node_pages, meta_pages, compress_pages;
int compress_page_hit;
- int prefree_count, call_count, cp_count, bg_cp_count;
- int tot_segs, node_segs, data_segs, free_segs, free_secs;
- int bg_node_segs, bg_data_segs;
+ int prefree_count, free_segs, free_secs;
+ int cp_call_count[MAX_CALL_TYPE], cp_count;
+ int gc_call_count[MAX_CALL_TYPE];
+ int gc_segs[2][2];
+ int gc_secs[2][2];
int tot_blks, data_blks, node_blks;
int bg_data_blks, bg_node_blks;
int curseg[NR_CURSEG_TYPE];
@@ -3929,10 +3947,9 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
return (struct f2fs_stat_info *)sbi->stat_info;
}
-#define stat_inc_cp_count(si) ((si)->cp_count++)
-#define stat_inc_bg_cp_count(si) ((si)->bg_cp_count++)
-#define stat_inc_call_count(si) ((si)->call_count++)
-#define stat_inc_bggc_count(si) ((si)->bg_gc++)
+#define stat_inc_cp_call_count(sbi, foreground) \
+ atomic_inc(&sbi->cp_call_count[(foreground)])
+#define stat_inc_cp_count(si) (F2FS_STAT(sbi)->cp_count++)
#define stat_io_skip_bggc_count(sbi) ((sbi)->io_skip_bggc++)
#define stat_other_skip_bggc_count(sbi) ((sbi)->other_skip_bggc++)
#define stat_inc_dirty_inode(sbi, type) ((sbi)->ndirty_inode[type]++)
@@ -4017,18 +4034,12 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
if (cur > max) \
atomic_set(&F2FS_I_SB(inode)->max_aw_cnt, cur); \
} while (0)
-#define stat_inc_seg_count(sbi, type, gc_type) \
- do { \
- struct f2fs_stat_info *si = F2FS_STAT(sbi); \
- si->tot_segs++; \
- if ((type) == SUM_TYPE_DATA) { \
- si->data_segs++; \
- si->bg_data_segs += (gc_type == BG_GC) ? 1 : 0; \
- } else { \
- si->node_segs++; \
- si->bg_node_segs += (gc_type == BG_GC) ? 1 : 0; \
- } \
- } while (0)
+#define stat_inc_gc_call_count(sbi, foreground) \
+ (F2FS_STAT(sbi)->gc_call_count[(foreground)]++)
+#define stat_inc_gc_sec_count(sbi, type, gc_type) \
+ (F2FS_STAT(sbi)->gc_secs[(type)][(gc_type)]++)
+#define stat_inc_gc_seg_count(sbi, type, gc_type) \
+ (F2FS_STAT(sbi)->gc_segs[(type)][(gc_type)]++)
#define stat_inc_tot_blk_count(si, blks) \
((si)->tot_blks += (blks))
@@ -4055,10 +4066,8 @@ void __init f2fs_create_root_stats(void);
void f2fs_destroy_root_stats(void);
void f2fs_update_sit_info(struct f2fs_sb_info *sbi);
#else
-#define stat_inc_cp_count(si) do { } while (0)
-#define stat_inc_bg_cp_count(si) do { } while (0)
-#define stat_inc_call_count(si) do { } while (0)
-#define stat_inc_bggc_count(si) do { } while (0)
+#define stat_inc_cp_call_count(sbi, foreground) do { } while (0)
+#define stat_inc_cp_count(sbi) do { } while (0)
#define stat_io_skip_bggc_count(sbi) do { } while (0)
#define stat_other_skip_bggc_count(sbi) do { } while (0)
#define stat_inc_dirty_inode(sbi, type) do { } while (0)
@@ -4086,7 +4095,9 @@ void f2fs_update_sit_info(struct f2fs_sb_info *sbi);
#define stat_inc_seg_type(sbi, curseg) do { } while (0)
#define stat_inc_block_count(sbi, curseg) do { } while (0)
#define stat_inc_inplace_blocks(sbi) do { } while (0)
-#define stat_inc_seg_count(sbi, type, gc_type) do { } while (0)
+#define stat_inc_gc_call_count(sbi, foreground) do { } while (0)
+#define stat_inc_gc_sec_count(sbi, type, gc_type) do { } while (0)
+#define stat_inc_gc_seg_count(sbi, type, gc_type) do { } while (0)
#define stat_inc_tot_blk_count(si, blks) do { } while (0)
#define stat_inc_data_blk_count(sbi, blks, gc_type) do { } while (0)
#define stat_inc_node_blk_count(sbi, blks, gc_type) do { } while (0)
@@ -4423,6 +4434,22 @@ static inline bool f2fs_blkz_is_seq(struct f2fs_sb_info *sbi, int devi,
}
#endif
+static inline int f2fs_bdev_index(struct f2fs_sb_info *sbi,
+ struct block_device *bdev)
+{
+ int i;
+
+ if (!f2fs_is_multi_device(sbi))
+ return 0;
+
+ for (i = 0; i < sbi->s_ndevs; i++)
+ if (FDEV(i).bdev == bdev)
+ return i;
+
+ WARN_ON(1);
+ return -1;
+}
+
static inline bool f2fs_hw_should_discard(struct f2fs_sb_info *sbi)
{
return f2fs_sb_has_blkzoned(sbi);
@@ -4483,7 +4510,8 @@ static inline bool f2fs_low_mem_mode(struct f2fs_sb_info *sbi)
static inline bool f2fs_may_compress(struct inode *inode)
{
if (IS_SWAPFILE(inode) || f2fs_is_pinned_file(inode) ||
- f2fs_is_atomic_file(inode) || f2fs_has_inline_data(inode))
+ f2fs_is_atomic_file(inode) || f2fs_has_inline_data(inode) ||
+ f2fs_is_mmap_file(inode))
return false;
return S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode);
}
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 093039dee992..ca5904129b16 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -159,7 +159,7 @@ out_sem:
sb_end_pagefault(inode->i_sb);
err:
- return block_page_mkwrite_return(err);
+ return vmf_fs_error(err);
}
static const struct vm_operations_struct f2fs_file_vm_ops = {
@@ -526,7 +526,11 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma)
file_accessed(file);
vma->vm_ops = &f2fs_file_vm_ops;
+
+ f2fs_down_read(&F2FS_I(inode)->i_sem);
set_inode_flag(inode, FI_MMAP_FILE);
+ f2fs_up_read(&F2FS_I(inode)->i_sem);
+
return 0;
}
@@ -794,7 +798,7 @@ int f2fs_truncate(struct inode *inode)
if (err)
return err;
- inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
f2fs_mark_inode_dirty_sync(inode, false);
return 0;
}
@@ -882,7 +886,7 @@ int f2fs_getattr(struct mnt_idmap *idmap, const struct path *path,
STATX_ATTR_NODUMP |
STATX_ATTR_VERITY);
- generic_fillattr(idmap, inode, stat);
+ generic_fillattr(idmap, request_mask, inode, stat);
/* we need to show initial sectors used for inline_data/dentries */
if ((S_ISREG(inode->i_mode) && f2fs_has_inline_data(inode)) ||
@@ -905,7 +909,7 @@ static void __setattr_copy(struct mnt_idmap *idmap,
if (ia_valid & ATTR_MTIME)
inode->i_mtime = attr->ia_mtime;
if (ia_valid & ATTR_CTIME)
- inode->i_ctime = attr->ia_ctime;
+ inode_set_ctime_to_ts(inode, attr->ia_ctime);
if (ia_valid & ATTR_MODE) {
umode_t mode = attr->ia_mode;
vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, inode);
@@ -1008,7 +1012,7 @@ int f2fs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
return err;
spin_lock(&F2FS_I(inode)->i_size_lock);
- inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
F2FS_I(inode)->last_disk_size = i_size_read(inode);
spin_unlock(&F2FS_I(inode)->i_size_lock);
}
@@ -1724,6 +1728,7 @@ next_alloc:
if (has_not_enough_free_secs(sbi, 0,
GET_SEC_FROM_SEG(sbi, overprovision_segments(sbi)))) {
f2fs_down_write(&sbi->gc_lock);
+ stat_inc_gc_call_count(sbi, FOREGROUND);
err = f2fs_gc(sbi, &gc_control);
if (err && err != -ENODATA)
goto out_err;
@@ -1835,7 +1840,7 @@ static long f2fs_fallocate(struct file *file, int mode,
}
if (!ret) {
- inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
f2fs_mark_inode_dirty_sync(inode, false);
f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
}
@@ -1919,12 +1924,19 @@ static int f2fs_setflags_common(struct inode *inode, u32 iflags, u32 mask)
int err = f2fs_convert_inline_inode(inode);
if (err)
return err;
- if (!f2fs_may_compress(inode))
- return -EINVAL;
- if (S_ISREG(inode->i_mode) && F2FS_HAS_BLOCKS(inode))
+
+ f2fs_down_write(&F2FS_I(inode)->i_sem);
+ if (!f2fs_may_compress(inode) ||
+ (S_ISREG(inode->i_mode) &&
+ F2FS_HAS_BLOCKS(inode))) {
+ f2fs_up_write(&F2FS_I(inode)->i_sem);
return -EINVAL;
- if (set_compress_context(inode))
- return -EOPNOTSUPP;
+ }
+ err = set_compress_context(inode);
+ f2fs_up_write(&F2FS_I(inode)->i_sem);
+
+ if (err)
+ return err;
}
}
@@ -1937,7 +1949,7 @@ static int f2fs_setflags_common(struct inode *inode, u32 iflags, u32 mask)
else
clear_inode_flag(inode, FI_PROJ_INHERIT);
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
f2fs_set_inode_flags(inode);
f2fs_mark_inode_dirty_sync(inode, true);
return 0;
@@ -2465,6 +2477,7 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg)
gc_control.init_gc_type = sync ? FG_GC : BG_GC;
gc_control.err_gc_skipped = sync;
+ stat_inc_gc_call_count(sbi, FOREGROUND);
ret = f2fs_gc(sbi, &gc_control);
out:
mnt_drop_write_file(filp);
@@ -2508,6 +2521,7 @@ do_more:
}
gc_control.victim_segno = GET_SEGNO(sbi, range->start);
+ stat_inc_gc_call_count(sbi, FOREGROUND);
ret = f2fs_gc(sbi, &gc_control);
if (ret) {
if (ret == -EBUSY)
@@ -2874,10 +2888,10 @@ out_src:
if (ret)
goto out_unlock;
- src->i_mtime = src->i_ctime = current_time(src);
+ src->i_mtime = inode_set_ctime_current(src);
f2fs_mark_inode_dirty_sync(src, false);
if (src != dst) {
- dst->i_mtime = dst->i_ctime = current_time(dst);
+ dst->i_mtime = inode_set_ctime_current(dst);
f2fs_mark_inode_dirty_sync(dst, false);
}
f2fs_update_time(sbi, REQ_TIME);
@@ -2990,6 +3004,7 @@ static int f2fs_ioc_flush_device(struct file *filp, unsigned long arg)
sm->last_victim[ALLOC_NEXT] = end_segno + 1;
gc_control.victim_segno = start_segno;
+ stat_inc_gc_call_count(sbi, FOREGROUND);
ret = f2fs_gc(sbi, &gc_control);
if (ret == -EAGAIN)
ret = 0;
@@ -3073,7 +3088,7 @@ static int f2fs_ioc_setproject(struct inode *inode, __u32 projid)
goto out_unlock;
fi->i_projid = kprojid;
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
f2fs_mark_inode_dirty_sync(inode, true);
out_unlock:
f2fs_unlock_op(sbi);
@@ -3511,7 +3526,7 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg)
}
set_inode_flag(inode, FI_COMPRESS_RELEASED);
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
f2fs_mark_inode_dirty_sync(inode, true);
f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
@@ -3710,7 +3725,7 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg)
if (ret >= 0) {
clear_inode_flag(inode, FI_COMPRESS_RELEASED);
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
f2fs_mark_inode_dirty_sync(inode, true);
}
unlock_inode:
@@ -3976,6 +3991,7 @@ static int f2fs_ioc_set_compress_option(struct file *filp, unsigned long arg)
file_start_write(filp);
inode_lock(inode);
+ f2fs_down_write(&F2FS_I(inode)->i_sem);
if (f2fs_is_mmap_file(inode) || get_dirty_pages(inode)) {
ret = -EBUSY;
goto out;
@@ -3995,6 +4011,7 @@ static int f2fs_ioc_set_compress_option(struct file *filp, unsigned long arg)
f2fs_warn(sbi, "compression algorithm is successfully set, "
"but current kernel doesn't support this algorithm.");
out:
+ f2fs_up_write(&F2FS_I(inode)->i_sem);
inode_unlock(inode);
file_end_write(filp);
@@ -4079,10 +4096,8 @@ static int f2fs_ioc_decompress_file(struct file *filp)
last_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
count = last_idx - page_idx;
- while (count) {
- int len = min(cluster_size, count);
-
- ret = redirty_blocks(inode, page_idx, len);
+ while (count && count >= cluster_size) {
+ ret = redirty_blocks(inode, page_idx, cluster_size);
if (ret < 0)
break;
@@ -4092,8 +4107,14 @@ static int f2fs_ioc_decompress_file(struct file *filp)
break;
}
- count -= len;
- page_idx += len;
+ count -= cluster_size;
+ page_idx += cluster_size;
+
+ cond_resched();
+ if (fatal_signal_pending(current)) {
+ ret = -EINTR;
+ break;
+ }
}
if (!ret)
@@ -4153,10 +4174,8 @@ static int f2fs_ioc_compress_file(struct file *filp)
last_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
count = last_idx - page_idx;
- while (count) {
- int len = min(cluster_size, count);
-
- ret = redirty_blocks(inode, page_idx, len);
+ while (count && count >= cluster_size) {
+ ret = redirty_blocks(inode, page_idx, cluster_size);
if (ret < 0)
break;
@@ -4166,8 +4185,14 @@ static int f2fs_ioc_compress_file(struct file *filp)
break;
}
- count -= len;
- page_idx += len;
+ count -= cluster_size;
+ page_idx += cluster_size;
+
+ cond_resched();
+ if (fatal_signal_pending(current)) {
+ ret = -EINTR;
+ break;
+ }
}
if (!ret)
@@ -4579,6 +4604,7 @@ static int f2fs_dio_write_end_io(struct kiocb *iocb, ssize_t size, int error,
dec_page_count(sbi, F2FS_DIO_WRITE);
if (error)
return error;
+ f2fs_update_time(sbi, REQ_TIME);
f2fs_update_iostat(sbi, NULL, APP_DIRECT_IO, size);
return 0;
}
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index 01effd3fcb6c..f550cdeaa663 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -121,8 +121,8 @@ static int gc_thread_func(void *data)
else
increase_sleep_time(gc_th, &wait_ms);
do_gc:
- if (!foreground)
- stat_inc_bggc_count(sbi->stat_info);
+ stat_inc_gc_call_count(sbi, foreground ?
+ FOREGROUND : BACKGROUND);
sync_mode = F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_SYNC;
@@ -1685,6 +1685,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi,
int seg_freed = 0, migrated = 0;
unsigned char type = IS_DATASEG(get_seg_entry(sbi, segno)->type) ?
SUM_TYPE_DATA : SUM_TYPE_NODE;
+ unsigned char data_type = (type == SUM_TYPE_DATA) ? DATA : NODE;
int submitted = 0;
if (__is_large_section(sbi))
@@ -1766,7 +1767,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi,
segno, gc_type,
force_migrate);
- stat_inc_seg_count(sbi, type, gc_type);
+ stat_inc_gc_seg_count(sbi, data_type, gc_type);
sbi->gc_reclaimed_segs[sbi->gc_mode]++;
migrated++;
@@ -1783,12 +1784,12 @@ skip:
}
if (submitted)
- f2fs_submit_merged_write(sbi,
- (type == SUM_TYPE_NODE) ? NODE : DATA);
+ f2fs_submit_merged_write(sbi, data_type);
blk_finish_plug(&plug);
- stat_inc_call_count(sbi->stat_info);
+ if (migrated)
+ stat_inc_gc_sec_count(sbi, data_type, gc_type);
return seg_freed;
}
@@ -1839,6 +1840,7 @@ gc_more:
* secure free segments which doesn't need fggc any more.
*/
if (prefree_segments(sbi)) {
+ stat_inc_cp_call_count(sbi, TOTAL_CALL);
ret = f2fs_write_checkpoint(sbi, &cpc);
if (ret)
goto stop;
@@ -1887,6 +1889,7 @@ retry:
round++;
if (skipped_round > MAX_SKIP_GC_COUNT &&
skipped_round * 2 >= round) {
+ stat_inc_cp_call_count(sbi, TOTAL_CALL);
ret = f2fs_write_checkpoint(sbi, &cpc);
goto stop;
}
@@ -1902,6 +1905,7 @@ retry:
*/
if (free_sections(sbi) <= upper_secs + NR_GC_CHECKPOINT_SECS &&
prefree_segments(sbi)) {
+ stat_inc_cp_call_count(sbi, TOTAL_CALL);
ret = f2fs_write_checkpoint(sbi, &cpc);
if (ret)
goto stop;
@@ -2029,6 +2033,7 @@ static int free_segment_range(struct f2fs_sb_info *sbi,
if (gc_only)
goto out;
+ stat_inc_cp_call_count(sbi, TOTAL_CALL);
err = f2fs_write_checkpoint(sbi, &cpc);
if (err)
goto out;
@@ -2181,12 +2186,14 @@ out_drop_write:
if (err)
return err;
- err = freeze_super(sbi->sb);
+ err = freeze_super(sbi->sb, FREEZE_HOLDER_USERSPACE);
if (err)
return err;
if (f2fs_readonly(sbi->sb)) {
- thaw_super(sbi->sb);
+ err = thaw_super(sbi->sb, FREEZE_HOLDER_USERSPACE);
+ if (err)
+ return err;
return -EROFS;
}
@@ -2221,6 +2228,7 @@ out_drop_write:
clear_sbi_flag(sbi, SBI_IS_RESIZEFS);
set_sbi_flag(sbi, SBI_IS_DIRTY);
+ stat_inc_cp_call_count(sbi, TOTAL_CALL);
err = f2fs_write_checkpoint(sbi, &cpc);
if (err) {
update_fs_metadata(sbi, secs);
@@ -2240,6 +2248,6 @@ recover_out:
out_err:
f2fs_up_write(&sbi->cp_global_sem);
f2fs_up_write(&sbi->gc_lock);
- thaw_super(sbi->sb);
+ thaw_super(sbi->sb, FREEZE_HOLDER_USERSPACE);
return err;
}
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index 4638fee16a91..2fe25619ccb5 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -641,7 +641,8 @@ int f2fs_add_inline_entry(struct inode *dir, const struct f2fs_filename *fname,
}
if (inode) {
- f2fs_down_write(&F2FS_I(inode)->i_sem);
+ f2fs_down_write_nested(&F2FS_I(inode)->i_sem,
+ SINGLE_DEPTH_NESTING);
page = f2fs_init_inode_metadata(inode, dir, fname, ipage);
if (IS_ERR(page)) {
err = PTR_ERR(page);
@@ -698,7 +699,7 @@ void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page,
set_page_dirty(page);
f2fs_put_page(page, 1);
- dir->i_ctime = dir->i_mtime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
f2fs_mark_inode_dirty_sync(dir, false);
if (inode)
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 09e986b050c6..cde243840abd 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -214,7 +214,7 @@ static bool sanity_check_compress_inode(struct inode *inode,
f2fs_warn(sbi,
"%s: inode (ino=%lx) has unsupported compress algorithm: %u, run fsck to fix",
__func__, inode->i_ino, ri->i_compress_algorithm);
- goto err;
+ return false;
}
if (le64_to_cpu(ri->i_compr_blocks) >
SECTOR_TO_BLOCK(inode->i_blocks)) {
@@ -222,14 +222,14 @@ static bool sanity_check_compress_inode(struct inode *inode,
"%s: inode (ino=%lx) has inconsistent i_compr_blocks:%llu, i_blocks:%llu, run fsck to fix",
__func__, inode->i_ino, le64_to_cpu(ri->i_compr_blocks),
SECTOR_TO_BLOCK(inode->i_blocks));
- goto err;
+ return false;
}
if (ri->i_log_cluster_size < MIN_COMPRESS_LOG_SIZE ||
ri->i_log_cluster_size > MAX_COMPRESS_LOG_SIZE) {
f2fs_warn(sbi,
"%s: inode (ino=%lx) has unsupported log cluster size: %u, run fsck to fix",
__func__, inode->i_ino, ri->i_log_cluster_size);
- goto err;
+ return false;
}
clevel = le16_to_cpu(ri->i_compress_flag) >>
@@ -273,8 +273,6 @@ static bool sanity_check_compress_inode(struct inode *inode,
err_level:
f2fs_warn(sbi, "%s: inode (ino=%lx) has unsupported compress level: %u, run fsck to fix",
__func__, inode->i_ino, clevel);
-err:
- set_sbi_flag(sbi, SBI_NEED_FSCK);
return false;
}
@@ -287,14 +285,12 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page)
iblocks = le64_to_cpu(F2FS_INODE(node_page)->i_blocks);
if (!iblocks) {
- set_sbi_flag(sbi, SBI_NEED_FSCK);
f2fs_warn(sbi, "%s: corrupted inode i_blocks i_ino=%lx iblocks=%llu, run fsck to fix.",
__func__, inode->i_ino, iblocks);
return false;
}
if (ino_of_node(node_page) != nid_of_node(node_page)) {
- set_sbi_flag(sbi, SBI_NEED_FSCK);
f2fs_warn(sbi, "%s: corrupted inode footer i_ino=%lx, ino,nid: [%u, %u] run fsck to fix.",
__func__, inode->i_ino,
ino_of_node(node_page), nid_of_node(node_page));
@@ -303,7 +299,6 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page)
if (f2fs_has_extra_attr(inode)) {
if (!f2fs_sb_has_extra_attr(sbi)) {
- set_sbi_flag(sbi, SBI_NEED_FSCK);
f2fs_warn(sbi, "%s: inode (ino=%lx) is with extra_attr, but extra_attr feature is off",
__func__, inode->i_ino);
return false;
@@ -311,7 +306,6 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page)
if (fi->i_extra_isize > F2FS_TOTAL_EXTRA_ATTR_SIZE ||
fi->i_extra_isize < F2FS_MIN_EXTRA_ATTR_SIZE ||
fi->i_extra_isize % sizeof(__le32)) {
- set_sbi_flag(sbi, SBI_NEED_FSCK);
f2fs_warn(sbi, "%s: inode (ino=%lx) has corrupted i_extra_isize: %d, max: %zu",
__func__, inode->i_ino, fi->i_extra_isize,
F2FS_TOTAL_EXTRA_ATTR_SIZE);
@@ -321,7 +315,6 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page)
f2fs_has_inline_xattr(inode) &&
(!fi->i_inline_xattr_size ||
fi->i_inline_xattr_size > MAX_INLINE_XATTR_SIZE)) {
- set_sbi_flag(sbi, SBI_NEED_FSCK);
f2fs_warn(sbi, "%s: inode (ino=%lx) has corrupted i_inline_xattr_size: %d, max: %zu",
__func__, inode->i_ino, fi->i_inline_xattr_size,
MAX_INLINE_XATTR_SIZE);
@@ -335,7 +328,6 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page)
return false;
}
} else if (f2fs_sb_has_flexible_inline_xattr(sbi)) {
- set_sbi_flag(sbi, SBI_NEED_FSCK);
f2fs_warn(sbi, "%s: corrupted inode ino=%lx, run fsck to fix.",
__func__, inode->i_ino);
return false;
@@ -343,31 +335,26 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page)
if (!f2fs_sb_has_extra_attr(sbi)) {
if (f2fs_sb_has_project_quota(sbi)) {
- set_sbi_flag(sbi, SBI_NEED_FSCK);
f2fs_warn(sbi, "%s: corrupted inode ino=%lx, wrong feature flag: %u, run fsck to fix.",
__func__, inode->i_ino, F2FS_FEATURE_PRJQUOTA);
return false;
}
if (f2fs_sb_has_inode_chksum(sbi)) {
- set_sbi_flag(sbi, SBI_NEED_FSCK);
f2fs_warn(sbi, "%s: corrupted inode ino=%lx, wrong feature flag: %u, run fsck to fix.",
__func__, inode->i_ino, F2FS_FEATURE_INODE_CHKSUM);
return false;
}
if (f2fs_sb_has_flexible_inline_xattr(sbi)) {
- set_sbi_flag(sbi, SBI_NEED_FSCK);
f2fs_warn(sbi, "%s: corrupted inode ino=%lx, wrong feature flag: %u, run fsck to fix.",
__func__, inode->i_ino, F2FS_FEATURE_FLEXIBLE_INLINE_XATTR);
return false;
}
if (f2fs_sb_has_inode_crtime(sbi)) {
- set_sbi_flag(sbi, SBI_NEED_FSCK);
f2fs_warn(sbi, "%s: corrupted inode ino=%lx, wrong feature flag: %u, run fsck to fix.",
__func__, inode->i_ino, F2FS_FEATURE_INODE_CRTIME);
return false;
}
if (f2fs_sb_has_compression(sbi)) {
- set_sbi_flag(sbi, SBI_NEED_FSCK);
f2fs_warn(sbi, "%s: corrupted inode ino=%lx, wrong feature flag: %u, run fsck to fix.",
__func__, inode->i_ino, F2FS_FEATURE_COMPRESSION);
return false;
@@ -375,21 +362,18 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page)
}
if (f2fs_sanity_check_inline_data(inode)) {
- set_sbi_flag(sbi, SBI_NEED_FSCK);
f2fs_warn(sbi, "%s: inode (ino=%lx, mode=%u) should not have inline_data, run fsck to fix",
__func__, inode->i_ino, inode->i_mode);
return false;
}
if (f2fs_has_inline_dentry(inode) && !S_ISDIR(inode->i_mode)) {
- set_sbi_flag(sbi, SBI_NEED_FSCK);
f2fs_warn(sbi, "%s: inode (ino=%lx, mode=%u) should not have inline_dentry, run fsck to fix",
__func__, inode->i_ino, inode->i_mode);
return false;
}
if ((fi->i_flags & F2FS_CASEFOLD_FL) && !f2fs_sb_has_casefold(sbi)) {
- set_sbi_flag(sbi, SBI_NEED_FSCK);
f2fs_warn(sbi, "%s: inode (ino=%lx) has casefold flag, but casefold feature is off",
__func__, inode->i_ino);
return false;
@@ -403,7 +387,7 @@ static void init_idisk_time(struct inode *inode)
struct f2fs_inode_info *fi = F2FS_I(inode);
fi->i_disk_time[0] = inode->i_atime;
- fi->i_disk_time[1] = inode->i_ctime;
+ fi->i_disk_time[1] = inode_get_ctime(inode);
fi->i_disk_time[2] = inode->i_mtime;
}
@@ -434,10 +418,10 @@ static int do_read_inode(struct inode *inode)
inode->i_blocks = SECTOR_FROM_BLOCK(le64_to_cpu(ri->i_blocks) - 1);
inode->i_atime.tv_sec = le64_to_cpu(ri->i_atime);
- inode->i_ctime.tv_sec = le64_to_cpu(ri->i_ctime);
+ inode_set_ctime(inode, le64_to_cpu(ri->i_ctime),
+ le32_to_cpu(ri->i_ctime_nsec));
inode->i_mtime.tv_sec = le64_to_cpu(ri->i_mtime);
inode->i_atime.tv_nsec = le32_to_cpu(ri->i_atime_nsec);
- inode->i_ctime.tv_nsec = le32_to_cpu(ri->i_ctime_nsec);
inode->i_mtime.tv_nsec = le32_to_cpu(ri->i_mtime_nsec);
inode->i_generation = le32_to_cpu(ri->i_generation);
if (S_ISDIR(inode->i_mode))
@@ -475,6 +459,13 @@ static int do_read_inode(struct inode *inode)
fi->i_inline_xattr_size = 0;
}
+ if (!sanity_check_inode(inode, node_page)) {
+ f2fs_put_page(node_page, 1);
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
+ f2fs_handle_error(sbi, ERROR_CORRUPTED_INODE);
+ return -EFSCORRUPTED;
+ }
+
/* check data exist */
if (f2fs_has_inline_data(inode) && !f2fs_exist_data(inode))
__recover_inline_status(inode, node_page);
@@ -544,12 +535,6 @@ static int do_read_inode(struct inode *inode)
f2fs_init_read_extent_tree(inode, node_page);
f2fs_init_age_extent_tree(inode);
- if (!sanity_check_inode(inode, node_page)) {
- f2fs_put_page(node_page, 1);
- f2fs_handle_error(sbi, ERROR_CORRUPTED_INODE);
- return -EFSCORRUPTED;
- }
-
if (!sanity_check_extent_cache(inode)) {
f2fs_put_page(node_page, 1);
f2fs_handle_error(sbi, ERROR_CORRUPTED_INODE);
@@ -714,10 +699,10 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page)
set_raw_inline(inode, ri);
ri->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
- ri->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
+ ri->i_ctime = cpu_to_le64(inode_get_ctime(inode).tv_sec);
ri->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec);
ri->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
- ri->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+ ri->i_ctime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec);
ri->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
if (S_ISDIR(inode->i_mode))
ri->i_current_depth =
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index bee0568888da..193b22a2d6bf 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -243,7 +243,7 @@ static struct inode *f2fs_new_inode(struct mnt_idmap *idmap,
inode->i_ino = ino;
inode->i_blocks = 0;
- inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
F2FS_I(inode)->i_crtime = inode->i_mtime;
inode->i_generation = get_random_u32();
@@ -420,7 +420,7 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir,
f2fs_balance_fs(sbi, true);
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
ihold(inode);
set_inode_flag(inode, FI_INC_LINK);
@@ -1052,7 +1052,7 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
f2fs_set_link(new_dir, new_entry, new_page, old_inode);
new_page = NULL;
- new_inode->i_ctime = current_time(new_inode);
+ inode_set_ctime_current(new_inode);
f2fs_down_write(&F2FS_I(new_inode)->i_sem);
if (old_dir_entry)
f2fs_i_links_write(new_inode, false);
@@ -1086,7 +1086,7 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
f2fs_i_pino_write(old_inode, new_dir->i_ino);
f2fs_up_write(&F2FS_I(old_inode)->i_sem);
- old_inode->i_ctime = current_time(old_inode);
+ inode_set_ctime_current(old_inode);
f2fs_mark_inode_dirty_sync(old_inode, false);
f2fs_delete_entry(old_entry, old_page, old_dir, NULL);
@@ -1251,7 +1251,7 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
f2fs_i_pino_write(old_inode, new_dir->i_ino);
f2fs_up_write(&F2FS_I(old_inode)->i_sem);
- old_dir->i_ctime = current_time(old_dir);
+ inode_set_ctime_current(old_dir);
if (old_nlink) {
f2fs_down_write(&F2FS_I(old_dir)->i_sem);
f2fs_i_links_write(old_dir, old_nlink > 0);
@@ -1270,7 +1270,7 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
f2fs_i_pino_write(new_inode, old_dir->i_ino);
f2fs_up_write(&F2FS_I(new_inode)->i_sem);
- new_dir->i_ctime = current_time(new_dir);
+ inode_set_ctime_current(new_dir);
if (new_nlink) {
f2fs_down_write(&F2FS_I(new_dir)->i_sem);
f2fs_i_links_write(new_dir, new_nlink > 0);
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index 4e7d4ceeb084..7be60df277a5 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -321,10 +321,10 @@ static int recover_inode(struct inode *inode, struct page *page)
f2fs_i_size_write(inode, le64_to_cpu(raw->i_size));
inode->i_atime.tv_sec = le64_to_cpu(raw->i_atime);
- inode->i_ctime.tv_sec = le64_to_cpu(raw->i_ctime);
+ inode_set_ctime(inode, le64_to_cpu(raw->i_ctime),
+ le32_to_cpu(raw->i_ctime_nsec));
inode->i_mtime.tv_sec = le64_to_cpu(raw->i_mtime);
inode->i_atime.tv_nsec = le32_to_cpu(raw->i_atime_nsec);
- inode->i_ctime.tv_nsec = le32_to_cpu(raw->i_ctime_nsec);
inode->i_mtime.tv_nsec = le32_to_cpu(raw->i_mtime_nsec);
F2FS_I(inode)->i_advise = raw->i_advise;
@@ -924,6 +924,7 @@ skip:
struct cp_control cpc = {
.reason = CP_RECOVERY,
};
+ stat_inc_cp_call_count(sbi, TOTAL_CALL);
err = f2fs_write_checkpoint(sbi, &cpc);
}
}
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 0457d620011f..d05b41608fc0 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -205,6 +205,8 @@ void f2fs_abort_atomic_write(struct inode *inode, bool clean)
f2fs_i_size_write(inode, fi->original_i_size);
fi->original_i_size = 0;
}
+ /* avoid stale dirty inode during eviction */
+ sync_inode_metadata(inode, 0);
}
static int __replace_atomic_write_block(struct inode *inode, pgoff_t index,
@@ -433,6 +435,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need)
.err_gc_skipped = false,
.nr_free_secs = 1 };
f2fs_down_write(&sbi->gc_lock);
+ stat_inc_gc_call_count(sbi, FOREGROUND);
f2fs_gc(sbi, &gc_control);
}
}
@@ -510,8 +513,8 @@ do_sync:
mutex_unlock(&sbi->flush_lock);
}
+ stat_inc_cp_call_count(sbi, BACKGROUND);
f2fs_sync_fs(sbi->sb, 1);
- stat_inc_bg_cp_count(sbi->stat_info);
}
static int __submit_flush_wait(struct f2fs_sb_info *sbi,
@@ -1258,8 +1261,16 @@ static int __submit_discard_cmd(struct f2fs_sb_info *sbi,
#ifdef CONFIG_BLK_DEV_ZONED
if (f2fs_sb_has_blkzoned(sbi) && bdev_is_zoned(bdev)) {
- __submit_zone_reset_cmd(sbi, dc, flag, wait_list, issued);
- return 0;
+ int devi = f2fs_bdev_index(sbi, bdev);
+
+ if (devi < 0)
+ return -EINVAL;
+
+ if (f2fs_blkz_is_seq(sbi, devi, dc->di.start)) {
+ __submit_zone_reset_cmd(sbi, dc, flag,
+ wait_list, issued);
+ return 0;
+ }
}
#endif
@@ -1785,15 +1796,24 @@ static void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr)
dc = __lookup_discard_cmd(sbi, blkaddr);
#ifdef CONFIG_BLK_DEV_ZONED
if (dc && f2fs_sb_has_blkzoned(sbi) && bdev_is_zoned(dc->bdev)) {
- /* force submit zone reset */
- if (dc->state == D_PREP)
- __submit_zone_reset_cmd(sbi, dc, REQ_SYNC,
- &dcc->wait_list, NULL);
- dc->ref++;
- mutex_unlock(&dcc->cmd_lock);
- /* wait zone reset */
- __wait_one_discard_bio(sbi, dc);
- return;
+ int devi = f2fs_bdev_index(sbi, dc->bdev);
+
+ if (devi < 0) {
+ mutex_unlock(&dcc->cmd_lock);
+ return;
+ }
+
+ if (f2fs_blkz_is_seq(sbi, devi, dc->di.start)) {
+ /* force submit zone reset */
+ if (dc->state == D_PREP)
+ __submit_zone_reset_cmd(sbi, dc, REQ_SYNC,
+ &dcc->wait_list, NULL);
+ dc->ref++;
+ mutex_unlock(&dcc->cmd_lock);
+ /* wait zone reset */
+ __wait_one_discard_bio(sbi, dc);
+ return;
+ }
}
#endif
if (dc) {
@@ -2193,7 +2213,7 @@ find_next:
len = next_pos - cur_pos;
if (f2fs_sb_has_blkzoned(sbi) ||
- !force || len < cpc->trim_minlen)
+ (force && len < cpc->trim_minlen))
goto skip;
f2fs_issue_discard(sbi, entry->start_blkaddr + cur_pos,
@@ -3228,6 +3248,7 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range)
goto out;
f2fs_down_write(&sbi->gc_lock);
+ stat_inc_cp_call_count(sbi, TOTAL_CALL);
err = f2fs_write_checkpoint(sbi, &cpc);
f2fs_up_write(&sbi->gc_lock);
if (err)
@@ -4846,17 +4867,17 @@ static int check_zone_write_pointer(struct f2fs_sb_info *sbi,
{
unsigned int wp_segno, wp_blkoff, zone_secno, zone_segno, segno;
block_t zone_block, wp_block, last_valid_block;
+ unsigned int log_sectors_per_block = sbi->log_blocksize - SECTOR_SHIFT;
int i, s, b, ret;
struct seg_entry *se;
if (zone->type != BLK_ZONE_TYPE_SEQWRITE_REQ)
return 0;
- wp_block = fdev->start_blk + (zone->wp >> sbi->log_sectors_per_block);
+ wp_block = fdev->start_blk + (zone->wp >> log_sectors_per_block);
wp_segno = GET_SEGNO(sbi, wp_block);
wp_blkoff = wp_block - START_BLOCK(sbi, wp_segno);
- zone_block = fdev->start_blk + (zone->start >>
- sbi->log_sectors_per_block);
+ zone_block = fdev->start_blk + (zone->start >> log_sectors_per_block);
zone_segno = GET_SEGNO(sbi, zone_block);
zone_secno = GET_SEC_FROM_SEG(sbi, zone_segno);
@@ -4906,7 +4927,7 @@ static int check_zone_write_pointer(struct f2fs_sb_info *sbi,
"pointer. Reset the write pointer: wp[0x%x,0x%x]",
wp_segno, wp_blkoff);
ret = __f2fs_issue_discard_zone(sbi, fdev->bdev, zone_block,
- zone->len >> sbi->log_sectors_per_block);
+ zone->len >> log_sectors_per_block);
if (ret)
f2fs_err(sbi, "Discard zone failed: %s (errno=%d)",
fdev->path, ret);
@@ -4927,12 +4948,19 @@ static int check_zone_write_pointer(struct f2fs_sb_info *sbi,
GET_BLKOFF_FROM_SEG0(sbi, last_valid_block),
wp_segno, wp_blkoff);
- ret = blkdev_issue_zeroout(fdev->bdev, zone->wp,
- zone->len - (zone->wp - zone->start),
- GFP_NOFS, 0);
- if (ret)
- f2fs_err(sbi, "Fill up zone failed: %s (errno=%d)",
- fdev->path, ret);
+ ret = blkdev_zone_mgmt(fdev->bdev, REQ_OP_ZONE_FINISH,
+ zone->start, zone->len, GFP_NOFS);
+ if (ret == -EOPNOTSUPP) {
+ ret = blkdev_issue_zeroout(fdev->bdev, zone->wp,
+ zone->len - (zone->wp - zone->start),
+ GFP_NOFS, 0);
+ if (ret)
+ f2fs_err(sbi, "Fill up zone failed: %s (errno=%d)",
+ fdev->path, ret);
+ } else if (ret) {
+ f2fs_err(sbi, "Finishing zone failed: %s (errno=%d)",
+ fdev->path, ret);
+ }
return ret;
}
@@ -4967,6 +4995,7 @@ static int fix_curseg_write_pointer(struct f2fs_sb_info *sbi, int type)
struct blk_zone zone;
unsigned int cs_section, wp_segno, wp_blkoff, wp_sector_off;
block_t cs_zone_block, wp_block;
+ unsigned int log_sectors_per_block = sbi->log_blocksize - SECTOR_SHIFT;
sector_t zone_sector;
int err;
@@ -4978,8 +5007,8 @@ static int fix_curseg_write_pointer(struct f2fs_sb_info *sbi, int type)
return 0;
/* report zone for the sector the curseg points to */
- zone_sector = (sector_t)(cs_zone_block - zbd->start_blk) <<
- sbi->log_sectors_per_block;
+ zone_sector = (sector_t)(cs_zone_block - zbd->start_blk)
+ << log_sectors_per_block;
err = blkdev_report_zones(zbd->bdev, zone_sector, 1,
report_one_zone_cb, &zone);
if (err != 1) {
@@ -4991,10 +5020,10 @@ static int fix_curseg_write_pointer(struct f2fs_sb_info *sbi, int type)
if (zone.type != BLK_ZONE_TYPE_SEQWRITE_REQ)
return 0;
- wp_block = zbd->start_blk + (zone.wp >> sbi->log_sectors_per_block);
+ wp_block = zbd->start_blk + (zone.wp >> log_sectors_per_block);
wp_segno = GET_SEGNO(sbi, wp_block);
wp_blkoff = wp_block - START_BLOCK(sbi, wp_segno);
- wp_sector_off = zone.wp & GENMASK(sbi->log_sectors_per_block - 1, 0);
+ wp_sector_off = zone.wp & GENMASK(log_sectors_per_block - 1, 0);
if (cs->segno == wp_segno && cs->next_blkoff == wp_blkoff &&
wp_sector_off == 0)
@@ -5021,8 +5050,8 @@ static int fix_curseg_write_pointer(struct f2fs_sb_info *sbi, int type)
if (!zbd)
return 0;
- zone_sector = (sector_t)(cs_zone_block - zbd->start_blk) <<
- sbi->log_sectors_per_block;
+ zone_sector = (sector_t)(cs_zone_block - zbd->start_blk)
+ << log_sectors_per_block;
err = blkdev_report_zones(zbd->bdev, zone_sector, 1,
report_one_zone_cb, &zone);
if (err != 1) {
@@ -5040,7 +5069,7 @@ static int fix_curseg_write_pointer(struct f2fs_sb_info *sbi, int type)
"Reset the zone: curseg[0x%x,0x%x]",
type, cs->segno, cs->next_blkoff);
err = __f2fs_issue_discard_zone(sbi, zbd->bdev, cs_zone_block,
- zone.len >> sbi->log_sectors_per_block);
+ zone.len >> log_sectors_per_block);
if (err) {
f2fs_err(sbi, "Discard zone failed: %s (errno=%d)",
zbd->path, err);
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index ca31163da00a..a8c8232852bb 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -591,7 +591,7 @@ static int f2fs_set_lz4hc_level(struct f2fs_sb_info *sbi, const char *str)
unsigned int level;
if (strlen(str) == 3) {
- F2FS_OPTION(sbi).compress_level = LZ4HC_DEFAULT_CLEVEL;
+ F2FS_OPTION(sbi).compress_level = 0;
return 0;
}
@@ -862,11 +862,6 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
if (!name)
return -ENOMEM;
if (!strcmp(name, "adaptive")) {
- if (f2fs_sb_has_blkzoned(sbi)) {
- f2fs_warn(sbi, "adaptive mode is not allowed with zoned block device feature");
- kfree(name);
- return -EINVAL;
- }
F2FS_OPTION(sbi).fs_mode = FS_MODE_ADAPTIVE;
} else if (!strcmp(name, "lfs")) {
F2FS_OPTION(sbi).fs_mode = FS_MODE_LFS;
@@ -1331,6 +1326,11 @@ default_check:
F2FS_OPTION(sbi).discard_unit =
DISCARD_UNIT_SECTION;
}
+
+ if (F2FS_OPTION(sbi).fs_mode != FS_MODE_LFS) {
+ f2fs_info(sbi, "Only lfs mode is allowed with zoned block device feature");
+ return -EINVAL;
+ }
#else
f2fs_err(sbi, "Zoned block device support is not enabled");
return -EINVAL;
@@ -1561,7 +1561,8 @@ static void destroy_device_list(struct f2fs_sb_info *sbi)
int i;
for (i = 0; i < sbi->s_ndevs; i++) {
- blkdev_put(FDEV(i).bdev, sbi->sb->s_type);
+ if (i > 0)
+ blkdev_put(FDEV(i).bdev, sbi->sb);
#ifdef CONFIG_BLK_DEV_ZONED
kvfree(FDEV(i).blkz_seq);
#endif
@@ -1600,6 +1601,7 @@ static void f2fs_put_super(struct super_block *sb)
struct cp_control cpc = {
.reason = CP_UMOUNT,
};
+ stat_inc_cp_call_count(sbi, TOTAL_CALL);
err = f2fs_write_checkpoint(sbi, &cpc);
}
@@ -1609,6 +1611,7 @@ static void f2fs_put_super(struct super_block *sb)
struct cp_control cpc = {
.reason = CP_UMOUNT | CP_TRIMMED,
};
+ stat_inc_cp_call_count(sbi, TOTAL_CALL);
err = f2fs_write_checkpoint(sbi, &cpc);
}
@@ -1705,8 +1708,10 @@ int f2fs_sync_fs(struct super_block *sb, int sync)
if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
return -EAGAIN;
- if (sync)
+ if (sync) {
+ stat_inc_cp_call_count(sbi, TOTAL_CALL);
err = f2fs_issue_checkpoint(sbi);
+ }
return err;
}
@@ -2205,6 +2210,7 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi)
.nr_free_secs = 1 };
f2fs_down_write(&sbi->gc_lock);
+ stat_inc_gc_call_count(sbi, FOREGROUND);
err = f2fs_gc(sbi, &gc_control);
if (err == -ENODATA) {
err = 0;
@@ -2230,6 +2236,7 @@ skip_gc:
f2fs_down_write(&sbi->gc_lock);
cpc.reason = CP_PAUSE;
set_sbi_flag(sbi, SBI_CP_DISABLED);
+ stat_inc_cp_call_count(sbi, TOTAL_CALL);
err = f2fs_write_checkpoint(sbi, &cpc);
if (err)
goto out_unlock;
@@ -2703,7 +2710,7 @@ retry:
if (len == towrite)
return err;
- inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
f2fs_mark_inode_dirty_sync(inode, false);
return len - towrite;
}
@@ -4190,16 +4197,12 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi)
sbi->aligned_blksize = true;
for (i = 0; i < max_devices; i++) {
-
- if (i > 0 && !RDEV(i).path[0])
+ if (i == 0)
+ FDEV(0).bdev = sbi->sb->s_bdev;
+ else if (!RDEV(i).path[0])
break;
- if (max_devices == 1) {
- /* Single zoned block device mount */
- FDEV(0).bdev =
- blkdev_get_by_dev(sbi->sb->s_bdev->bd_dev, mode,
- sbi->sb->s_type, NULL);
- } else {
+ if (max_devices > 1) {
/* Multi-device mount */
memcpy(FDEV(i).path, RDEV(i).path, MAX_PATH_LEN);
FDEV(i).total_segments =
@@ -4215,10 +4218,9 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi)
FDEV(i).end_blk = FDEV(i).start_blk +
(FDEV(i).total_segments <<
sbi->log_blocks_per_seg) - 1;
+ FDEV(i).bdev = blkdev_get_by_path(FDEV(i).path,
+ mode, sbi->sb, NULL);
}
- FDEV(i).bdev = blkdev_get_by_path(FDEV(i).path, mode,
- sbi->sb->s_type,
- NULL);
}
if (IS_ERR(FDEV(i).bdev))
return PTR_ERR(FDEV(i).bdev);
@@ -4871,6 +4873,7 @@ static void kill_f2fs_super(struct super_block *sb)
struct cp_control cpc = {
.reason = CP_UMOUNT,
};
+ stat_inc_cp_call_count(sbi, TOTAL_CALL);
f2fs_write_checkpoint(sbi, &cpc);
}
diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c
index 48b7e0073884..417fae96890f 100644
--- a/fs/f2fs/sysfs.c
+++ b/fs/f2fs/sysfs.c
@@ -356,6 +356,16 @@ static ssize_t f2fs_sbi_show(struct f2fs_attr *a,
if (!strcmp(a->attr.name, "revoked_atomic_block"))
return sysfs_emit(buf, "%llu\n", sbi->revoked_atomic_block);
+#ifdef CONFIG_F2FS_STAT_FS
+ if (!strcmp(a->attr.name, "cp_foreground_calls"))
+ return sysfs_emit(buf, "%d\n",
+ atomic_read(&sbi->cp_call_count[TOTAL_CALL]) -
+ atomic_read(&sbi->cp_call_count[BACKGROUND]));
+ if (!strcmp(a->attr.name, "cp_background_calls"))
+ return sysfs_emit(buf, "%d\n",
+ atomic_read(&sbi->cp_call_count[BACKGROUND]));
+#endif
+
ui = (unsigned int *)(ptr + a->offset);
return sysfs_emit(buf, "%u\n", *ui);
@@ -972,10 +982,10 @@ F2FS_SBI_GENERAL_RO_ATTR(unusable_blocks_per_sec);
/* STAT_INFO ATTR */
#ifdef CONFIG_F2FS_STAT_FS
-STAT_INFO_RO_ATTR(cp_foreground_calls, cp_count);
-STAT_INFO_RO_ATTR(cp_background_calls, bg_cp_count);
-STAT_INFO_RO_ATTR(gc_foreground_calls, call_count);
-STAT_INFO_RO_ATTR(gc_background_calls, bg_gc);
+STAT_INFO_RO_ATTR(cp_foreground_calls, cp_call_count[FOREGROUND]);
+STAT_INFO_RO_ATTR(cp_background_calls, cp_call_count[BACKGROUND]);
+STAT_INFO_RO_ATTR(gc_foreground_calls, gc_call_count[FOREGROUND]);
+STAT_INFO_RO_ATTR(gc_background_calls, gc_call_count[BACKGROUND]);
#endif
/* FAULT_INFO ATTR */
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index 476b186b90a6..a657284faee3 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -757,17 +757,17 @@ static int __f2fs_setxattr(struct inode *inode, int index,
if (index == F2FS_XATTR_INDEX_ENCRYPTION &&
!strcmp(name, F2FS_XATTR_NAME_ENCRYPTION_CONTEXT))
f2fs_set_encrypted_inode(inode);
- f2fs_mark_inode_dirty_sync(inode, true);
- if (!error && S_ISDIR(inode->i_mode))
+ if (S_ISDIR(inode->i_mode))
set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_CP);
same:
if (is_inode_flag_set(inode, FI_ACL_MODE)) {
inode->i_mode = F2FS_I(inode)->i_acl_mode;
- inode->i_ctime = current_time(inode);
clear_inode_flag(inode, FI_ACL_MODE);
}
+ inode_set_ctime_current(inode);
+ f2fs_mark_inode_dirty_sync(inode, true);
exit:
kfree(base_addr);
return error;
diff --git a/fs/fat/Kconfig b/fs/fat/Kconfig
index afe83b4e7172..25fae1c83725 100644
--- a/fs/fat/Kconfig
+++ b/fs/fat/Kconfig
@@ -1,6 +1,7 @@
# SPDX-License-Identifier: GPL-2.0-only
config FAT_FS
tristate
+ select BUFFER_HEAD
select NLS
select LEGACY_DIRECT_IO
help
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index e3b690b48e3e..66cf4778cf3b 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -460,8 +460,7 @@ extern struct timespec64 fat_truncate_mtime(const struct msdos_sb_info *sbi,
const struct timespec64 *ts);
extern int fat_truncate_time(struct inode *inode, struct timespec64 *now,
int flags);
-extern int fat_update_time(struct inode *inode, struct timespec64 *now,
- int flags);
+extern int fat_update_time(struct inode *inode, int flags);
extern int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs);
int fat_cache_init(void);
diff --git a/fs/fat/file.c b/fs/fat/file.c
index 456477946dd9..e887e9ab7472 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -401,7 +401,7 @@ int fat_getattr(struct mnt_idmap *idmap, const struct path *path,
struct inode *inode = d_inode(path->dentry);
struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
- generic_fillattr(idmap, inode, stat);
+ generic_fillattr(idmap, request_mask, inode, stat);
stat->blksize = sbi->cluster_size;
if (sbi->options.nfs == FAT_NFS_NOSTALE_RO) {
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index d99b8549ec8f..cdd39b6020f3 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -562,7 +562,7 @@ int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de)
& ~((loff_t)sbi->cluster_size - 1)) >> 9;
fat_time_fat2unix(sbi, &inode->i_mtime, de->time, de->date, 0);
- inode->i_ctime = inode->i_mtime;
+ inode_set_ctime_to_ts(inode, inode->i_mtime);
if (sbi->options.isvfat) {
fat_time_fat2unix(sbi, &inode->i_atime, 0, de->adate, 0);
fat_time_fat2unix(sbi, &MSDOS_I(inode)->i_crtime, de->ctime,
@@ -1407,8 +1407,7 @@ static int fat_read_root(struct inode *inode)
MSDOS_I(inode)->mmu_private = inode->i_size;
fat_save_attrs(inode, ATTR_DIR);
- inode->i_mtime.tv_sec = inode->i_atime.tv_sec = inode->i_ctime.tv_sec = 0;
- inode->i_mtime.tv_nsec = inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = 0;
+ inode->i_mtime = inode->i_atime = inode_set_ctime(inode, 0, 0);
set_nlink(inode, fat_subdirs(inode)+2);
return 0;
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index 7e5d6ae305f2..f2304a1054aa 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -332,13 +332,14 @@ int fat_truncate_time(struct inode *inode, struct timespec64 *now, int flags)
* but ctime updates are ignored.
*/
if (flags & S_MTIME)
- inode->i_mtime = inode->i_ctime = fat_truncate_mtime(sbi, now);
+ inode->i_mtime = inode_set_ctime_to_ts(inode,
+ fat_truncate_mtime(sbi, now));
return 0;
}
EXPORT_SYMBOL_GPL(fat_truncate_time);
-int fat_update_time(struct inode *inode, struct timespec64 *now, int flags)
+int fat_update_time(struct inode *inode, int flags)
{
int dirty_flags = 0;
@@ -346,16 +347,13 @@ int fat_update_time(struct inode *inode, struct timespec64 *now, int flags)
return 0;
if (flags & (S_ATIME | S_CTIME | S_MTIME)) {
- fat_truncate_time(inode, now, flags);
+ fat_truncate_time(inode, NULL, flags);
if (inode->i_sb->s_flags & SB_LAZYTIME)
dirty_flags |= I_DIRTY_TIME;
else
dirty_flags |= I_DIRTY_SYNC;
}
- if ((flags & S_VERSION) && inode_maybe_inc_iversion(inode, false))
- dirty_flags |= I_DIRTY_SYNC;
-
__mark_inode_dirty(inode, dirty_flags);
return 0;
}
diff --git a/fs/fcntl.c b/fs/fcntl.c
index b622be119706..e871009f6c88 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -34,7 +34,7 @@
#define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | O_DIRECT | O_NOATIME)
-static int setfl(int fd, struct file * filp, unsigned long arg)
+static int setfl(int fd, struct file * filp, unsigned int arg)
{
struct inode * inode = file_inode(filp);
int error = 0;
@@ -112,11 +112,11 @@ void __f_setown(struct file *filp, struct pid *pid, enum pid_type type,
}
EXPORT_SYMBOL(__f_setown);
-int f_setown(struct file *filp, unsigned long arg, int force)
+int f_setown(struct file *filp, int who, int force)
{
enum pid_type type;
struct pid *pid = NULL;
- int who = arg, ret = 0;
+ int ret = 0;
type = PIDTYPE_TGID;
if (who < 0) {
@@ -317,28 +317,29 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
struct file *filp)
{
void __user *argp = (void __user *)arg;
+ int argi = (int)arg;
struct flock flock;
long err = -EINVAL;
switch (cmd) {
case F_DUPFD:
- err = f_dupfd(arg, filp, 0);
+ err = f_dupfd(argi, filp, 0);
break;
case F_DUPFD_CLOEXEC:
- err = f_dupfd(arg, filp, O_CLOEXEC);
+ err = f_dupfd(argi, filp, O_CLOEXEC);
break;
case F_GETFD:
err = get_close_on_exec(fd) ? FD_CLOEXEC : 0;
break;
case F_SETFD:
err = 0;
- set_close_on_exec(fd, arg & FD_CLOEXEC);
+ set_close_on_exec(fd, argi & FD_CLOEXEC);
break;
case F_GETFL:
err = filp->f_flags;
break;
case F_SETFL:
- err = setfl(fd, filp, arg);
+ err = setfl(fd, filp, argi);
break;
#if BITS_PER_LONG != 32
/* 32-bit arches must use fcntl64() */
@@ -375,7 +376,7 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
force_successful_syscall_return();
break;
case F_SETOWN:
- err = f_setown(filp, arg, 1);
+ err = f_setown(filp, argi, 1);
break;
case F_GETOWN_EX:
err = f_getown_ex(filp, arg);
@@ -391,28 +392,28 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
break;
case F_SETSIG:
/* arg == 0 restores default behaviour. */
- if (!valid_signal(arg)) {
+ if (!valid_signal(argi)) {
break;
}
err = 0;
- filp->f_owner.signum = arg;
+ filp->f_owner.signum = argi;
break;
case F_GETLEASE:
err = fcntl_getlease(filp);
break;
case F_SETLEASE:
- err = fcntl_setlease(fd, filp, arg);
+ err = fcntl_setlease(fd, filp, argi);
break;
case F_NOTIFY:
- err = fcntl_dirnotify(fd, filp, arg);
+ err = fcntl_dirnotify(fd, filp, argi);
break;
case F_SETPIPE_SZ:
case F_GETPIPE_SZ:
- err = pipe_fcntl(filp, cmd, arg);
+ err = pipe_fcntl(filp, cmd, argi);
break;
case F_ADD_SEALS:
case F_GET_SEALS:
- err = memfd_fcntl(filp, cmd, arg);
+ err = memfd_fcntl(filp, cmd, argi);
break;
case F_GET_RW_HINT:
case F_SET_RW_HINT:
diff --git a/fs/file.c b/fs/file.c
index 3fd003a8604f..3e4a4dfa38fc 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -668,7 +668,7 @@ EXPORT_SYMBOL(close_fd); /* for ksys_close() */
/**
* last_fd - return last valid index into fd table
- * @cur_fds: files struct
+ * @fdt: File descriptor table.
*
* Context: Either rcu read lock or files_lock must be held.
*
@@ -693,29 +693,30 @@ static inline void __range_cloexec(struct files_struct *cur_fds,
spin_unlock(&cur_fds->file_lock);
}
-static inline void __range_close(struct files_struct *cur_fds, unsigned int fd,
+static inline void __range_close(struct files_struct *files, unsigned int fd,
unsigned int max_fd)
{
+ struct file *file;
unsigned n;
- rcu_read_lock();
- n = last_fd(files_fdtable(cur_fds));
- rcu_read_unlock();
+ spin_lock(&files->file_lock);
+ n = last_fd(files_fdtable(files));
max_fd = min(max_fd, n);
- while (fd <= max_fd) {
- struct file *file;
-
- spin_lock(&cur_fds->file_lock);
- file = pick_file(cur_fds, fd++);
- spin_unlock(&cur_fds->file_lock);
-
+ for (; fd <= max_fd; fd++) {
+ file = pick_file(files, fd);
if (file) {
- /* found a valid file to close */
- filp_close(file, cur_fds);
+ spin_unlock(&files->file_lock);
+ filp_close(file, files);
cond_resched();
+ spin_lock(&files->file_lock);
+ } else if (need_resched()) {
+ spin_unlock(&files->file_lock);
+ cond_resched();
+ spin_lock(&files->file_lock);
}
}
+ spin_unlock(&files->file_lock);
}
/**
@@ -723,6 +724,7 @@ static inline void __range_close(struct files_struct *cur_fds, unsigned int fd,
*
* @fd: starting file descriptor to close
* @max_fd: last file descriptor to close
+ * @flags: CLOSE_RANGE flags.
*
* This closes a range of file descriptors. All file descriptors
* from @fd up to and including @max_fd are closed.
diff --git a/fs/file_table.c b/fs/file_table.c
index fc7d677ff5ad..ee21b3da9d08 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -461,11 +461,8 @@ void fput(struct file *file)
*/
void __fput_sync(struct file *file)
{
- if (atomic_long_dec_and_test(&file->f_count)) {
- struct task_struct *task = current;
- BUG_ON(!(task->flags & PF_KTHREAD));
+ if (atomic_long_dec_and_test(&file->f_count))
__fput(file);
- }
}
EXPORT_SYMBOL(fput);
diff --git a/fs/freevxfs/Kconfig b/fs/freevxfs/Kconfig
index 0e2fc08f7de4..912107ebea6f 100644
--- a/fs/freevxfs/Kconfig
+++ b/fs/freevxfs/Kconfig
@@ -2,6 +2,7 @@
config VXFS_FS
tristate "FreeVxFS file system support (VERITAS VxFS(TM) compatible)"
depends on BLOCK
+ select BUFFER_HEAD
help
FreeVxFS is a file system driver that support the VERITAS VxFS(TM)
file system format. VERITAS VxFS(TM) is the standard file system
diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c
index ceb6a12649ba..ac5d43b164b5 100644
--- a/fs/freevxfs/vxfs_inode.c
+++ b/fs/freevxfs/vxfs_inode.c
@@ -110,10 +110,9 @@ static inline void dip2vip_cpy(struct vxfs_sb_info *sbi,
inode->i_size = vip->vii_size;
inode->i_atime.tv_sec = vip->vii_atime;
- inode->i_ctime.tv_sec = vip->vii_ctime;
+ inode_set_ctime(inode, vip->vii_ctime, 0);
inode->i_mtime.tv_sec = vip->vii_mtime;
inode->i_atime.tv_nsec = 0;
- inode->i_ctime.tv_nsec = 0;
inode->i_mtime.tv_nsec = 0;
inode->i_blocks = vip->vii_blocks;
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index aca4b4811394..c1af01b2c42d 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1535,10 +1535,15 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
if (wbc->pages_skipped) {
/*
- * writeback is not making progress due to locked
- * buffers. Skip this inode for now.
+ * Writeback is not making progress due to locked buffers.
+ * Skip this inode for now. Although having skipped pages
+ * is odd for clean inodes, it can happen for some
+ * filesystems so handle that gracefully.
*/
- redirty_tail_locked(inode, wb);
+ if (inode->i_state & I_DIRTY_ALL)
+ redirty_tail_locked(inode, wb);
+ else
+ inode_cgwb_move_to_attached(inode, wb);
return;
}
@@ -1953,9 +1958,9 @@ static long __writeback_inodes_wb(struct bdi_writeback *wb,
struct inode *inode = wb_inode(wb->b_io.prev);
struct super_block *sb = inode->i_sb;
- if (!trylock_super(sb)) {
+ if (!super_trylock_shared(sb)) {
/*
- * trylock_super() may fail consistently due to
+ * super_trylock_shared() may fail consistently due to
* s_umount being grabbed by someone else. Don't use
* requeue_io() to avoid busy retrying the inode/sb.
*/
diff --git a/fs/fs_context.c b/fs/fs_context.c
index 851214d1d013..98589aae5208 100644
--- a/fs/fs_context.c
+++ b/fs/fs_context.c
@@ -162,6 +162,10 @@ EXPORT_SYMBOL(vfs_parse_fs_param);
/**
* vfs_parse_fs_string - Convenience function to just parse a string.
+ * @fc: Filesystem context.
+ * @key: Parameter name.
+ * @value: Default value.
+ * @v_size: Maximum number of bytes in the value.
*/
int vfs_parse_fs_string(struct fs_context *fc, const char *key,
const char *value, size_t v_size)
@@ -188,17 +192,19 @@ int vfs_parse_fs_string(struct fs_context *fc, const char *key,
EXPORT_SYMBOL(vfs_parse_fs_string);
/**
- * generic_parse_monolithic - Parse key[=val][,key[=val]]* mount data
- * @ctx: The superblock configuration to fill in.
+ * vfs_parse_monolithic_sep - Parse key[=val][,key[=val]]* mount data
+ * @fc: The superblock configuration to fill in.
* @data: The data to parse
+ * @sep: callback for separating next option
*
- * Parse a blob of data that's in key[=val][,key[=val]]* form. This can be
- * called from the ->monolithic_mount_data() fs_context operation.
+ * Parse a blob of data that's in key[=val][,key[=val]]* form with a custom
+ * option separator callback.
*
* Returns 0 on success or the error returned by the ->parse_option() fs_context
* operation on failure.
*/
-int generic_parse_monolithic(struct fs_context *fc, void *data)
+int vfs_parse_monolithic_sep(struct fs_context *fc, void *data,
+ char *(*sep)(char **))
{
char *options = data, *key;
int ret = 0;
@@ -210,7 +216,7 @@ int generic_parse_monolithic(struct fs_context *fc, void *data)
if (ret)
return ret;
- while ((key = strsep(&options, ",")) != NULL) {
+ while ((key = sep(&options)) != NULL) {
if (*key) {
size_t v_len = 0;
char *value = strchr(key, '=');
@@ -229,6 +235,28 @@ int generic_parse_monolithic(struct fs_context *fc, void *data)
return ret;
}
+EXPORT_SYMBOL(vfs_parse_monolithic_sep);
+
+static char *vfs_parse_comma_sep(char **s)
+{
+ return strsep(s, ",");
+}
+
+/**
+ * generic_parse_monolithic - Parse key[=val][,key[=val]]* mount data
+ * @fc: The superblock configuration to fill in.
+ * @data: The data to parse
+ *
+ * Parse a blob of data that's in key[=val][,key[=val]]* form. This can be
+ * called from the ->monolithic_mount_data() fs_context operation.
+ *
+ * Returns 0 on success or the error returned by the ->parse_option() fs_context
+ * operation on failure.
+ */
+int generic_parse_monolithic(struct fs_context *fc, void *data)
+{
+ return vfs_parse_monolithic_sep(fc, data, vfs_parse_comma_sep);
+}
EXPORT_SYMBOL(generic_parse_monolithic);
/**
@@ -315,10 +343,31 @@ struct fs_context *fs_context_for_reconfigure(struct dentry *dentry,
}
EXPORT_SYMBOL(fs_context_for_reconfigure);
+/**
+ * fs_context_for_submount: allocate a new fs_context for a submount
+ * @type: file_system_type of the new context
+ * @reference: reference dentry from which to copy relevant info
+ *
+ * Allocate a new fs_context suitable for a submount. This also ensures that
+ * the fc->security object is inherited from @reference (if needed).
+ */
struct fs_context *fs_context_for_submount(struct file_system_type *type,
struct dentry *reference)
{
- return alloc_fs_context(type, reference, 0, 0, FS_CONTEXT_FOR_SUBMOUNT);
+ struct fs_context *fc;
+ int ret;
+
+ fc = alloc_fs_context(type, reference, 0, 0, FS_CONTEXT_FOR_SUBMOUNT);
+ if (IS_ERR(fc))
+ return fc;
+
+ ret = security_fs_context_submount(fc, reference->d_sb);
+ if (ret) {
+ put_fs_context(fc);
+ return ERR_PTR(ret);
+ }
+
+ return fc;
}
EXPORT_SYMBOL(fs_context_for_submount);
@@ -333,7 +382,7 @@ void fc_drop_locked(struct fs_context *fc)
static void legacy_fs_context_free(struct fs_context *fc);
/**
- * vfs_dup_fc_config: Duplicate a filesystem context.
+ * vfs_dup_fs_context - Duplicate a filesystem context.
* @src_fc: The context to copy.
*/
struct fs_context *vfs_dup_fs_context(struct fs_context *src_fc)
@@ -379,7 +428,9 @@ EXPORT_SYMBOL(vfs_dup_fs_context);
/**
* logfc - Log a message to a filesystem context
- * @fc: The filesystem context to log to.
+ * @log: The filesystem context to log to, or NULL to use printk.
+ * @prefix: A string to prefix the output with, or NULL.
+ * @level: 'w' for a warning, 'e' for an error. Anything else is a notice.
* @fmt: The format of the buffer.
*/
void logfc(struct fc_log *log, const char *prefix, char level, const char *fmt, ...)
@@ -692,6 +743,7 @@ void vfs_clean_context(struct fs_context *fc)
security_free_mnt_opts(&fc->security);
kfree(fc->source);
fc->source = NULL;
+ fc->exclusive = false;
fc->purpose = FS_CONTEXT_FOR_RECONFIGURE;
fc->phase = FS_CONTEXT_AWAITING_RECONF;
diff --git a/fs/fs_struct.c b/fs/fs_struct.c
index 04b3f5b9c629..64c2d0814ed6 100644
--- a/fs/fs_struct.c
+++ b/fs/fs_struct.c
@@ -62,7 +62,7 @@ void chroot_fs_refs(const struct path *old_root, const struct path *new_root)
int count = 0;
read_lock(&tasklist_lock);
- do_each_thread(g, p) {
+ for_each_process_thread(g, p) {
task_lock(p);
fs = p->fs;
if (fs) {
@@ -79,7 +79,7 @@ void chroot_fs_refs(const struct path *old_root, const struct path *new_root)
spin_unlock(&fs->lock);
}
task_unlock(p);
- } while_each_thread(g, p);
+ }
read_unlock(&tasklist_lock);
while (count--)
path_put(old_root);
diff --git a/fs/fsopen.c b/fs/fsopen.c
index fc9d2d9fd234..ce03f6521c88 100644
--- a/fs/fsopen.c
+++ b/fs/fsopen.c
@@ -209,6 +209,72 @@ err:
return ret;
}
+static int vfs_cmd_create(struct fs_context *fc, bool exclusive)
+{
+ struct super_block *sb;
+ int ret;
+
+ if (fc->phase != FS_CONTEXT_CREATE_PARAMS)
+ return -EBUSY;
+
+ if (!mount_capable(fc))
+ return -EPERM;
+
+ /* require the new mount api */
+ if (exclusive && fc->ops == &legacy_fs_context_ops)
+ return -EOPNOTSUPP;
+
+ fc->phase = FS_CONTEXT_CREATING;
+ fc->exclusive = exclusive;
+
+ ret = vfs_get_tree(fc);
+ if (ret) {
+ fc->phase = FS_CONTEXT_FAILED;
+ return ret;
+ }
+
+ sb = fc->root->d_sb;
+ ret = security_sb_kern_mount(sb);
+ if (unlikely(ret)) {
+ fc_drop_locked(fc);
+ fc->phase = FS_CONTEXT_FAILED;
+ return ret;
+ }
+
+ /* vfs_get_tree() callchains will have grabbed @s_umount */
+ up_write(&sb->s_umount);
+ fc->phase = FS_CONTEXT_AWAITING_MOUNT;
+ return 0;
+}
+
+static int vfs_cmd_reconfigure(struct fs_context *fc)
+{
+ struct super_block *sb;
+ int ret;
+
+ if (fc->phase != FS_CONTEXT_RECONF_PARAMS)
+ return -EBUSY;
+
+ fc->phase = FS_CONTEXT_RECONFIGURING;
+
+ sb = fc->root->d_sb;
+ if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) {
+ fc->phase = FS_CONTEXT_FAILED;
+ return -EPERM;
+ }
+
+ down_write(&sb->s_umount);
+ ret = reconfigure_super(fc);
+ up_write(&sb->s_umount);
+ if (ret) {
+ fc->phase = FS_CONTEXT_FAILED;
+ return ret;
+ }
+
+ vfs_clean_context(fc);
+ return 0;
+}
+
/*
* Check the state and apply the configuration. Note that this function is
* allowed to 'steal' the value by setting param->xxx to NULL before returning.
@@ -216,7 +282,6 @@ err:
static int vfs_fsconfig_locked(struct fs_context *fc, int cmd,
struct fs_parameter *param)
{
- struct super_block *sb;
int ret;
ret = finish_clean_context(fc);
@@ -224,39 +289,11 @@ static int vfs_fsconfig_locked(struct fs_context *fc, int cmd,
return ret;
switch (cmd) {
case FSCONFIG_CMD_CREATE:
- if (fc->phase != FS_CONTEXT_CREATE_PARAMS)
- return -EBUSY;
- if (!mount_capable(fc))
- return -EPERM;
- fc->phase = FS_CONTEXT_CREATING;
- ret = vfs_get_tree(fc);
- if (ret)
- break;
- sb = fc->root->d_sb;
- ret = security_sb_kern_mount(sb);
- if (unlikely(ret)) {
- fc_drop_locked(fc);
- break;
- }
- up_write(&sb->s_umount);
- fc->phase = FS_CONTEXT_AWAITING_MOUNT;
- return 0;
+ return vfs_cmd_create(fc, false);
+ case FSCONFIG_CMD_CREATE_EXCL:
+ return vfs_cmd_create(fc, true);
case FSCONFIG_CMD_RECONFIGURE:
- if (fc->phase != FS_CONTEXT_RECONF_PARAMS)
- return -EBUSY;
- fc->phase = FS_CONTEXT_RECONFIGURING;
- sb = fc->root->d_sb;
- if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) {
- ret = -EPERM;
- break;
- }
- down_write(&sb->s_umount);
- ret = reconfigure_super(fc);
- up_write(&sb->s_umount);
- if (ret)
- break;
- vfs_clean_context(fc);
- return 0;
+ return vfs_cmd_reconfigure(fc);
default:
if (fc->phase != FS_CONTEXT_CREATE_PARAMS &&
fc->phase != FS_CONTEXT_RECONF_PARAMS)
@@ -264,8 +301,6 @@ static int vfs_fsconfig_locked(struct fs_context *fc, int cmd,
return vfs_parse_fs_param(fc, param);
}
- fc->phase = FS_CONTEXT_FAILED;
- return ret;
}
/**
@@ -353,6 +388,7 @@ SYSCALL_DEFINE5(fsconfig,
return -EINVAL;
break;
case FSCONFIG_CMD_CREATE:
+ case FSCONFIG_CMD_CREATE_EXCL:
case FSCONFIG_CMD_RECONFIGURE:
if (_key || _value || aux)
return -EINVAL;
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index 247ef4f76761..ab62e4624256 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -235,7 +235,7 @@ static struct dentry *fuse_ctl_add_dentry(struct dentry *parent,
inode->i_mode = mode;
inode->i_uid = fc->user_id;
inode->i_gid = fc->group_id;
- inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
/* setting ->i_op to NULL is not allowed */
if (iop)
inode->i_op = iop;
diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c
index 8e74f278a3f6..23904a6a9a96 100644
--- a/fs/fuse/dax.c
+++ b/fs/fuse/dax.c
@@ -784,8 +784,8 @@ static int fuse_dax_writepages(struct address_space *mapping,
return dax_writeback_mapping_range(mapping, fc->dax->dev, wbc);
}
-static vm_fault_t __fuse_dax_fault(struct vm_fault *vmf,
- enum page_entry_size pe_size, bool write)
+static vm_fault_t __fuse_dax_fault(struct vm_fault *vmf, unsigned int order,
+ bool write)
{
vm_fault_t ret;
struct inode *inode = file_inode(vmf->vma->vm_file);
@@ -809,7 +809,7 @@ retry:
* to populate page cache or access memory we are trying to free.
*/
filemap_invalidate_lock_shared(inode->i_mapping);
- ret = dax_iomap_fault(vmf, pe_size, &pfn, &error, &fuse_iomap_ops);
+ ret = dax_iomap_fault(vmf, order, &pfn, &error, &fuse_iomap_ops);
if ((ret & VM_FAULT_ERROR) && error == -EAGAIN) {
error = 0;
retry = true;
@@ -818,7 +818,7 @@ retry:
}
if (ret & VM_FAULT_NEEDDSYNC)
- ret = dax_finish_sync_fault(vmf, pe_size, pfn);
+ ret = dax_finish_sync_fault(vmf, order, pfn);
filemap_invalidate_unlock_shared(inode->i_mapping);
if (write)
@@ -829,24 +829,22 @@ retry:
static vm_fault_t fuse_dax_fault(struct vm_fault *vmf)
{
- return __fuse_dax_fault(vmf, PE_SIZE_PTE,
- vmf->flags & FAULT_FLAG_WRITE);
+ return __fuse_dax_fault(vmf, 0, vmf->flags & FAULT_FLAG_WRITE);
}
-static vm_fault_t fuse_dax_huge_fault(struct vm_fault *vmf,
- enum page_entry_size pe_size)
+static vm_fault_t fuse_dax_huge_fault(struct vm_fault *vmf, unsigned int order)
{
- return __fuse_dax_fault(vmf, pe_size, vmf->flags & FAULT_FLAG_WRITE);
+ return __fuse_dax_fault(vmf, order, vmf->flags & FAULT_FLAG_WRITE);
}
static vm_fault_t fuse_dax_page_mkwrite(struct vm_fault *vmf)
{
- return __fuse_dax_fault(vmf, PE_SIZE_PTE, true);
+ return __fuse_dax_fault(vmf, 0, true);
}
static vm_fault_t fuse_dax_pfn_mkwrite(struct vm_fault *vmf)
{
- return __fuse_dax_fault(vmf, PE_SIZE_PTE, true);
+ return __fuse_dax_fault(vmf, 0, true);
}
static const struct vm_operations_struct fuse_dax_vm_ops = {
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index f67bef9d83c4..d707e6987da9 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -92,7 +92,7 @@ static void fuse_dentry_settime(struct dentry *dentry, u64 time)
/*
* Calculate the time in jiffies until a dentry/attributes are valid
*/
-static u64 time_to_jiffies(u64 sec, u32 nsec)
+u64 fuse_time_to_jiffies(u64 sec, u32 nsec)
{
if (sec || nsec) {
struct timespec64 ts = {
@@ -112,17 +112,7 @@ static u64 time_to_jiffies(u64 sec, u32 nsec)
void fuse_change_entry_timeout(struct dentry *entry, struct fuse_entry_out *o)
{
fuse_dentry_settime(entry,
- time_to_jiffies(o->entry_valid, o->entry_valid_nsec));
-}
-
-static u64 attr_timeout(struct fuse_attr_out *o)
-{
- return time_to_jiffies(o->attr_valid, o->attr_valid_nsec);
-}
-
-u64 entry_attr_timeout(struct fuse_entry_out *o)
-{
- return time_to_jiffies(o->attr_valid, o->attr_valid_nsec);
+ fuse_time_to_jiffies(o->entry_valid, o->entry_valid_nsec));
}
void fuse_invalidate_attr_mask(struct inode *inode, u32 mask)
@@ -265,8 +255,8 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags)
goto invalid;
forget_all_cached_acls(inode);
- fuse_change_attributes(inode, &outarg.attr,
- entry_attr_timeout(&outarg),
+ fuse_change_attributes(inode, &outarg.attr, NULL,
+ ATTR_TIMEOUT(&outarg),
attr_version);
fuse_change_entry_timeout(entry, &outarg);
} else if (inode) {
@@ -360,10 +350,14 @@ int fuse_valid_type(int m)
S_ISBLK(m) || S_ISFIFO(m) || S_ISSOCK(m);
}
+static bool fuse_valid_size(u64 size)
+{
+ return size <= LLONG_MAX;
+}
+
bool fuse_invalid_attr(struct fuse_attr *attr)
{
- return !fuse_valid_type(attr->mode) ||
- attr->size > LLONG_MAX;
+ return !fuse_valid_type(attr->mode) || !fuse_valid_size(attr->size);
}
int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name,
@@ -399,7 +393,7 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name
goto out_put_forget;
*inode = fuse_iget(sb, outarg->nodeid, outarg->generation,
- &outarg->attr, entry_attr_timeout(outarg),
+ &outarg->attr, ATTR_TIMEOUT(outarg),
attr_version);
err = -ENOMEM;
if (!*inode) {
@@ -686,7 +680,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
ff->nodeid = outentry.nodeid;
ff->open_flags = outopen.open_flags;
inode = fuse_iget(dir->i_sb, outentry.nodeid, outentry.generation,
- &outentry.attr, entry_attr_timeout(&outentry), 0);
+ &outentry.attr, ATTR_TIMEOUT(&outentry), 0);
if (!inode) {
flags &= ~(O_CREAT | O_EXCL | O_TRUNC);
fuse_sync_release(NULL, ff, flags);
@@ -755,7 +749,8 @@ static int fuse_atomic_open(struct inode *dir, struct dentry *entry,
if (err == -ENOSYS) {
fc->no_create = 1;
goto mknod;
- }
+ } else if (err == -EEXIST)
+ fuse_invalidate_entry(entry);
out_dput:
dput(res);
return err;
@@ -813,7 +808,7 @@ static int create_new_entry(struct fuse_mount *fm, struct fuse_args *args,
goto out_put_forget_req;
inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation,
- &outarg.attr, entry_attr_timeout(&outarg), 0);
+ &outarg.attr, ATTR_TIMEOUT(&outarg), 0);
if (!inode) {
fuse_queue_forget(fm->fc, forget, outarg.nodeid, 1);
return -ENOMEM;
@@ -835,6 +830,8 @@ static int create_new_entry(struct fuse_mount *fm, struct fuse_args *args,
return 0;
out_put_forget_req:
+ if (err == -EEXIST)
+ fuse_invalidate_entry(entry);
kfree(forget);
return err;
}
@@ -933,7 +930,7 @@ void fuse_flush_time_update(struct inode *inode)
static void fuse_update_ctime_in_cache(struct inode *inode)
{
if (!IS_NOCMTIME(inode)) {
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
mark_inode_dirty_sync(inode);
fuse_flush_time_update(inode);
}
@@ -986,7 +983,7 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry)
if (!err) {
fuse_dir_changed(dir);
fuse_entry_unlinked(entry);
- } else if (err == -EINTR)
+ } else if (err == -EINTR || err == -ENOENT)
fuse_invalidate_entry(entry);
return err;
}
@@ -1009,7 +1006,7 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry)
if (!err) {
fuse_dir_changed(dir);
fuse_entry_unlinked(entry);
- } else if (err == -EINTR)
+ } else if (err == -EINTR || err == -ENOENT)
fuse_invalidate_entry(entry);
return err;
}
@@ -1050,7 +1047,7 @@ static int fuse_rename_common(struct inode *olddir, struct dentry *oldent,
/* newent will end up negative */
if (!(flags & RENAME_EXCHANGE) && d_really_is_positive(newent))
fuse_entry_unlinked(newent);
- } else if (err == -EINTR) {
+ } else if (err == -EINTR || err == -ENOENT) {
/* If request was interrupted, DEITY only knows if the
rename actually took place. If the invalidation
fails (e.g. some process has CWD under the renamed
@@ -1153,6 +1150,87 @@ static void fuse_fillattr(struct inode *inode, struct fuse_attr *attr,
stat->blksize = 1 << blkbits;
}
+static void fuse_statx_to_attr(struct fuse_statx *sx, struct fuse_attr *attr)
+{
+ memset(attr, 0, sizeof(*attr));
+ attr->ino = sx->ino;
+ attr->size = sx->size;
+ attr->blocks = sx->blocks;
+ attr->atime = sx->atime.tv_sec;
+ attr->mtime = sx->mtime.tv_sec;
+ attr->ctime = sx->ctime.tv_sec;
+ attr->atimensec = sx->atime.tv_nsec;
+ attr->mtimensec = sx->mtime.tv_nsec;
+ attr->ctimensec = sx->ctime.tv_nsec;
+ attr->mode = sx->mode;
+ attr->nlink = sx->nlink;
+ attr->uid = sx->uid;
+ attr->gid = sx->gid;
+ attr->rdev = new_encode_dev(MKDEV(sx->rdev_major, sx->rdev_minor));
+ attr->blksize = sx->blksize;
+}
+
+static int fuse_do_statx(struct inode *inode, struct file *file,
+ struct kstat *stat)
+{
+ int err;
+ struct fuse_attr attr;
+ struct fuse_statx *sx;
+ struct fuse_statx_in inarg;
+ struct fuse_statx_out outarg;
+ struct fuse_mount *fm = get_fuse_mount(inode);
+ u64 attr_version = fuse_get_attr_version(fm->fc);
+ FUSE_ARGS(args);
+
+ memset(&inarg, 0, sizeof(inarg));
+ memset(&outarg, 0, sizeof(outarg));
+ /* Directories have separate file-handle space */
+ if (file && S_ISREG(inode->i_mode)) {
+ struct fuse_file *ff = file->private_data;
+
+ inarg.getattr_flags |= FUSE_GETATTR_FH;
+ inarg.fh = ff->fh;
+ }
+ /* For now leave sync hints as the default, request all stats. */
+ inarg.sx_flags = 0;
+ inarg.sx_mask = STATX_BASIC_STATS | STATX_BTIME;
+ args.opcode = FUSE_STATX;
+ args.nodeid = get_node_id(inode);
+ args.in_numargs = 1;
+ args.in_args[0].size = sizeof(inarg);
+ args.in_args[0].value = &inarg;
+ args.out_numargs = 1;
+ args.out_args[0].size = sizeof(outarg);
+ args.out_args[0].value = &outarg;
+ err = fuse_simple_request(fm, &args);
+ if (err)
+ return err;
+
+ sx = &outarg.stat;
+ if (((sx->mask & STATX_SIZE) && !fuse_valid_size(sx->size)) ||
+ ((sx->mask & STATX_TYPE) && (!fuse_valid_type(sx->mode) ||
+ inode_wrong_type(inode, sx->mode)))) {
+ make_bad_inode(inode);
+ return -EIO;
+ }
+
+ fuse_statx_to_attr(&outarg.stat, &attr);
+ if ((sx->mask & STATX_BASIC_STATS) == STATX_BASIC_STATS) {
+ fuse_change_attributes(inode, &attr, &outarg.stat,
+ ATTR_TIMEOUT(&outarg), attr_version);
+ }
+
+ if (stat) {
+ stat->result_mask = sx->mask & (STATX_BASIC_STATS | STATX_BTIME);
+ stat->btime.tv_sec = sx->btime.tv_sec;
+ stat->btime.tv_nsec = min_t(u32, sx->btime.tv_nsec, NSEC_PER_SEC - 1);
+ fuse_fillattr(inode, &attr, stat);
+ stat->result_mask |= STATX_TYPE;
+ }
+
+ return 0;
+}
+
static int fuse_do_getattr(struct inode *inode, struct kstat *stat,
struct file *file)
{
@@ -1189,8 +1267,8 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat,
fuse_make_bad(inode);
err = -EIO;
} else {
- fuse_change_attributes(inode, &outarg.attr,
- attr_timeout(&outarg),
+ fuse_change_attributes(inode, &outarg.attr, NULL,
+ ATTR_TIMEOUT(&outarg),
attr_version);
if (stat)
fuse_fillattr(inode, &outarg.attr, stat);
@@ -1204,12 +1282,22 @@ static int fuse_update_get_attr(struct inode *inode, struct file *file,
unsigned int flags)
{
struct fuse_inode *fi = get_fuse_inode(inode);
+ struct fuse_conn *fc = get_fuse_conn(inode);
int err = 0;
bool sync;
u32 inval_mask = READ_ONCE(fi->inval_mask);
u32 cache_mask = fuse_get_cache_mask(inode);
- if (flags & AT_STATX_FORCE_SYNC)
+
+ /* FUSE only supports basic stats and possibly btime */
+ request_mask &= STATX_BASIC_STATS | STATX_BTIME;
+retry:
+ if (fc->no_statx)
+ request_mask &= STATX_BASIC_STATS;
+
+ if (!request_mask)
+ sync = false;
+ else if (flags & AT_STATX_FORCE_SYNC)
sync = true;
else if (flags & AT_STATX_DONT_SYNC)
sync = false;
@@ -1220,11 +1308,24 @@ static int fuse_update_get_attr(struct inode *inode, struct file *file,
if (sync) {
forget_all_cached_acls(inode);
- err = fuse_do_getattr(inode, stat, file);
+ /* Try statx if BTIME is requested */
+ if (!fc->no_statx && (request_mask & ~STATX_BASIC_STATS)) {
+ err = fuse_do_statx(inode, file, stat);
+ if (err == -ENOSYS) {
+ fc->no_statx = 1;
+ goto retry;
+ }
+ } else {
+ err = fuse_do_getattr(inode, stat, file);
+ }
} else if (stat) {
- generic_fillattr(&nop_mnt_idmap, inode, stat);
+ generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
stat->mode = fi->orig_i_mode;
stat->ino = fi->orig_ino;
+ if (test_bit(FUSE_I_BTIME, &fi->state)) {
+ stat->btime = fi->i_btime;
+ stat->result_mask |= STATX_BTIME;
+ }
}
return err;
@@ -1715,8 +1816,8 @@ int fuse_flush_times(struct inode *inode, struct fuse_file *ff)
inarg.mtimensec = inode->i_mtime.tv_nsec;
if (fm->fc->minor >= 23) {
inarg.valid |= FATTR_CTIME;
- inarg.ctime = inode->i_ctime.tv_sec;
- inarg.ctimensec = inode->i_ctime.tv_nsec;
+ inarg.ctime = inode_get_ctime(inode).tv_sec;
+ inarg.ctimensec = inode_get_ctime(inode).tv_nsec;
}
if (ff) {
inarg.valid |= FATTR_FH;
@@ -1857,12 +1958,12 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
if (attr->ia_valid & ATTR_MTIME)
inode->i_mtime = attr->ia_mtime;
if (attr->ia_valid & ATTR_CTIME)
- inode->i_ctime = attr->ia_ctime;
+ inode_set_ctime_to_ts(inode, attr->ia_ctime);
/* FIXME: clear I_DIRTY_SYNC? */
}
- fuse_change_attributes_common(inode, &outarg.attr,
- attr_timeout(&outarg),
+ fuse_change_attributes_common(inode, &outarg.attr, NULL,
+ ATTR_TIMEOUT(&outarg),
fuse_get_cache_mask(inode));
oldsize = inode->i_size;
/* see the comment in fuse_change_attributes() */
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index bc4115288eec..1cdb6327511e 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -19,7 +19,6 @@
#include <linux/uio.h>
#include <linux/fs.h>
#include <linux/filelock.h>
-#include <linux/file.h>
static int fuse_send_open(struct fuse_mount *fm, u64 nodeid,
unsigned int open_flags, int opcode,
@@ -479,36 +478,48 @@ static void fuse_sync_writes(struct inode *inode)
fuse_release_nowrite(inode);
}
-struct fuse_flush_args {
- struct fuse_args args;
- struct fuse_flush_in inarg;
- struct work_struct work;
- struct file *file;
-};
-
-static int fuse_do_flush(struct fuse_flush_args *fa)
+static int fuse_flush(struct file *file, fl_owner_t id)
{
- int err;
- struct inode *inode = file_inode(fa->file);
+ struct inode *inode = file_inode(file);
struct fuse_mount *fm = get_fuse_mount(inode);
+ struct fuse_file *ff = file->private_data;
+ struct fuse_flush_in inarg;
+ FUSE_ARGS(args);
+ int err;
+
+ if (fuse_is_bad(inode))
+ return -EIO;
+
+ if (ff->open_flags & FOPEN_NOFLUSH && !fm->fc->writeback_cache)
+ return 0;
err = write_inode_now(inode, 1);
if (err)
- goto out;
+ return err;
inode_lock(inode);
fuse_sync_writes(inode);
inode_unlock(inode);
- err = filemap_check_errors(fa->file->f_mapping);
+ err = filemap_check_errors(file->f_mapping);
if (err)
- goto out;
+ return err;
err = 0;
if (fm->fc->no_flush)
goto inval_attr_out;
- err = fuse_simple_request(fm, &fa->args);
+ memset(&inarg, 0, sizeof(inarg));
+ inarg.fh = ff->fh;
+ inarg.lock_owner = fuse_lock_owner_id(fm->fc, id);
+ args.opcode = FUSE_FLUSH;
+ args.nodeid = get_node_id(inode);
+ args.in_numargs = 1;
+ args.in_args[0].size = sizeof(inarg);
+ args.in_args[0].value = &inarg;
+ args.force = true;
+
+ err = fuse_simple_request(fm, &args);
if (err == -ENOSYS) {
fm->fc->no_flush = 1;
err = 0;
@@ -521,57 +532,9 @@ inval_attr_out:
*/
if (!err && fm->fc->writeback_cache)
fuse_invalidate_attr_mask(inode, STATX_BLOCKS);
-
-out:
- fput(fa->file);
- kfree(fa);
return err;
}
-static void fuse_flush_async(struct work_struct *work)
-{
- struct fuse_flush_args *fa = container_of(work, typeof(*fa), work);
-
- fuse_do_flush(fa);
-}
-
-static int fuse_flush(struct file *file, fl_owner_t id)
-{
- struct fuse_flush_args *fa;
- struct inode *inode = file_inode(file);
- struct fuse_mount *fm = get_fuse_mount(inode);
- struct fuse_file *ff = file->private_data;
-
- if (fuse_is_bad(inode))
- return -EIO;
-
- if (ff->open_flags & FOPEN_NOFLUSH && !fm->fc->writeback_cache)
- return 0;
-
- fa = kzalloc(sizeof(*fa), GFP_KERNEL);
- if (!fa)
- return -ENOMEM;
-
- fa->inarg.fh = ff->fh;
- fa->inarg.lock_owner = fuse_lock_owner_id(fm->fc, id);
- fa->args.opcode = FUSE_FLUSH;
- fa->args.nodeid = get_node_id(inode);
- fa->args.in_numargs = 1;
- fa->args.in_args[0].size = sizeof(fa->inarg);
- fa->args.in_args[0].value = &fa->inarg;
- fa->args.force = true;
- fa->file = get_file(file);
-
- /* Don't wait if the task is exiting */
- if (current->flags & PF_EXITING) {
- INIT_WORK(&fa->work, fuse_flush_async);
- schedule_work(&fa->work);
- return 0;
- }
-
- return fuse_do_flush(fa);
-}
-
int fuse_fsync_common(struct file *file, loff_t start, loff_t end,
int datasync, int opcode)
{
@@ -1465,7 +1428,8 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
int write = flags & FUSE_DIO_WRITE;
int cuse = flags & FUSE_DIO_CUSE;
struct file *file = io->iocb->ki_filp;
- struct inode *inode = file->f_mapping->host;
+ struct address_space *mapping = file->f_mapping;
+ struct inode *inode = mapping->host;
struct fuse_file *ff = file->private_data;
struct fuse_conn *fc = ff->fm->fc;
size_t nmax = write ? fc->max_write : fc->max_read;
@@ -1477,12 +1441,20 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
int err = 0;
struct fuse_io_args *ia;
unsigned int max_pages;
+ bool fopen_direct_io = ff->open_flags & FOPEN_DIRECT_IO;
max_pages = iov_iter_npages(iter, fc->max_pages);
ia = fuse_io_alloc(io, max_pages);
if (!ia)
return -ENOMEM;
+ if (fopen_direct_io && fc->direct_io_relax) {
+ res = filemap_write_and_wait_range(mapping, pos, pos + count - 1);
+ if (res) {
+ fuse_io_free(ia);
+ return res;
+ }
+ }
if (!cuse && fuse_range_is_writeback(inode, idx_from, idx_to)) {
if (!write)
inode_lock(inode);
@@ -1491,6 +1463,14 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
inode_unlock(inode);
}
+ if (fopen_direct_io && write) {
+ res = invalidate_inode_pages2_range(mapping, idx_from, idx_to);
+ if (res) {
+ fuse_io_free(ia);
+ return res;
+ }
+ }
+
io->should_dirty = !write && user_backed_iter(iter);
while (count) {
ssize_t nres;
@@ -2478,14 +2458,17 @@ static const struct vm_operations_struct fuse_file_vm_ops = {
static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)
{
struct fuse_file *ff = file->private_data;
+ struct fuse_conn *fc = ff->fm->fc;
/* DAX mmap is superior to direct_io mmap */
if (FUSE_IS_DAX(file_inode(file)))
return fuse_dax_mmap(file, vma);
if (ff->open_flags & FOPEN_DIRECT_IO) {
- /* Can't provide the coherency needed for MAP_SHARED */
- if (vma->vm_flags & VM_MAYSHARE)
+ /* Can't provide the coherency needed for MAP_SHARED
+ * if FUSE_DIRECT_IO_RELAX isn't set.
+ */
+ if ((vma->vm_flags & VM_MAYSHARE) && !fc->direct_io_relax)
return -ENODEV;
invalidate_inode_pages2(file->f_mapping);
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 9b7fc7d3c7f1..bf0b85d0b95c 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -88,6 +88,9 @@ struct fuse_inode {
preserve the original mode */
umode_t orig_i_mode;
+ /* Cache birthtime */
+ struct timespec64 i_btime;
+
/** 64 bit inode number */
u64 orig_ino;
@@ -167,6 +170,8 @@ enum {
FUSE_I_SIZE_UNSTABLE,
/* Bad inode */
FUSE_I_BAD,
+ /* Has btime */
+ FUSE_I_BTIME,
};
struct fuse_conn;
@@ -792,6 +797,12 @@ struct fuse_conn {
/* Is tmpfile not implemented by fs? */
unsigned int no_tmpfile:1;
+ /* relax restrictions in FOPEN_DIRECT_IO mode */
+ unsigned int direct_io_relax:1;
+
+ /* Is statx not implemented by fs? */
+ unsigned int no_statx:1;
+
/** The number of requests waiting for completion */
atomic_t num_waiting;
@@ -1058,9 +1069,11 @@ void fuse_init_symlink(struct inode *inode);
* Change attributes of an inode
*/
void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
+ struct fuse_statx *sx,
u64 attr_valid, u64 attr_version);
void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
+ struct fuse_statx *sx,
u64 attr_valid, u32 cache_mask);
u32 fuse_get_cache_mask(struct inode *inode);
@@ -1111,7 +1124,10 @@ void fuse_invalidate_entry_cache(struct dentry *entry);
void fuse_invalidate_atime(struct inode *inode);
-u64 entry_attr_timeout(struct fuse_entry_out *o);
+u64 fuse_time_to_jiffies(u64 sec, u32 nsec);
+#define ATTR_TIMEOUT(o) \
+ fuse_time_to_jiffies((o)->attr_valid, (o)->attr_valid_nsec)
+
void fuse_change_entry_timeout(struct dentry *entry, struct fuse_entry_out *o);
/**
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index f19d748890f0..2e4eb7cf26fb 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -77,7 +77,7 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)
return NULL;
fi->i_time = 0;
- fi->inval_mask = 0;
+ fi->inval_mask = ~0;
fi->nodeid = 0;
fi->nlookup = 0;
fi->attr_version = 0;
@@ -163,6 +163,7 @@ static ino_t fuse_squash_ino(u64 ino64)
}
void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
+ struct fuse_statx *sx,
u64 attr_valid, u32 cache_mask)
{
struct fuse_conn *fc = get_fuse_conn(inode);
@@ -172,7 +173,8 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
fi->attr_version = atomic64_inc_return(&fc->attr_version);
fi->i_time = attr_valid;
- WRITE_ONCE(fi->inval_mask, 0);
+ /* Clear basic stats from invalid mask */
+ set_mask_bits(&fi->inval_mask, STATX_BASIC_STATS, 0);
inode->i_ino = fuse_squash_ino(attr->ino);
inode->i_mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777);
@@ -194,8 +196,26 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
inode->i_mtime.tv_nsec = attr->mtimensec;
}
if (!(cache_mask & STATX_CTIME)) {
- inode->i_ctime.tv_sec = attr->ctime;
- inode->i_ctime.tv_nsec = attr->ctimensec;
+ inode_set_ctime(inode, attr->ctime, attr->ctimensec);
+ }
+ if (sx) {
+ /* Sanitize nsecs */
+ sx->btime.tv_nsec =
+ min_t(u32, sx->btime.tv_nsec, NSEC_PER_SEC - 1);
+
+ /*
+ * Btime has been queried, cache is valid (whether or not btime
+ * is available or not) so clear STATX_BTIME from inval_mask.
+ *
+ * Availability of the btime attribute is indicated in
+ * FUSE_I_BTIME
+ */
+ set_mask_bits(&fi->inval_mask, STATX_BTIME, 0);
+ if (sx->mask & STATX_BTIME) {
+ set_bit(FUSE_I_BTIME, &fi->state);
+ fi->i_btime.tv_sec = sx->btime.tv_sec;
+ fi->i_btime.tv_nsec = sx->btime.tv_nsec;
+ }
}
if (attr->blksize != 0)
@@ -236,6 +256,7 @@ u32 fuse_get_cache_mask(struct inode *inode)
}
void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
+ struct fuse_statx *sx,
u64 attr_valid, u64 attr_version)
{
struct fuse_conn *fc = get_fuse_conn(inode);
@@ -259,8 +280,8 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
attr->mtimensec = inode->i_mtime.tv_nsec;
}
if (cache_mask & STATX_CTIME) {
- attr->ctime = inode->i_ctime.tv_sec;
- attr->ctimensec = inode->i_ctime.tv_nsec;
+ attr->ctime = inode_get_ctime(inode).tv_sec;
+ attr->ctimensec = inode_get_ctime(inode).tv_nsec;
}
if ((attr_version != 0 && fi->attr_version > attr_version) ||
@@ -270,7 +291,7 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
}
old_mtime = inode->i_mtime;
- fuse_change_attributes_common(inode, attr, attr_valid, cache_mask);
+ fuse_change_attributes_common(inode, attr, sx, attr_valid, cache_mask);
oldsize = inode->i_size;
/*
@@ -318,8 +339,7 @@ static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr,
inode->i_size = attr->size;
inode->i_mtime.tv_sec = attr->mtime;
inode->i_mtime.tv_nsec = attr->mtimensec;
- inode->i_ctime.tv_sec = attr->ctime;
- inode->i_ctime.tv_nsec = attr->ctimensec;
+ inode_set_ctime(inode, attr->ctime, attr->ctimensec);
if (S_ISREG(inode->i_mode)) {
fuse_init_common(inode);
fuse_init_file_inode(inode, attr->flags);
@@ -408,7 +428,7 @@ done:
spin_lock(&fi->lock);
fi->nlookup++;
spin_unlock(&fi->lock);
- fuse_change_attributes(inode, attr, attr_valid, attr_version);
+ fuse_change_attributes(inode, attr, NULL, attr_valid, attr_version);
return inode;
}
@@ -1212,6 +1232,8 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args,
fc->init_security = 1;
if (flags & FUSE_CREATE_SUPP_GROUP)
fc->create_supp_group = 1;
+ if (flags & FUSE_DIRECT_IO_RELAX)
+ fc->direct_io_relax = 1;
} else {
ra_pages = fc->max_read / PAGE_SIZE;
fc->no_lock = 1;
@@ -1258,7 +1280,7 @@ void fuse_send_init(struct fuse_mount *fm)
FUSE_NO_OPENDIR_SUPPORT | FUSE_EXPLICIT_INVAL_DATA |
FUSE_HANDLE_KILLPRIV_V2 | FUSE_SETXATTR_EXT | FUSE_INIT_EXT |
FUSE_SECURITY_CTX | FUSE_CREATE_SUPP_GROUP |
- FUSE_HAS_EXPIRE_ONLY;
+ FUSE_HAS_EXPIRE_ONLY | FUSE_DIRECT_IO_RELAX;
#ifdef CONFIG_FUSE_DAX
if (fm->fc->dax)
flags |= FUSE_MAP_ALIGNMENT;
@@ -1401,16 +1423,18 @@ EXPORT_SYMBOL_GPL(fuse_dev_free);
static void fuse_fill_attr_from_inode(struct fuse_attr *attr,
const struct fuse_inode *fi)
{
+ struct timespec64 ctime = inode_get_ctime(&fi->inode);
+
*attr = (struct fuse_attr){
.ino = fi->inode.i_ino,
.size = fi->inode.i_size,
.blocks = fi->inode.i_blocks,
.atime = fi->inode.i_atime.tv_sec,
.mtime = fi->inode.i_mtime.tv_sec,
- .ctime = fi->inode.i_ctime.tv_sec,
+ .ctime = ctime.tv_sec,
.atimensec = fi->inode.i_atime.tv_nsec,
.mtimensec = fi->inode.i_mtime.tv_nsec,
- .ctimensec = fi->inode.i_ctime.tv_nsec,
+ .ctimensec = ctime.tv_nsec,
.mode = fi->inode.i_mode,
.nlink = fi->inode.i_nlink,
.uid = fi->inode.i_uid.val,
diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c
index dc603479b30e..9e6d587b3e67 100644
--- a/fs/fuse/readdir.c
+++ b/fs/fuse/readdir.c
@@ -223,8 +223,8 @@ retry:
spin_unlock(&fi->lock);
forget_all_cached_acls(inode);
- fuse_change_attributes(inode, &o->attr,
- entry_attr_timeout(o),
+ fuse_change_attributes(inode, &o->attr, NULL,
+ ATTR_TIMEOUT(o),
attr_version);
/*
* The other branch comes via fuse_iget()
@@ -232,7 +232,7 @@ retry:
*/
} else {
inode = fuse_iget(dir->i_sb, o->nodeid, o->generation,
- &o->attr, entry_attr_timeout(o),
+ &o->attr, ATTR_TIMEOUT(o),
attr_version);
if (!inode)
inode = ERR_PTR(-ENOMEM);
@@ -243,8 +243,16 @@ retry:
dput(dentry);
dentry = alias;
}
- if (IS_ERR(dentry))
+ if (IS_ERR(dentry)) {
+ if (!IS_ERR(inode)) {
+ struct fuse_inode *fi = get_fuse_inode(inode);
+
+ spin_lock(&fi->lock);
+ fi->nlookup--;
+ spin_unlock(&fi->lock);
+ }
return PTR_ERR(dentry);
+ }
}
if (fc->readdirplus_auto)
set_bit(FUSE_I_INIT_RDPLUS, &get_fuse_inode(inode)->state);
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
index 03c966840422..be7f87a8e11a 100644
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -1,6 +1,7 @@
# SPDX-License-Identifier: GPL-2.0-only
config GFS2_FS
tristate "GFS2 file system support"
+ select BUFFER_HEAD
select FS_POSIX_ACL
select CRC32
select LIBCRC32C
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index a392aa0f041d..443640e6fb9c 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -142,7 +142,7 @@ int gfs2_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
ret = __gfs2_set_acl(inode, acl, type);
if (!ret && mode != inode->i_mode) {
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
inode->i_mode = mode;
mark_inode_dirty(inode);
}
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index ae49256b7c8c..c26d48355cc2 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -183,13 +183,13 @@ static int gfs2_writepages(struct address_space *mapping,
int ret;
/*
- * Even if we didn't write any pages here, we might still be holding
+ * Even if we didn't write enough pages here, we might still be holding
* dirty pages in the ail. We forcibly flush the ail because we don't
* want balance_dirty_pages() to loop indefinitely trying to write out
* pages held in the ail that it can't find.
*/
ret = iomap_writepages(mapping, wbc, &wpc, &gfs2_writeback_ops);
- if (ret == 0)
+ if (ret == 0 && wbc->nr_to_write > 0)
set_bit(SDF_FORCE_AIL_FLUSH, &sdp->sd_flags);
return ret;
}
@@ -272,8 +272,7 @@ continue_unlock:
* not be suitable for data integrity
* writeout).
*/
- *done_index = folio->index +
- folio_nr_pages(folio);
+ *done_index = folio_next_index(folio);
ret = 1;
break;
}
@@ -747,7 +746,7 @@ static const struct address_space_operations gfs2_aops = {
.writepages = gfs2_writepages,
.read_folio = gfs2_read_folio,
.readahead = gfs2_readahead,
- .dirty_folio = filemap_dirty_folio,
+ .dirty_folio = iomap_dirty_folio,
.release_folio = iomap_release_folio,
.invalidate_folio = iomap_invalidate_folio,
.bmap = gfs2_bmap,
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 8d611fbcf0bd..ef7017fb6951 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -161,7 +161,7 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip)
int error;
down_write(&ip->i_rw_mutex);
- page = find_or_create_page(inode->i_mapping, 0, GFP_NOFS);
+ page = grab_cache_page(inode->i_mapping, 0);
error = -ENOMEM;
if (!page)
goto out;
@@ -971,7 +971,7 @@ gfs2_iomap_get_folio(struct iomap_iter *iter, loff_t pos, unsigned len)
if (status)
return ERR_PTR(status);
- folio = iomap_get_folio(iter, pos);
+ folio = iomap_get_folio(iter, pos, len);
if (IS_ERR(folio))
gfs2_trans_end(sdp);
return folio;
@@ -1386,7 +1386,7 @@ static int trunc_start(struct inode *inode, u64 newsize)
ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG;
i_size_write(inode, newsize);
- ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
+ ip->i_inode.i_mtime = inode_set_ctime_current(&ip->i_inode);
gfs2_dinode_out(ip, dibh->b_data);
if (journaled)
@@ -1583,8 +1583,7 @@ out_unlock:
/* Every transaction boundary, we rewrite the dinode
to keep its di_blocks current in case of failure. */
- ip->i_inode.i_mtime = ip->i_inode.i_ctime =
- current_time(&ip->i_inode);
+ ip->i_inode.i_mtime = inode_set_ctime_current(&ip->i_inode);
gfs2_trans_add_meta(ip->i_gl, dibh);
gfs2_dinode_out(ip, dibh->b_data);
brelse(dibh);
@@ -1950,7 +1949,7 @@ static int punch_hole(struct gfs2_inode *ip, u64 offset, u64 length)
gfs2_statfs_change(sdp, 0, +btotal, 0);
gfs2_quota_change(ip, -(s64)btotal, ip->i_inode.i_uid,
ip->i_inode.i_gid);
- ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
+ ip->i_inode.i_mtime = inode_set_ctime_current(&ip->i_inode);
gfs2_trans_add_meta(ip->i_gl, dibh);
gfs2_dinode_out(ip, dibh->b_data);
up_write(&ip->i_rw_mutex);
@@ -1993,7 +1992,7 @@ static int trunc_end(struct gfs2_inode *ip)
gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
gfs2_ordered_del_inode(ip);
}
- ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
+ ip->i_inode.i_mtime = inode_set_ctime_current(&ip->i_inode);
ip->i_diskflags &= ~GFS2_DIF_TRUNC_IN_PROG;
gfs2_trans_add_meta(ip->i_gl, dibh);
@@ -2094,7 +2093,7 @@ static int do_grow(struct inode *inode, u64 size)
goto do_end_trans;
truncate_setsize(inode, size);
- ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
+ ip->i_inode.i_mtime = inode_set_ctime_current(&ip->i_inode);
gfs2_trans_add_meta(ip->i_gl, dibh);
gfs2_dinode_out(ip, dibh->b_data);
brelse(dibh);
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 54a6d17b8c25..1a2afa88f8be 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -130,7 +130,7 @@ static int gfs2_dir_write_stuffed(struct gfs2_inode *ip, const char *buf,
memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size);
if (ip->i_inode.i_size < offset + size)
i_size_write(&ip->i_inode, offset + size);
- ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
+ ip->i_inode.i_mtime = inode_set_ctime_current(&ip->i_inode);
gfs2_dinode_out(ip, dibh->b_data);
brelse(dibh);
@@ -227,7 +227,7 @@ out:
if (ip->i_inode.i_size < offset + copied)
i_size_write(&ip->i_inode, offset + copied);
- ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
+ ip->i_inode.i_mtime = inode_set_ctime_current(&ip->i_inode);
gfs2_trans_add_meta(ip->i_gl, dibh);
gfs2_dinode_out(ip, dibh->b_data);
@@ -1814,7 +1814,7 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name,
gfs2_inum_out(nip, dent);
dent->de_type = cpu_to_be16(IF2DT(nip->i_inode.i_mode));
dent->de_rahead = cpu_to_be16(gfs2_inode_ra_len(nip));
- tv = current_time(&ip->i_inode);
+ tv = inode_set_ctime_current(&ip->i_inode);
if (ip->i_diskflags & GFS2_DIF_EXHASH) {
leaf = (struct gfs2_leaf *)bh->b_data;
be16_add_cpu(&leaf->lf_entries, 1);
@@ -1825,7 +1825,7 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name,
da->bh = NULL;
brelse(bh);
ip->i_entries++;
- ip->i_inode.i_mtime = ip->i_inode.i_ctime = tv;
+ ip->i_inode.i_mtime = tv;
if (S_ISDIR(nip->i_inode.i_mode))
inc_nlink(&ip->i_inode);
mark_inode_dirty(inode);
@@ -1876,7 +1876,7 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry)
const struct qstr *name = &dentry->d_name;
struct gfs2_dirent *dent, *prev = NULL;
struct buffer_head *bh;
- struct timespec64 tv = current_time(&dip->i_inode);
+ struct timespec64 tv;
/* Returns _either_ the entry (if its first in block) or the
previous entry otherwise */
@@ -1896,6 +1896,7 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry)
}
dirent_del(dip, bh, prev, dent);
+ tv = inode_set_ctime_current(&dip->i_inode);
if (dip->i_diskflags & GFS2_DIF_EXHASH) {
struct gfs2_leaf *leaf = (struct gfs2_leaf *)bh->b_data;
u16 entries = be16_to_cpu(leaf->lf_entries);
@@ -1910,7 +1911,7 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry)
if (!dip->i_entries)
gfs2_consist_inode(dip);
dip->i_entries--;
- dip->i_inode.i_mtime = dip->i_inode.i_ctime = tv;
+ dip->i_inode.i_mtime = tv;
if (d_is_dir(dentry))
drop_nlink(&dip->i_inode);
mark_inode_dirty(&dip->i_inode);
@@ -1951,7 +1952,7 @@ int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
dent->de_type = cpu_to_be16(new_type);
brelse(bh);
- dip->i_inode.i_mtime = dip->i_inode.i_ctime = current_time(&dip->i_inode);
+ dip->i_inode.i_mtime = inode_set_ctime_current(&dip->i_inode);
mark_inode_dirty_sync(&dip->i_inode);
return 0;
}
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index b43fa8b8fc05..f2700477a300 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -260,7 +260,7 @@ static int do_gfs2_set_flags(struct inode *inode, u32 reqflags, u32 mask)
error = gfs2_meta_inode_buffer(ip, &bh);
if (error)
goto out_trans_end;
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
gfs2_trans_add_meta(ip->i_gl, bh);
ip->i_diskflags = new_flags;
gfs2_dinode_out(ip, bh->b_data);
@@ -432,7 +432,7 @@ static vm_fault_t gfs2_page_mkwrite(struct vm_fault *vmf)
gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
err = gfs2_glock_nq(&gh);
if (err) {
- ret = block_page_mkwrite_return(err);
+ ret = vmf_fs_error(err);
goto out_uninit;
}
@@ -474,7 +474,7 @@ static vm_fault_t gfs2_page_mkwrite(struct vm_fault *vmf)
err = gfs2_rindex_update(sdp);
if (err) {
- ret = block_page_mkwrite_return(err);
+ ret = vmf_fs_error(err);
goto out_unlock;
}
@@ -482,12 +482,12 @@ static vm_fault_t gfs2_page_mkwrite(struct vm_fault *vmf)
ap.target = data_blocks + ind_blocks;
err = gfs2_quota_lock_check(ip, &ap);
if (err) {
- ret = block_page_mkwrite_return(err);
+ ret = vmf_fs_error(err);
goto out_unlock;
}
err = gfs2_inplace_reserve(ip, &ap);
if (err) {
- ret = block_page_mkwrite_return(err);
+ ret = vmf_fs_error(err);
goto out_quota_unlock;
}
@@ -500,7 +500,7 @@ static vm_fault_t gfs2_page_mkwrite(struct vm_fault *vmf)
}
err = gfs2_trans_begin(sdp, rblocks, 0);
if (err) {
- ret = block_page_mkwrite_return(err);
+ ret = vmf_fs_error(err);
goto out_trans_fail;
}
@@ -508,7 +508,7 @@ static vm_fault_t gfs2_page_mkwrite(struct vm_fault *vmf)
if (gfs2_is_stuffed(ip)) {
err = gfs2_unstuff_dinode(ip);
if (err) {
- ret = block_page_mkwrite_return(err);
+ ret = vmf_fs_error(err);
goto out_trans_end;
}
}
@@ -524,7 +524,7 @@ static vm_fault_t gfs2_page_mkwrite(struct vm_fault *vmf)
err = gfs2_allocate_page_backing(page, length);
if (err)
- ret = block_page_mkwrite_return(err);
+ ret = vmf_fs_error(err);
out_page_locked:
if (ret != VM_FAULT_LOCKED)
@@ -558,7 +558,7 @@ static vm_fault_t gfs2_fault(struct vm_fault *vmf)
gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
err = gfs2_glock_nq(&gh);
if (err) {
- ret = block_page_mkwrite_return(err);
+ ret = vmf_fs_error(err);
goto out_uninit;
}
ret = filemap_fault(vmf);
@@ -1436,17 +1436,14 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl)
if (!(fl->fl_flags & FL_POSIX))
return -ENOLCK;
- if (cmd == F_CANCELLK) {
- /* Hack: */
- cmd = F_SETLK;
- fl->fl_type = F_UNLCK;
- }
if (unlikely(gfs2_withdrawn(sdp))) {
if (fl->fl_type == F_UNLCK)
locks_lock_file_wait(file, fl);
return -EIO;
}
- if (IS_GETLK(cmd))
+ if (cmd == F_CANCELLK)
+ return dlm_posix_cancel(ls->ls_dlm, ip->i_no_addr, file, fl);
+ else if (IS_GETLK(cmd))
return dlm_posix_get(ls->ls_dlm, ip->i_no_addr, file, fl);
else if (fl->fl_type == F_UNLCK)
return dlm_posix_unlock(ls->ls_dlm, ip->i_no_addr, file, fl);
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 1438e7465e30..4a280be229a6 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -176,7 +176,7 @@ void gfs2_glock_free(struct gfs2_glock *gl)
wake_up_glock(gl);
call_rcu(&gl->gl_rcu, gfs2_glock_dealloc);
if (atomic_dec_and_test(&sdp->sd_glock_disposal))
- wake_up(&sdp->sd_glock_wait);
+ wake_up(&sdp->sd_kill_wait);
}
/**
@@ -468,10 +468,10 @@ done:
* do_promote - promote as many requests as possible on the current queue
* @gl: The glock
*
- * Returns: 1 if there is a blocked holder at the head of the list
+ * Returns true on success (i.e., progress was made or there are no waiters).
*/
-static int do_promote(struct gfs2_glock *gl)
+static bool do_promote(struct gfs2_glock *gl)
{
struct gfs2_holder *gh, *current_gh;
@@ -484,10 +484,10 @@ static int do_promote(struct gfs2_glock *gl)
* If we get here, it means we may not grant this
* holder for some reason. If this holder is at the
* head of the list, it means we have a blocked holder
- * at the head, so return 1.
+ * at the head, so return false.
*/
if (list_is_first(&gh->gh_list, &gl->gl_holders))
- return 1;
+ return false;
do_error(gl, 0);
break;
}
@@ -497,7 +497,7 @@ static int do_promote(struct gfs2_glock *gl)
if (!current_gh)
current_gh = gh;
}
- return 0;
+ return true;
}
/**
@@ -591,10 +591,11 @@ static void finish_xmote(struct gfs2_glock *gl, unsigned int ret)
if (gh && !test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)) {
/* move to back of queue and try next entry */
if (ret & LM_OUT_CANCELED) {
- if ((gh->gh_flags & LM_FLAG_PRIORITY) == 0)
- list_move_tail(&gh->gh_list, &gl->gl_holders);
+ list_move_tail(&gh->gh_list, &gl->gl_holders);
gh = find_first_waiter(gl);
gl->gl_target = gh->gh_state;
+ if (do_promote(gl))
+ goto out;
goto retry;
}
/* Some error or failed "try lock" - report it */
@@ -679,8 +680,7 @@ __acquires(&gl->gl_lockref.lock)
gh && !(gh->gh_flags & LM_FLAG_NOEXP))
goto skip_inval;
- lck_flags &= (LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP |
- LM_FLAG_PRIORITY);
+ lck_flags &= (LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP);
GLOCK_BUG_ON(gl, gl->gl_state == target);
GLOCK_BUG_ON(gl, gl->gl_state == gl->gl_target);
if ((target == LM_ST_UNLOCKED || target == LM_ST_DEFERRED) &&
@@ -834,7 +834,7 @@ __acquires(&gl->gl_lockref.lock)
} else {
if (test_bit(GLF_DEMOTE, &gl->gl_flags))
gfs2_demote_wake(gl);
- if (do_promote(gl) == 0)
+ if (do_promote(gl))
goto out_unlock;
gh = find_first_waiter(gl);
gl->gl_target = gh->gh_state;
@@ -1022,7 +1022,7 @@ static void delete_work_func(struct work_struct *work)
* step entirely.
*/
if (gfs2_try_evict(gl)) {
- if (test_bit(SDF_DEACTIVATING, &sdp->sd_flags))
+ if (test_bit(SDF_KILL, &sdp->sd_flags))
goto out;
if (gfs2_queue_verify_evict(gl))
return;
@@ -1035,7 +1035,7 @@ static void delete_work_func(struct work_struct *work)
GFS2_BLKST_UNLINKED);
if (IS_ERR(inode)) {
if (PTR_ERR(inode) == -EAGAIN &&
- !test_bit(SDF_DEACTIVATING, &sdp->sd_flags) &&
+ !test_bit(SDF_KILL, &sdp->sd_flags) &&
gfs2_queue_verify_evict(gl))
return;
} else {
@@ -1231,7 +1231,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
out_free:
gfs2_glock_dealloc(&gl->gl_rcu);
if (atomic_dec_and_test(&sdp->sd_glock_disposal))
- wake_up(&sdp->sd_glock_wait);
+ wake_up(&sdp->sd_kill_wait);
out:
return ret;
@@ -1515,27 +1515,20 @@ fail:
}
if (test_bit(HIF_HOLDER, &gh2->gh_iflags))
continue;
- if (unlikely((gh->gh_flags & LM_FLAG_PRIORITY) && !insert_pt))
- insert_pt = &gh2->gh_list;
}
trace_gfs2_glock_queue(gh, 1);
gfs2_glstats_inc(gl, GFS2_LKS_QCOUNT);
gfs2_sbstats_inc(gl, GFS2_LKS_QCOUNT);
if (likely(insert_pt == NULL)) {
list_add_tail(&gh->gh_list, &gl->gl_holders);
- if (unlikely(gh->gh_flags & LM_FLAG_PRIORITY))
- goto do_cancel;
return;
}
list_add_tail(&gh->gh_list, insert_pt);
-do_cancel:
gh = list_first_entry(&gl->gl_holders, struct gfs2_holder, gh_list);
- if (!(gh->gh_flags & LM_FLAG_PRIORITY)) {
- spin_unlock(&gl->gl_lockref.lock);
- if (sdp->sd_lockstruct.ls_ops->lm_cancel)
- sdp->sd_lockstruct.ls_ops->lm_cancel(gl);
- spin_lock(&gl->gl_lockref.lock);
- }
+ spin_unlock(&gl->gl_lockref.lock);
+ if (sdp->sd_lockstruct.ls_ops->lm_cancel)
+ sdp->sd_lockstruct.ls_ops->lm_cancel(gl);
+ spin_lock(&gl->gl_lockref.lock);
return;
trap_recursive:
@@ -2017,7 +2010,9 @@ static long gfs2_scan_glock_lru(int nr)
if (!test_bit(GLF_LOCK, &gl->gl_flags)) {
if (!spin_trylock(&gl->gl_lockref.lock))
continue;
- if (!gl->gl_lockref.count) {
+ if (gl->gl_lockref.count <= 1 &&
+ (gl->gl_state == LM_ST_UNLOCKED ||
+ demote_ok(gl))) {
list_move(&gl->gl_lru, &dispose);
atomic_dec(&lru_count);
freed++;
@@ -2195,7 +2190,7 @@ void gfs2_gl_hash_clear(struct gfs2_sbd *sdp)
flush_workqueue(glock_workqueue);
glock_hash_walk(clear_glock, sdp);
flush_workqueue(glock_workqueue);
- wait_event_timeout(sdp->sd_glock_wait,
+ wait_event_timeout(sdp->sd_kill_wait,
atomic_read(&sdp->sd_glock_disposal) == 0,
HZ * 600);
glock_hash_walk(dump_glock_func, sdp);
@@ -2227,8 +2222,6 @@ static const char *hflags2str(char *buf, u16 flags, unsigned long iflags)
*p++ = 'e';
if (flags & LM_FLAG_ANY)
*p++ = 'A';
- if (flags & LM_FLAG_PRIORITY)
- *p++ = 'p';
if (flags & LM_FLAG_NODE_SCOPE)
*p++ = 'n';
if (flags & GL_ASYNC)
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 1f1ba92c15a8..c8685ca7d2a2 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -68,14 +68,6 @@ enum {
* also be granted in SHARED. The preferred state is whichever is compatible
* with other granted locks, or the specified state if no other locks exist.
*
- * LM_FLAG_PRIORITY
- * Override fairness considerations. Suppose a lock is held in a shared state
- * and there is a pending request for the deferred state. A shared lock
- * request with the priority flag would be allowed to bypass the deferred
- * request and directly join the other shared lock. A shared lock request
- * without the priority flag might be forced to wait until the deferred
- * requested had acquired and released the lock.
- *
* LM_FLAG_NODE_SCOPE
* This holder agrees to share the lock within this node. In other words,
* the glock is held in EX mode according to DLM, but local holders on the
@@ -86,7 +78,6 @@ enum {
#define LM_FLAG_TRY_1CB 0x0002
#define LM_FLAG_NOEXP 0x0004
#define LM_FLAG_ANY 0x0008
-#define LM_FLAG_PRIORITY 0x0010
#define LM_FLAG_NODE_SCOPE 0x0020
#define GL_ASYNC 0x0040
#define GL_EXACT 0x0080
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 54319328b16b..f41ca89d216b 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -437,8 +437,8 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
inode->i_atime = atime;
inode->i_mtime.tv_sec = be64_to_cpu(str->di_mtime);
inode->i_mtime.tv_nsec = be32_to_cpu(str->di_mtime_nsec);
- inode->i_ctime.tv_sec = be64_to_cpu(str->di_ctime);
- inode->i_ctime.tv_nsec = be32_to_cpu(str->di_ctime_nsec);
+ inode_set_ctime(inode, be64_to_cpu(str->di_ctime),
+ be32_to_cpu(str->di_ctime_nsec));
ip->i_goal = be64_to_cpu(str->di_goal_meta);
ip->i_generation = be64_to_cpu(str->di_generation);
@@ -567,15 +567,16 @@ static void freeze_go_callback(struct gfs2_glock *gl, bool remote)
struct super_block *sb = sdp->sd_vfs;
if (!remote ||
- gl->gl_state != LM_ST_SHARED ||
+ (gl->gl_state != LM_ST_SHARED &&
+ gl->gl_state != LM_ST_UNLOCKED) ||
gl->gl_demote_state != LM_ST_UNLOCKED)
return;
/*
* Try to get an active super block reference to prevent racing with
- * unmount (see trylock_super()). But note that unmount isn't the only
- * place where a write lock on s_umount is taken, and we can fail here
- * because of things like remount as well.
+ * unmount (see super_trylock_shared()). But note that unmount isn't
+ * the only place where a write lock on s_umount is taken, and we can
+ * fail here because of things like remount as well.
*/
if (down_read_trylock(&sb->s_umount)) {
atomic_inc(&sb->s_active);
@@ -637,7 +638,7 @@ static void iopen_go_callback(struct gfs2_glock *gl, bool remote)
struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
if (!remote || sb_rdonly(sdp->sd_vfs) ||
- test_bit(SDF_DEACTIVATING, &sdp->sd_flags))
+ test_bit(SDF_KILL, &sdp->sd_flags))
return;
if (gl->gl_demote_state == LM_ST_UNLOCKED &&
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 04f2d78e8658..a8c95c5293c6 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -452,7 +452,7 @@ struct gfs2_quota_data {
s64 qd_change_sync;
unsigned int qd_slot;
- unsigned int qd_slot_count;
+ unsigned int qd_slot_ref;
struct buffer_head *qd_bh;
struct gfs2_quota_change *qd_bh_qc;
@@ -537,6 +537,7 @@ struct gfs2_statfs_change_host {
#define GFS2_QUOTA_OFF 0
#define GFS2_QUOTA_ACCOUNT 1
#define GFS2_QUOTA_ON 2
+#define GFS2_QUOTA_QUIET 3 /* on but not complaining */
#define GFS2_DATA_DEFAULT GFS2_DATA_ORDERED
#define GFS2_DATA_WRITEBACK 1
@@ -606,7 +607,7 @@ enum {
SDF_REMOTE_WITHDRAW = 13, /* Performing remote recovery */
SDF_WITHDRAW_RECOVERY = 14, /* Wait for journal recovery when we are
withdrawing */
- SDF_DEACTIVATING = 15,
+ SDF_KILL = 15,
SDF_EVICTING = 16,
SDF_FROZEN = 17,
};
@@ -716,7 +717,7 @@ struct gfs2_sbd {
struct gfs2_glock *sd_rename_gl;
struct gfs2_glock *sd_freeze_gl;
struct work_struct sd_freeze_work;
- wait_queue_head_t sd_glock_wait;
+ wait_queue_head_t sd_kill_wait;
wait_queue_head_t sd_async_glock_wait;
atomic_t sd_glock_disposal;
struct completion sd_locking_init;
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 17c994a0c0d0..0eac04507904 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -276,10 +276,16 @@ struct inode *gfs2_lookup_simple(struct inode *dip, const char *name)
* gfs2_lookup_simple callers expect ENOENT
* and do not check for NULL.
*/
- if (inode == NULL)
- return ERR_PTR(-ENOENT);
- else
- return inode;
+ if (IS_ERR_OR_NULL(inode))
+ return inode ? inode : ERR_PTR(-ENOENT);
+
+ /*
+ * Must not call back into the filesystem when allocating
+ * pages in the metadata inode's address space.
+ */
+ mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
+
+ return inode;
}
@@ -690,7 +696,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
set_nlink(inode, S_ISDIR(mode) ? 2 : 1);
inode->i_rdev = dev;
inode->i_size = size;
- inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
munge_mode_uid_gid(dip, inode);
check_and_update_goal(dip);
ip->i_goal = dip->i_goal;
@@ -1029,7 +1035,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
gfs2_trans_add_meta(ip->i_gl, dibh);
inc_nlink(&ip->i_inode);
- ip->i_inode.i_ctime = current_time(&ip->i_inode);
+ inode_set_ctime_current(&ip->i_inode);
ihold(inode);
d_instantiate(dentry, inode);
mark_inode_dirty(inode);
@@ -1114,7 +1120,7 @@ static int gfs2_unlink_inode(struct gfs2_inode *dip,
return error;
ip->i_entries = 0;
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
if (S_ISDIR(inode->i_mode))
clear_nlink(inode);
else
@@ -1371,7 +1377,7 @@ static int update_moved_ino(struct gfs2_inode *ip, struct gfs2_inode *ndip,
if (dir_rename)
return gfs2_dir_mvino(ip, &gfs2_qdotdot, ndip, DT_DIR);
- ip->i_inode.i_ctime = current_time(&ip->i_inode);
+ inode_set_ctime_current(&ip->i_inode);
mark_inode_dirty_sync(&ip->i_inode);
return 0;
}
@@ -2071,7 +2077,7 @@ static int gfs2_getattr(struct mnt_idmap *idmap,
STATX_ATTR_IMMUTABLE |
STATX_ATTR_NODUMP);
- generic_fillattr(&nop_mnt_idmap, inode, stat);
+ generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
if (gfs2_holder_initialized(&gh))
gfs2_glock_dq_uninit(&gh);
@@ -2139,8 +2145,7 @@ loff_t gfs2_seek_hole(struct file *file, loff_t offset)
return vfs_setpos(file, ret, inode->i_sb->s_maxbytes);
}
-static int gfs2_update_time(struct inode *inode, struct timespec64 *time,
- int flags)
+static int gfs2_update_time(struct inode *inode, int flags)
{
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_glock *gl = ip->i_gl;
@@ -2155,7 +2160,8 @@ static int gfs2_update_time(struct inode *inode, struct timespec64 *time,
if (error)
return error;
}
- return generic_update_time(inode, time, flags);
+ generic_update_time(inode, flags);
+ return 0;
}
static const struct inode_operations gfs2_file_iops = {
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 54911294687c..59ab18c79889 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -222,11 +222,6 @@ static u32 make_flags(struct gfs2_glock *gl, const unsigned int gfs_flags,
lkf |= DLM_LKF_NOQUEUEBAST;
}
- if (gfs_flags & LM_FLAG_PRIORITY) {
- lkf |= DLM_LKF_NOORDER;
- lkf |= DLM_LKF_HEADQUE;
- }
-
if (gfs_flags & LM_FLAG_ANY) {
if (req == DLM_LOCK_PR)
lkf |= DLM_LKF_ALTCW;
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index aa568796207c..e5271ae87d1c 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -1227,6 +1227,21 @@ static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
gfs2_log_unlock(sdp);
}
+static inline int gfs2_jrnl_flush_reqd(struct gfs2_sbd *sdp)
+{
+ return atomic_read(&sdp->sd_log_pinned) +
+ atomic_read(&sdp->sd_log_blks_needed) >=
+ atomic_read(&sdp->sd_log_thresh1);
+}
+
+static inline int gfs2_ail_flush_reqd(struct gfs2_sbd *sdp)
+{
+ return sdp->sd_jdesc->jd_blocks -
+ atomic_read(&sdp->sd_log_blks_free) +
+ atomic_read(&sdp->sd_log_blks_needed) >=
+ atomic_read(&sdp->sd_log_thresh2);
+}
+
/**
* gfs2_log_commit - Commit a transaction to the log
* @sdp: the filesystem
@@ -1246,9 +1261,7 @@ void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
{
log_refund(sdp, tr);
- if (atomic_read(&sdp->sd_log_pinned) > atomic_read(&sdp->sd_log_thresh1) ||
- ((sdp->sd_jdesc->jd_blocks - atomic_read(&sdp->sd_log_blks_free)) >
- atomic_read(&sdp->sd_log_thresh2)))
+ if (gfs2_ail_flush_reqd(sdp) || gfs2_jrnl_flush_reqd(sdp))
wake_up(&sdp->sd_logd_waitq);
}
@@ -1271,24 +1284,6 @@ static void gfs2_log_shutdown(struct gfs2_sbd *sdp)
gfs2_assert_warn(sdp, list_empty(&sdp->sd_ail2_list));
}
-static inline int gfs2_jrnl_flush_reqd(struct gfs2_sbd *sdp)
-{
- return (atomic_read(&sdp->sd_log_pinned) +
- atomic_read(&sdp->sd_log_blks_needed) >=
- atomic_read(&sdp->sd_log_thresh1));
-}
-
-static inline int gfs2_ail_flush_reqd(struct gfs2_sbd *sdp)
-{
- unsigned int used_blocks = sdp->sd_jdesc->jd_blocks - atomic_read(&sdp->sd_log_blks_free);
-
- if (test_and_clear_bit(SDF_FORCE_AIL_FLUSH, &sdp->sd_flags))
- return 1;
-
- return used_blocks + atomic_read(&sdp->sd_log_blks_needed) >=
- atomic_read(&sdp->sd_log_thresh2);
-}
-
/**
* gfs2_logd - Update log tail as Active Items get flushed to in-place blocks
* @data: Pointer to GFS2 superblock
@@ -1301,14 +1296,11 @@ int gfs2_logd(void *data)
{
struct gfs2_sbd *sdp = data;
unsigned long t = 1;
- DEFINE_WAIT(wait);
while (!kthread_should_stop()) {
+ if (gfs2_withdrawn(sdp))
+ break;
- if (gfs2_withdrawn(sdp)) {
- msleep_interruptible(HZ);
- continue;
- }
/* Check for errors writing to the journal */
if (sdp->sd_log_error) {
gfs2_lm(sdp,
@@ -1317,7 +1309,7 @@ int gfs2_logd(void *data)
"prevent further damage.\n",
sdp->sd_fsname, sdp->sd_log_error);
gfs2_withdraw(sdp);
- continue;
+ break;
}
if (gfs2_jrnl_flush_reqd(sdp) || t == 0) {
@@ -1326,7 +1318,9 @@ int gfs2_logd(void *data)
GFS2_LFC_LOGD_JFLUSH_REQD);
}
- if (gfs2_ail_flush_reqd(sdp)) {
+ if (test_bit(SDF_FORCE_AIL_FLUSH, &sdp->sd_flags) ||
+ gfs2_ail_flush_reqd(sdp)) {
+ clear_bit(SDF_FORCE_AIL_FLUSH, &sdp->sd_flags);
gfs2_ail1_start(sdp);
gfs2_ail1_wait(sdp);
gfs2_ail1_empty(sdp, 0);
@@ -1338,17 +1332,14 @@ int gfs2_logd(void *data)
try_to_freeze();
- do {
- prepare_to_wait(&sdp->sd_logd_waitq, &wait,
- TASK_INTERRUPTIBLE);
- if (!gfs2_ail_flush_reqd(sdp) &&
- !gfs2_jrnl_flush_reqd(sdp) &&
- !kthread_should_stop())
- t = schedule_timeout(t);
- } while(t && !gfs2_ail_flush_reqd(sdp) &&
- !gfs2_jrnl_flush_reqd(sdp) &&
- !kthread_should_stop());
- finish_wait(&sdp->sd_logd_waitq, &wait);
+ t = wait_event_interruptible_timeout(sdp->sd_logd_waitq,
+ test_bit(SDF_FORCE_AIL_FLUSH, &sdp->sd_flags) ||
+ gfs2_ail_flush_reqd(sdp) ||
+ gfs2_jrnl_flush_reqd(sdp) ||
+ sdp->sd_log_error ||
+ gfs2_withdrawn(sdp) ||
+ kthread_should_stop(),
+ t);
}
return 0;
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 251322b01631..483f69807062 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -456,7 +456,7 @@ static bool gfs2_jhead_pg_srch(struct gfs2_jdesc *jd,
* Find the folio with 'index' in the journal's mapping. Search the folio for
* the journal head if requested (cleanup == false). Release refs on the
* folio so the page cache can reclaim it. We grabbed a
- * reference on this folio twice, first when we did a find_or_create_page()
+ * reference on this folio twice, first when we did a grab_cache_page()
* to obtain the folio to add it to the bio and second when we do a
* filemap_get_folio() here to get the folio to wait on while I/O on it is being
* completed.
@@ -481,7 +481,7 @@ static void gfs2_jhead_process_page(struct gfs2_jdesc *jd, unsigned long index,
if (!*done)
*done = gfs2_jhead_pg_srch(jd, head, &folio->page);
- /* filemap_get_folio() and the earlier find_or_create_page() */
+ /* filemap_get_folio() and the earlier grab_cache_page() */
folio_put_refs(folio, 2);
}
@@ -535,8 +535,7 @@ int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head,
for (; block < je->lblock + je->blocks; block++, dblock++) {
if (!page) {
- page = find_or_create_page(mapping,
- block >> shift, GFP_NOFS);
+ page = grab_cache_page(mapping, block >> shift);
if (!page) {
ret = -ENOMEM;
done = true;
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index afcb32854f14..66eb98b690a2 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -152,9 +152,9 @@ static int __init init_gfs2_fs(void)
goto fail_shrinker;
error = -ENOMEM;
- gfs_recovery_wq = alloc_workqueue("gfs_recovery",
+ gfs2_recovery_wq = alloc_workqueue("gfs2_recovery",
WQ_MEM_RECLAIM | WQ_FREEZABLE, 0);
- if (!gfs_recovery_wq)
+ if (!gfs2_recovery_wq)
goto fail_wq1;
gfs2_control_wq = alloc_workqueue("gfs2_control",
@@ -162,7 +162,7 @@ static int __init init_gfs2_fs(void)
if (!gfs2_control_wq)
goto fail_wq2;
- gfs2_freeze_wq = alloc_workqueue("freeze_workqueue", 0, 0);
+ gfs2_freeze_wq = alloc_workqueue("gfs2_freeze", 0, 0);
if (!gfs2_freeze_wq)
goto fail_wq3;
@@ -194,7 +194,7 @@ fail_mempool:
fail_wq3:
destroy_workqueue(gfs2_control_wq);
fail_wq2:
- destroy_workqueue(gfs_recovery_wq);
+ destroy_workqueue(gfs2_recovery_wq);
fail_wq1:
unregister_shrinker(&gfs2_qd_shrinker);
fail_shrinker:
@@ -234,7 +234,7 @@ static void __exit exit_gfs2_fs(void)
gfs2_unregister_debugfs();
unregister_filesystem(&gfs2_fs_type);
unregister_filesystem(&gfs2meta_fs_type);
- destroy_workqueue(gfs_recovery_wq);
+ destroy_workqueue(gfs2_recovery_wq);
destroy_workqueue(gfs2_control_wq);
destroy_workqueue(gfs2_freeze_wq);
list_lru_destroy(&gfs2_qd_lru);
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 8a27957dbfee..33ca04733e93 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -87,7 +87,7 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
set_bit(SDF_NOJOURNALID, &sdp->sd_flags);
gfs2_tune_init(&sdp->sd_tune);
- init_waitqueue_head(&sdp->sd_glock_wait);
+ init_waitqueue_head(&sdp->sd_kill_wait);
init_waitqueue_head(&sdp->sd_async_glock_wait);
atomic_set(&sdp->sd_glock_disposal, 0);
init_completion(&sdp->sd_locking_init);
@@ -1103,29 +1103,49 @@ static int init_threads(struct gfs2_sbd *sdp)
struct task_struct *p;
int error = 0;
- p = kthread_run(gfs2_logd, sdp, "gfs2_logd");
+ p = kthread_create(gfs2_logd, sdp, "gfs2_logd/%s", sdp->sd_fsname);
if (IS_ERR(p)) {
error = PTR_ERR(p);
- fs_err(sdp, "can't start logd thread: %d\n", error);
+ fs_err(sdp, "can't create logd thread: %d\n", error);
return error;
}
+ get_task_struct(p);
sdp->sd_logd_process = p;
- p = kthread_run(gfs2_quotad, sdp, "gfs2_quotad");
+ p = kthread_create(gfs2_quotad, sdp, "gfs2_quotad/%s", sdp->sd_fsname);
if (IS_ERR(p)) {
error = PTR_ERR(p);
- fs_err(sdp, "can't start quotad thread: %d\n", error);
+ fs_err(sdp, "can't create quotad thread: %d\n", error);
goto fail;
}
+ get_task_struct(p);
sdp->sd_quotad_process = p;
+
+ wake_up_process(sdp->sd_logd_process);
+ wake_up_process(sdp->sd_quotad_process);
return 0;
fail:
kthread_stop(sdp->sd_logd_process);
+ put_task_struct(sdp->sd_logd_process);
sdp->sd_logd_process = NULL;
return error;
}
+void gfs2_destroy_threads(struct gfs2_sbd *sdp)
+{
+ if (sdp->sd_logd_process) {
+ kthread_stop(sdp->sd_logd_process);
+ put_task_struct(sdp->sd_logd_process);
+ sdp->sd_logd_process = NULL;
+ }
+ if (sdp->sd_quotad_process) {
+ kthread_stop(sdp->sd_quotad_process);
+ put_task_struct(sdp->sd_quotad_process);
+ sdp->sd_quotad_process = NULL;
+ }
+}
+
/**
* gfs2_fill_super - Read in superblock
* @sb: The VFS superblock
@@ -1276,12 +1296,7 @@ static int gfs2_fill_super(struct super_block *sb, struct fs_context *fc)
if (error) {
gfs2_freeze_unlock(&sdp->sd_freeze_gh);
- if (sdp->sd_quotad_process)
- kthread_stop(sdp->sd_quotad_process);
- sdp->sd_quotad_process = NULL;
- if (sdp->sd_logd_process)
- kthread_stop(sdp->sd_logd_process);
- sdp->sd_logd_process = NULL;
+ gfs2_destroy_threads(sdp);
fs_err(sdp, "can't make FS RW: %d\n", error);
goto fail_per_node;
}
@@ -1381,6 +1396,7 @@ static const struct constant_table gfs2_param_quota[] = {
{"off", GFS2_QUOTA_OFF},
{"account", GFS2_QUOTA_ACCOUNT},
{"on", GFS2_QUOTA_ON},
+ {"quiet", GFS2_QUOTA_QUIET},
{}
};
@@ -1786,9 +1802,9 @@ static void gfs2_kill_sb(struct super_block *sb)
/*
* Flush and then drain the delete workqueue here (via
* destroy_workqueue()) to ensure that any delete work that
- * may be running will also see the SDF_DEACTIVATING flag.
+ * may be running will also see the SDF_KILL flag.
*/
- set_bit(SDF_DEACTIVATING, &sdp->sd_flags);
+ set_bit(SDF_KILL, &sdp->sd_flags);
gfs2_flush_delete_work(sdp);
destroy_workqueue(sdp->sd_delete_wq);
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 704192b73605..171b2713d2e5 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -109,38 +109,44 @@ static inline void spin_unlock_bucket(unsigned int hash)
static void gfs2_qd_dealloc(struct rcu_head *rcu)
{
struct gfs2_quota_data *qd = container_of(rcu, struct gfs2_quota_data, qd_rcu);
+ struct gfs2_sbd *sdp = qd->qd_sbd;
+
kmem_cache_free(gfs2_quotad_cachep, qd);
+ if (atomic_dec_and_test(&sdp->sd_quota_count))
+ wake_up(&sdp->sd_kill_wait);
}
-static void gfs2_qd_dispose(struct list_head *list)
+static void gfs2_qd_dispose(struct gfs2_quota_data *qd)
{
- struct gfs2_quota_data *qd;
- struct gfs2_sbd *sdp;
-
- while (!list_empty(list)) {
- qd = list_first_entry(list, struct gfs2_quota_data, qd_lru);
- sdp = qd->qd_gl->gl_name.ln_sbd;
-
- list_del(&qd->qd_lru);
+ struct gfs2_sbd *sdp = qd->qd_sbd;
- /* Free from the filesystem-specific list */
- spin_lock(&qd_lock);
- list_del(&qd->qd_list);
- spin_unlock(&qd_lock);
+ spin_lock(&qd_lock);
+ list_del(&qd->qd_list);
+ spin_unlock(&qd_lock);
- spin_lock_bucket(qd->qd_hash);
- hlist_bl_del_rcu(&qd->qd_hlist);
- spin_unlock_bucket(qd->qd_hash);
+ spin_lock_bucket(qd->qd_hash);
+ hlist_bl_del_rcu(&qd->qd_hlist);
+ spin_unlock_bucket(qd->qd_hash);
+ if (!gfs2_withdrawn(sdp)) {
gfs2_assert_warn(sdp, !qd->qd_change);
- gfs2_assert_warn(sdp, !qd->qd_slot_count);
+ gfs2_assert_warn(sdp, !qd->qd_slot_ref);
gfs2_assert_warn(sdp, !qd->qd_bh_count);
+ }
- gfs2_glock_put(qd->qd_gl);
- atomic_dec(&sdp->sd_quota_count);
+ gfs2_glock_put(qd->qd_gl);
+ call_rcu(&qd->qd_rcu, gfs2_qd_dealloc);
+}
- /* Delete it from the common reclaim list */
- call_rcu(&qd->qd_rcu, gfs2_qd_dealloc);
+static void gfs2_qd_list_dispose(struct list_head *list)
+{
+ struct gfs2_quota_data *qd;
+
+ while (!list_empty(list)) {
+ qd = list_first_entry(list, struct gfs2_quota_data, qd_lru);
+ list_del(&qd->qd_lru);
+
+ gfs2_qd_dispose(qd);
}
}
@@ -149,18 +155,22 @@ static enum lru_status gfs2_qd_isolate(struct list_head *item,
struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
{
struct list_head *dispose = arg;
- struct gfs2_quota_data *qd = list_entry(item, struct gfs2_quota_data, qd_lru);
+ struct gfs2_quota_data *qd =
+ list_entry(item, struct gfs2_quota_data, qd_lru);
+ enum lru_status status;
if (!spin_trylock(&qd->qd_lockref.lock))
return LRU_SKIP;
+ status = LRU_SKIP;
if (qd->qd_lockref.count == 0) {
lockref_mark_dead(&qd->qd_lockref);
list_lru_isolate_move(lru, &qd->qd_lru, dispose);
+ status = LRU_REMOVED;
}
spin_unlock(&qd->qd_lockref.lock);
- return LRU_REMOVED;
+ return status;
}
static unsigned long gfs2_qd_shrink_scan(struct shrinker *shrink,
@@ -175,7 +185,7 @@ static unsigned long gfs2_qd_shrink_scan(struct shrinker *shrink,
freed = list_lru_shrink_walk(&gfs2_qd_lru, sc,
gfs2_qd_isolate, &dispose);
- gfs2_qd_dispose(&dispose);
+ gfs2_qd_list_dispose(&dispose);
return freed;
}
@@ -203,12 +213,7 @@ static u64 qd2index(struct gfs2_quota_data *qd)
static u64 qd2offset(struct gfs2_quota_data *qd)
{
- u64 offset;
-
- offset = qd2index(qd);
- offset *= sizeof(struct gfs2_quota);
-
- return offset;
+ return qd2index(qd) * sizeof(struct gfs2_quota);
}
static struct gfs2_quota_data *qd_alloc(unsigned hash, struct gfs2_sbd *sdp, struct kqid qid)
@@ -221,7 +226,7 @@ static struct gfs2_quota_data *qd_alloc(unsigned hash, struct gfs2_sbd *sdp, str
return NULL;
qd->qd_sbd = sdp;
- qd->qd_lockref.count = 1;
+ qd->qd_lockref.count = 0;
spin_lock_init(&qd->qd_lockref.lock);
qd->qd_id = qid;
qd->qd_slot = -1;
@@ -283,6 +288,7 @@ static int qd_get(struct gfs2_sbd *sdp, struct kqid qid,
spin_lock_bucket(hash);
*qdp = qd = gfs2_qd_search_bucket(hash, sdp, qid);
if (qd == NULL) {
+ new_qd->qd_lockref.count++;
*qdp = new_qd;
list_add(&new_qd->qd_list, &sdp->sd_quota_list);
hlist_bl_add_head_rcu(&new_qd->qd_hlist, &qd_hash_table[hash]);
@@ -302,20 +308,31 @@ static int qd_get(struct gfs2_sbd *sdp, struct kqid qid,
static void qd_hold(struct gfs2_quota_data *qd)
{
- struct gfs2_sbd *sdp = qd->qd_gl->gl_name.ln_sbd;
+ struct gfs2_sbd *sdp = qd->qd_sbd;
gfs2_assert(sdp, !__lockref_is_dead(&qd->qd_lockref));
lockref_get(&qd->qd_lockref);
}
static void qd_put(struct gfs2_quota_data *qd)
{
+ struct gfs2_sbd *sdp;
+
if (lockref_put_or_lock(&qd->qd_lockref))
return;
+ BUG_ON(__lockref_is_dead(&qd->qd_lockref));
+ sdp = qd->qd_sbd;
+ if (unlikely(!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags))) {
+ lockref_mark_dead(&qd->qd_lockref);
+ spin_unlock(&qd->qd_lockref.lock);
+
+ gfs2_qd_dispose(qd);
+ return;
+ }
+
qd->qd_lockref.count = 0;
list_lru_add(&gfs2_qd_lru, &qd->qd_lru);
spin_unlock(&qd->qd_lockref.lock);
-
}
static int slot_get(struct gfs2_quota_data *qd)
@@ -325,20 +342,19 @@ static int slot_get(struct gfs2_quota_data *qd)
int error = 0;
spin_lock(&sdp->sd_bitmap_lock);
- if (qd->qd_slot_count != 0)
- goto out;
-
- error = -ENOSPC;
- bit = find_first_zero_bit(sdp->sd_quota_bitmap, sdp->sd_quota_slots);
- if (bit < sdp->sd_quota_slots) {
+ if (qd->qd_slot_ref == 0) {
+ bit = find_first_zero_bit(sdp->sd_quota_bitmap,
+ sdp->sd_quota_slots);
+ if (bit >= sdp->sd_quota_slots) {
+ error = -ENOSPC;
+ goto out;
+ }
set_bit(bit, sdp->sd_quota_bitmap);
qd->qd_slot = bit;
- error = 0;
-out:
- qd->qd_slot_count++;
}
+ qd->qd_slot_ref++;
+out:
spin_unlock(&sdp->sd_bitmap_lock);
-
return error;
}
@@ -347,8 +363,8 @@ static void slot_hold(struct gfs2_quota_data *qd)
struct gfs2_sbd *sdp = qd->qd_sbd;
spin_lock(&sdp->sd_bitmap_lock);
- gfs2_assert(sdp, qd->qd_slot_count);
- qd->qd_slot_count++;
+ gfs2_assert(sdp, qd->qd_slot_ref);
+ qd->qd_slot_ref++;
spin_unlock(&sdp->sd_bitmap_lock);
}
@@ -357,8 +373,8 @@ static void slot_put(struct gfs2_quota_data *qd)
struct gfs2_sbd *sdp = qd->qd_sbd;
spin_lock(&sdp->sd_bitmap_lock);
- gfs2_assert(sdp, qd->qd_slot_count);
- if (!--qd->qd_slot_count) {
+ gfs2_assert(sdp, qd->qd_slot_ref);
+ if (!--qd->qd_slot_ref) {
BUG_ON(!test_and_clear_bit(qd->qd_slot, sdp->sd_quota_bitmap));
qd->qd_slot = -1;
}
@@ -367,7 +383,7 @@ static void slot_put(struct gfs2_quota_data *qd)
static int bh_get(struct gfs2_quota_data *qd)
{
- struct gfs2_sbd *sdp = qd->qd_gl->gl_name.ln_sbd;
+ struct gfs2_sbd *sdp = qd->qd_sbd;
struct inode *inode = sdp->sd_qc_inode;
struct gfs2_inode *ip = GFS2_I(inode);
unsigned int block, offset;
@@ -421,7 +437,7 @@ fail:
static void bh_put(struct gfs2_quota_data *qd)
{
- struct gfs2_sbd *sdp = qd->qd_gl->gl_name.ln_sbd;
+ struct gfs2_sbd *sdp = qd->qd_sbd;
mutex_lock(&sdp->sd_quota_mutex);
gfs2_assert(sdp, qd->qd_bh_count);
@@ -451,6 +467,20 @@ static int qd_check_sync(struct gfs2_sbd *sdp, struct gfs2_quota_data *qd,
return 1;
}
+static int qd_bh_get_or_undo(struct gfs2_sbd *sdp, struct gfs2_quota_data *qd)
+{
+ int error;
+
+ error = bh_get(qd);
+ if (!error)
+ return 0;
+
+ clear_bit(QDF_LOCKED, &qd->qd_flags);
+ slot_put(qd);
+ qd_put(qd);
+ return error;
+}
+
static int qd_fish(struct gfs2_sbd *sdp, struct gfs2_quota_data **qdp)
{
struct gfs2_quota_data *qd = NULL, *iter;
@@ -473,30 +503,29 @@ static int qd_fish(struct gfs2_sbd *sdp, struct gfs2_quota_data **qdp)
spin_unlock(&qd_lock);
if (qd) {
- error = bh_get(qd);
- if (error) {
- clear_bit(QDF_LOCKED, &qd->qd_flags);
- slot_put(qd);
- qd_put(qd);
+ error = qd_bh_get_or_undo(sdp, qd);
+ if (error)
return error;
- }
+ *qdp = qd;
}
- *qdp = qd;
-
return 0;
}
-static void qd_unlock(struct gfs2_quota_data *qd)
+static void qdsb_put(struct gfs2_quota_data *qd)
{
- gfs2_assert_warn(qd->qd_gl->gl_name.ln_sbd,
- test_bit(QDF_LOCKED, &qd->qd_flags));
- clear_bit(QDF_LOCKED, &qd->qd_flags);
bh_put(qd);
slot_put(qd);
qd_put(qd);
}
+static void qd_unlock(struct gfs2_quota_data *qd)
+{
+ gfs2_assert_warn(qd->qd_sbd, test_bit(QDF_LOCKED, &qd->qd_flags));
+ clear_bit(QDF_LOCKED, &qd->qd_flags);
+ qdsb_put(qd);
+}
+
static int qdsb_get(struct gfs2_sbd *sdp, struct kqid qid,
struct gfs2_quota_data **qdp)
{
@@ -523,13 +552,6 @@ fail:
return error;
}
-static void qdsb_put(struct gfs2_quota_data *qd)
-{
- bh_put(qd);
- slot_put(qd);
- qd_put(qd);
-}
-
/**
* gfs2_qa_get - make sure we have a quota allocations data structure,
* if necessary
@@ -666,7 +688,7 @@ static int sort_qd(const void *a, const void *b)
static void do_qc(struct gfs2_quota_data *qd, s64 change, int qc_type)
{
- struct gfs2_sbd *sdp = qd->qd_gl->gl_name.ln_sbd;
+ struct gfs2_sbd *sdp = qd->qd_sbd;
struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode);
struct gfs2_quota_change *qc = qd->qd_bh_qc;
s64 x;
@@ -708,30 +730,29 @@ static void do_qc(struct gfs2_quota_data *qd, s64 change, int qc_type)
mutex_unlock(&sdp->sd_quota_mutex);
}
-static int gfs2_write_buf_to_page(struct gfs2_inode *ip, unsigned long index,
+static int gfs2_write_buf_to_page(struct gfs2_sbd *sdp, unsigned long index,
unsigned off, void *buf, unsigned bytes)
{
+ struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode);
struct inode *inode = &ip->i_inode;
- struct gfs2_sbd *sdp = GFS2_SB(inode);
struct address_space *mapping = inode->i_mapping;
struct page *page;
struct buffer_head *bh;
u64 blk;
unsigned bsize = sdp->sd_sb.sb_bsize, bnum = 0, boff = 0;
unsigned to_write = bytes, pg_off = off;
- int done = 0;
blk = index << (PAGE_SHIFT - sdp->sd_sb.sb_bsize_shift);
boff = off % bsize;
- page = find_or_create_page(mapping, index, GFP_NOFS);
+ page = grab_cache_page(mapping, index);
if (!page)
return -ENOMEM;
if (!page_has_buffers(page))
create_empty_buffers(page, bsize, 0);
bh = page_buffers(page);
- while (!done) {
+ for(;;) {
/* Find the beginning block within the page */
if (pg_off >= ((bnum * bsize) + bsize)) {
bh = bh->b_this_page;
@@ -751,10 +772,7 @@ static int gfs2_write_buf_to_page(struct gfs2_inode *ip, unsigned long index,
set_buffer_uptodate(bh);
if (bh_read(bh, REQ_META | REQ_PRIO) < 0)
goto unlock_out;
- if (gfs2_is_jdata(ip))
- gfs2_trans_add_data(ip->i_gl, bh);
- else
- gfs2_ordered_add_inode(ip);
+ gfs2_trans_add_data(ip->i_gl, bh);
/* If we need to write to the next block as well */
if (to_write > (bsize - boff)) {
@@ -763,7 +781,7 @@ static int gfs2_write_buf_to_page(struct gfs2_inode *ip, unsigned long index,
boff = pg_off % bsize;
continue;
}
- done = 1;
+ break;
}
/* Write to the page, now that we have setup the buffer(s) */
@@ -780,12 +798,12 @@ unlock_out:
return -EIO;
}
-static int gfs2_write_disk_quota(struct gfs2_inode *ip, struct gfs2_quota *qp,
+static int gfs2_write_disk_quota(struct gfs2_sbd *sdp, struct gfs2_quota *qp,
loff_t loc)
{
unsigned long pg_beg;
unsigned pg_off, nbytes, overflow = 0;
- int pg_oflow = 0, error;
+ int error;
void *ptr;
nbytes = sizeof(struct gfs2_quota);
@@ -794,17 +812,15 @@ static int gfs2_write_disk_quota(struct gfs2_inode *ip, struct gfs2_quota *qp,
pg_off = offset_in_page(loc);
/* If the quota straddles a page boundary, split the write in two */
- if ((pg_off + nbytes) > PAGE_SIZE) {
- pg_oflow = 1;
+ if ((pg_off + nbytes) > PAGE_SIZE)
overflow = (pg_off + nbytes) - PAGE_SIZE;
- }
ptr = qp;
- error = gfs2_write_buf_to_page(ip, pg_beg, pg_off, ptr,
+ error = gfs2_write_buf_to_page(sdp, pg_beg, pg_off, ptr,
nbytes - overflow);
/* If there's an overflow, write the remaining bytes to the next page */
- if (!error && pg_oflow)
- error = gfs2_write_buf_to_page(ip, pg_beg + 1, 0,
+ if (!error && overflow)
+ error = gfs2_write_buf_to_page(sdp, pg_beg + 1, 0,
ptr + nbytes - overflow,
overflow);
return error;
@@ -812,7 +828,7 @@ static int gfs2_write_disk_quota(struct gfs2_inode *ip, struct gfs2_quota *qp,
/**
* gfs2_adjust_quota - adjust record of current block usage
- * @ip: The quota inode
+ * @sdp: The superblock
* @loc: Offset of the entry in the quota file
* @change: The amount of usage change to record
* @qd: The quota data
@@ -824,12 +840,12 @@ static int gfs2_write_disk_quota(struct gfs2_inode *ip, struct gfs2_quota *qp,
* Returns: 0 or -ve on error
*/
-static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
+static int gfs2_adjust_quota(struct gfs2_sbd *sdp, loff_t loc,
s64 change, struct gfs2_quota_data *qd,
struct qc_dqblk *fdq)
{
+ struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode);
struct inode *inode = &ip->i_inode;
- struct gfs2_sbd *sdp = GFS2_SB(inode);
struct gfs2_quota q;
int err;
u64 size;
@@ -846,7 +862,6 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
return err;
loc -= sizeof(q); /* gfs2_internal_read would've advanced the loc ptr */
- err = -EIO;
be64_add_cpu(&q.qu_value, change);
if (((s64)be64_to_cpu(q.qu_value)) < 0)
q.qu_value = 0; /* Never go negative on quota usage */
@@ -866,12 +881,12 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
}
}
- err = gfs2_write_disk_quota(ip, &q, loc);
+ err = gfs2_write_disk_quota(sdp, &q, loc);
if (!err) {
size = loc + sizeof(struct gfs2_quota);
if (size > inode->i_size)
i_size_write(inode, size);
- inode->i_mtime = inode->i_atime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
mark_inode_dirty(inode);
set_bit(QDF_REFRESH, &qd->qd_flags);
}
@@ -881,7 +896,7 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
{
- struct gfs2_sbd *sdp = (*qda)->qd_gl->gl_name.ln_sbd;
+ struct gfs2_sbd *sdp = (*qda)->qd_sbd;
struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode);
struct gfs2_alloc_parms ap = { .aflags = 0, };
unsigned int data_blocks, ind_blocks;
@@ -893,18 +908,12 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
unsigned int nalloc = 0, blocks;
int error;
- error = gfs2_qa_get(ip);
- if (error)
- return error;
-
gfs2_write_calc_reserv(ip, sizeof(struct gfs2_quota),
&data_blocks, &ind_blocks);
ghs = kmalloc_array(num_qd, sizeof(struct gfs2_holder), GFP_NOFS);
- if (!ghs) {
- error = -ENOMEM;
- goto out;
- }
+ if (!ghs)
+ return -ENOMEM;
sort(qda, num_qd, sizeof(struct gfs2_quota_data *), sort_qd, NULL);
inode_lock(&ip->i_inode);
@@ -953,7 +962,8 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
for (x = 0; x < num_qd; x++) {
qd = qda[x];
offset = qd2offset(qd);
- error = gfs2_adjust_quota(ip, offset, qd->qd_change_sync, qd, NULL);
+ error = gfs2_adjust_quota(sdp, offset, qd->qd_change_sync, qd,
+ NULL);
if (error)
goto out_end_trans;
@@ -961,8 +971,6 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
set_bit(QDF_REFRESH, &qd->qd_flags);
}
- error = 0;
-
out_end_trans:
gfs2_trans_end(sdp);
out_ipres:
@@ -976,8 +984,10 @@ out_dq:
kfree(ghs);
gfs2_log_flush(ip->i_gl->gl_name.ln_sbd, ip->i_gl,
GFS2_LOG_HEAD_FLUSH_NORMAL | GFS2_LFC_DO_SYNC);
-out:
- gfs2_qa_put(ip);
+ if (!error) {
+ for (x = 0; x < num_qd; x++)
+ qda[x]->qd_sync_gen = sdp->sd_quota_sync_gen;
+ }
return error;
}
@@ -1009,11 +1019,12 @@ static int update_qd(struct gfs2_sbd *sdp, struct gfs2_quota_data *qd)
static int do_glock(struct gfs2_quota_data *qd, int force_refresh,
struct gfs2_holder *q_gh)
{
- struct gfs2_sbd *sdp = qd->qd_gl->gl_name.ln_sbd;
+ struct gfs2_sbd *sdp = qd->qd_sbd;
struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode);
struct gfs2_holder i_gh;
int error;
+ gfs2_assert_warn(sdp, sdp == qd->qd_gl->gl_name.ln_sbd);
restart:
error = gfs2_glock_nq_init(qd->qd_gl, LM_ST_SHARED, 0, q_gh);
if (error)
@@ -1059,9 +1070,10 @@ int gfs2_quota_lock(struct gfs2_inode *ip, kuid_t uid, kgid_t gid)
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
struct gfs2_quota_data *qd;
u32 x;
- int error = 0;
+ int error;
- if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON)
+ if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON &&
+ sdp->sd_args.ar_quota != GFS2_QUOTA_QUIET)
return 0;
error = gfs2_quota_hold(ip, uid, gid);
@@ -1089,16 +1101,15 @@ int gfs2_quota_lock(struct gfs2_inode *ip, kuid_t uid, kgid_t gid)
return error;
}
-static int need_sync(struct gfs2_quota_data *qd)
+static bool need_sync(struct gfs2_quota_data *qd)
{
- struct gfs2_sbd *sdp = qd->qd_gl->gl_name.ln_sbd;
+ struct gfs2_sbd *sdp = qd->qd_sbd;
struct gfs2_tune *gt = &sdp->sd_tune;
s64 value;
unsigned int num, den;
- int do_sync = 1;
if (!qd->qd_qb.qb_limit)
- return 0;
+ return false;
spin_lock(&qd_lock);
value = qd->qd_change;
@@ -1109,26 +1120,26 @@ static int need_sync(struct gfs2_quota_data *qd)
den = gt->gt_quota_scale_den;
spin_unlock(&gt->gt_spin);
- if (value < 0)
- do_sync = 0;
+ if (value <= 0)
+ return false;
else if ((s64)be64_to_cpu(qd->qd_qb.qb_value) >=
(s64)be64_to_cpu(qd->qd_qb.qb_limit))
- do_sync = 0;
+ return false;
else {
value *= gfs2_jindex_size(sdp) * num;
value = div_s64(value, den);
value += (s64)be64_to_cpu(qd->qd_qb.qb_value);
if (value < (s64)be64_to_cpu(qd->qd_qb.qb_limit))
- do_sync = 0;
+ return false;
}
- return do_sync;
+ return true;
}
void gfs2_quota_unlock(struct gfs2_inode *ip)
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
- struct gfs2_quota_data *qda[4];
+ struct gfs2_quota_data *qda[2 * GFS2_MAXQUOTAS];
unsigned int count = 0;
u32 x;
int found;
@@ -1138,7 +1149,7 @@ void gfs2_quota_unlock(struct gfs2_inode *ip)
for (x = 0; x < ip->i_qadata->qa_qd_num; x++) {
struct gfs2_quota_data *qd;
- int sync;
+ bool sync;
qd = ip->i_qadata->qa_qd[x];
sync = need_sync(qd);
@@ -1154,15 +1165,8 @@ void gfs2_quota_unlock(struct gfs2_inode *ip)
if (!found)
continue;
- gfs2_assert_warn(sdp, qd->qd_change_sync);
- if (bh_get(qd)) {
- clear_bit(QDF_LOCKED, &qd->qd_flags);
- slot_put(qd);
- qd_put(qd);
- continue;
- }
-
- qda[count++] = qd;
+ if (!qd_bh_get_or_undo(sdp, qd))
+ qda[count++] = qd;
}
if (count) {
@@ -1178,12 +1182,13 @@ void gfs2_quota_unlock(struct gfs2_inode *ip)
static int print_message(struct gfs2_quota_data *qd, char *type)
{
- struct gfs2_sbd *sdp = qd->qd_gl->gl_name.ln_sbd;
+ struct gfs2_sbd *sdp = qd->qd_sbd;
- fs_info(sdp, "quota %s for %s %u\n",
- type,
- (qd->qd_id.type == USRQUOTA) ? "user" : "group",
- from_kqid(&init_user_ns, qd->qd_id));
+ if (sdp->sd_args.ar_quota != GFS2_QUOTA_QUIET)
+ fs_info(sdp, "quota %s for %s %u\n",
+ type,
+ (qd->qd_id.type == USRQUOTA) ? "user" : "group",
+ from_kqid(&init_user_ns, qd->qd_id));
return 0;
}
@@ -1269,7 +1274,8 @@ void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
u32 x;
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
- if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON ||
+ if ((sdp->sd_args.ar_quota != GFS2_QUOTA_ON &&
+ sdp->sd_args.ar_quota != GFS2_QUOTA_QUIET) ||
gfs2_assert_warn(sdp, change))
return;
if (ip->i_diskflags & GFS2_DIF_SYSTEM)
@@ -1288,6 +1294,24 @@ void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
}
}
+static bool qd_changed(struct gfs2_sbd *sdp)
+{
+ struct gfs2_quota_data *qd;
+ bool changed = false;
+
+ spin_lock(&qd_lock);
+ list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) {
+ if (test_bit(QDF_LOCKED, &qd->qd_flags) ||
+ !test_bit(QDF_CHANGE, &qd->qd_flags))
+ continue;
+
+ changed = true;
+ break;
+ }
+ spin_unlock(&qd_lock);
+ return changed;
+}
+
int gfs2_quota_sync(struct super_block *sb, int type)
{
struct gfs2_sbd *sdp = sb->s_fs_info;
@@ -1297,6 +1321,9 @@ int gfs2_quota_sync(struct super_block *sb, int type)
unsigned int x;
int error = 0;
+ if (!qd_changed(sdp))
+ return 0;
+
qda = kcalloc(max_qd, sizeof(struct gfs2_quota_data *), GFP_KERNEL);
if (!qda)
return -ENOMEM;
@@ -1318,10 +1345,6 @@ int gfs2_quota_sync(struct super_block *sb, int type)
if (num_qd) {
if (!error)
error = do_sync(num_qd, qda);
- if (!error)
- for (x = 0; x < num_qd; x++)
- qda[x]->qd_sync_gen =
- sdp->sd_quota_sync_gen;
for (x = 0; x < num_qd; x++)
qd_unlock(qda[x]);
@@ -1423,7 +1446,7 @@ int gfs2_quota_init(struct gfs2_sbd *sdp)
set_bit(QDF_CHANGE, &qd->qd_flags);
qd->qd_change = qc_change;
qd->qd_slot = slot;
- qd->qd_slot_count = 1;
+ qd->qd_slot_ref = 1;
spin_lock(&qd_lock);
BUG_ON(test_and_set_bit(slot, sdp->sd_quota_bitmap));
@@ -1455,36 +1478,35 @@ fail:
void gfs2_quota_cleanup(struct gfs2_sbd *sdp)
{
- struct list_head *head = &sdp->sd_quota_list;
struct gfs2_quota_data *qd;
+ LIST_HEAD(dispose);
+ int count;
- spin_lock(&qd_lock);
- while (!list_empty(head)) {
- qd = list_last_entry(head, struct gfs2_quota_data, qd_list);
+ BUG_ON(test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags));
- list_del(&qd->qd_list);
+ spin_lock(&qd_lock);
+ list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) {
+ spin_lock(&qd->qd_lockref.lock);
+ if (qd->qd_lockref.count != 0) {
+ spin_unlock(&qd->qd_lockref.lock);
+ continue;
+ }
+ lockref_mark_dead(&qd->qd_lockref);
+ spin_unlock(&qd->qd_lockref.lock);
- /* Also remove if this qd exists in the reclaim list */
list_lru_del(&gfs2_qd_lru, &qd->qd_lru);
- atomic_dec(&sdp->sd_quota_count);
- spin_unlock(&qd_lock);
-
- spin_lock_bucket(qd->qd_hash);
- hlist_bl_del_rcu(&qd->qd_hlist);
- spin_unlock_bucket(qd->qd_hash);
-
- gfs2_assert_warn(sdp, !qd->qd_change);
- gfs2_assert_warn(sdp, !qd->qd_slot_count);
- gfs2_assert_warn(sdp, !qd->qd_bh_count);
-
- gfs2_glock_put(qd->qd_gl);
- call_rcu(&qd->qd_rcu, gfs2_qd_dealloc);
-
- spin_lock(&qd_lock);
+ list_add(&qd->qd_lru, &dispose);
}
spin_unlock(&qd_lock);
- gfs2_assert_warn(sdp, !atomic_read(&sdp->sd_quota_count));
+ gfs2_qd_list_dispose(&dispose);
+
+ wait_event_timeout(sdp->sd_kill_wait,
+ (count = atomic_read(&sdp->sd_quota_count)) == 0,
+ HZ * 60);
+
+ if (count != 0)
+ fs_err(sdp, "%d left-over quota data objects\n", count);
kvfree(sdp->sd_quota_bitmap);
sdp->sd_quota_bitmap = NULL;
@@ -1536,12 +1558,11 @@ int gfs2_quotad(void *data)
unsigned long statfs_timeo = 0;
unsigned long quotad_timeo = 0;
unsigned long t = 0;
- DEFINE_WAIT(wait);
while (!kthread_should_stop()) {
-
if (gfs2_withdrawn(sdp))
- goto bypass;
+ break;
+
/* Update the master statfs file */
if (sdp->sd_statfs_force_sync) {
int error = gfs2_statfs_sync(sdp->sd_vfs, 0);
@@ -1559,15 +1580,16 @@ int gfs2_quotad(void *data)
try_to_freeze();
-bypass:
t = min(quotad_timeo, statfs_timeo);
- prepare_to_wait(&sdp->sd_quota_wait, &wait, TASK_INTERRUPTIBLE);
- if (!sdp->sd_statfs_force_sync)
- t -= schedule_timeout(t);
- else
+ t = wait_event_interruptible_timeout(sdp->sd_quota_wait,
+ sdp->sd_statfs_force_sync ||
+ gfs2_withdrawn(sdp) ||
+ kthread_should_stop(),
+ t);
+
+ if (sdp->sd_statfs_force_sync)
t = 0;
- finish_wait(&sdp->sd_quota_wait, &wait);
}
return 0;
@@ -1580,6 +1602,8 @@ static int gfs2_quota_get_state(struct super_block *sb, struct qc_state *state)
memset(state, 0, sizeof(*state));
switch (sdp->sd_args.ar_quota) {
+ case GFS2_QUOTA_QUIET:
+ fallthrough;
case GFS2_QUOTA_ON:
state->s_state[USRQUOTA].flags |= QCI_LIMITS_ENFORCED;
state->s_state[GRPQUOTA].flags |= QCI_LIMITS_ENFORCED;
@@ -1726,7 +1750,7 @@ static int gfs2_set_dqblk(struct super_block *sb, struct kqid qid,
goto out_release;
/* Apply changes */
- error = gfs2_adjust_quota(ip, offset, 0, qd, fdq);
+ error = gfs2_adjust_quota(sdp, offset, 0, qd, fdq);
if (!error)
clear_bit(QDF_QMSG_QUIET, &qd->qd_flags);
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
index 21ada332d555..1429945215a0 100644
--- a/fs/gfs2/quota.h
+++ b/fs/gfs2/quota.h
@@ -50,7 +50,8 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip,
ret = gfs2_quota_lock(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE);
if (ret)
return ret;
- if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON)
+ if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON &&
+ sdp->sd_args.ar_quota != GFS2_QUOTA_QUIET)
return 0;
ret = gfs2_quota_check(ip, ip->i_inode.i_uid, ip->i_inode.i_gid, ap);
if (ret)
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index 9c7a9f640bad..5aae02669a40 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -27,7 +27,7 @@
#include "util.h"
#include "dir.h"
-struct workqueue_struct *gfs_recovery_wq;
+struct workqueue_struct *gfs2_recovery_wq;
int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk,
struct buffer_head **bh)
@@ -570,7 +570,7 @@ int gfs2_recover_journal(struct gfs2_jdesc *jd, bool wait)
return -EBUSY;
/* we have JDF_RECOVERY, queue should always succeed */
- rv = queue_work(gfs_recovery_wq, &jd->jd_work);
+ rv = queue_work(gfs2_recovery_wq, &jd->jd_work);
BUG_ON(!rv);
if (wait)
diff --git a/fs/gfs2/recovery.h b/fs/gfs2/recovery.h
index 0d30f8e804f4..7a0c9d0b7503 100644
--- a/fs/gfs2/recovery.h
+++ b/fs/gfs2/recovery.h
@@ -9,7 +9,7 @@
#include "incore.h"
-extern struct workqueue_struct *gfs_recovery_wq;
+extern struct workqueue_struct *gfs2_recovery_wq;
static inline void gfs2_replay_incr_blk(struct gfs2_jdesc *jd, u32 *blk)
{
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 9f4d5d6549ee..02d93da21b2b 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -412,7 +412,7 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
str->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(inode));
str->di_atime = cpu_to_be64(inode->i_atime.tv_sec);
str->di_mtime = cpu_to_be64(inode->i_mtime.tv_sec);
- str->di_ctime = cpu_to_be64(inode->i_ctime.tv_sec);
+ str->di_ctime = cpu_to_be64(inode_get_ctime(inode).tv_sec);
str->di_goal_meta = cpu_to_be64(ip->i_goal);
str->di_goal_data = cpu_to_be64(ip->i_goal);
@@ -429,7 +429,7 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
str->di_eattr = cpu_to_be64(ip->i_eattr);
str->di_atime_nsec = cpu_to_be32(inode->i_atime.tv_nsec);
str->di_mtime_nsec = cpu_to_be32(inode->i_mtime.tv_nsec);
- str->di_ctime_nsec = cpu_to_be32(inode->i_ctime.tv_nsec);
+ str->di_ctime_nsec = cpu_to_be32(inode_get_ctime(inode).tv_nsec);
}
/**
@@ -546,20 +546,10 @@ void gfs2_make_fs_ro(struct gfs2_sbd *sdp)
{
int log_write_allowed = test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
- if (!test_bit(SDF_DEACTIVATING, &sdp->sd_flags))
+ if (!test_bit(SDF_KILL, &sdp->sd_flags))
gfs2_flush_delete_work(sdp);
- if (!log_write_allowed && current == sdp->sd_quotad_process)
- fs_warn(sdp, "The quotad daemon is withdrawing.\n");
- else if (sdp->sd_quotad_process)
- kthread_stop(sdp->sd_quotad_process);
- sdp->sd_quotad_process = NULL;
-
- if (!log_write_allowed && current == sdp->sd_logd_process)
- fs_warn(sdp, "The logd daemon is withdrawing.\n");
- else if (sdp->sd_logd_process)
- kthread_stop(sdp->sd_logd_process);
- sdp->sd_logd_process = NULL;
+ gfs2_destroy_threads(sdp);
if (log_write_allowed) {
gfs2_quota_sync(sdp->sd_vfs, 0);
@@ -580,15 +570,8 @@ void gfs2_make_fs_ro(struct gfs2_sbd *sdp)
gfs2_log_is_empty(sdp),
HZ * 5);
gfs2_assert_warn(sdp, gfs2_log_is_empty(sdp));
- } else {
- wait_event_timeout(sdp->sd_log_waitq,
- gfs2_log_is_empty(sdp),
- HZ * 5);
}
gfs2_quota_cleanup(sdp);
-
- if (!log_write_allowed)
- sdp->sd_vfs->s_flags |= SB_RDONLY;
}
/**
@@ -622,6 +605,10 @@ restart:
if (!sb_rdonly(sb)) {
gfs2_make_fs_ro(sdp);
}
+ if (gfs2_withdrawn(sdp)) {
+ gfs2_destroy_threads(sdp);
+ gfs2_quota_cleanup(sdp);
+ }
WARN_ON(gfs2_withdrawing(sdp));
/* At this point, we're through modifying the disk */
@@ -689,7 +676,7 @@ static int gfs2_freeze_locally(struct gfs2_sbd *sdp)
struct super_block *sb = sdp->sd_vfs;
int error;
- error = freeze_super(sb);
+ error = freeze_super(sb, FREEZE_HOLDER_USERSPACE);
if (error)
return error;
@@ -697,7 +684,9 @@ static int gfs2_freeze_locally(struct gfs2_sbd *sdp)
gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_FREEZE |
GFS2_LFC_FREEZE_GO_SYNC);
if (gfs2_withdrawn(sdp)) {
- thaw_super(sb);
+ error = thaw_super(sb, FREEZE_HOLDER_USERSPACE);
+ if (error)
+ return error;
return -EIO;
}
}
@@ -712,7 +701,7 @@ static int gfs2_do_thaw(struct gfs2_sbd *sdp)
error = gfs2_freeze_lock_shared(sdp);
if (error)
goto fail;
- error = thaw_super(sb);
+ error = thaw_super(sb, FREEZE_HOLDER_USERSPACE);
if (!error)
return 0;
@@ -761,7 +750,7 @@ out:
*
*/
-static int gfs2_freeze_super(struct super_block *sb)
+static int gfs2_freeze_super(struct super_block *sb, enum freeze_holder who)
{
struct gfs2_sbd *sdp = sb->s_fs_info;
int error;
@@ -816,7 +805,7 @@ out:
*
*/
-static int gfs2_thaw_super(struct super_block *sb)
+static int gfs2_thaw_super(struct super_block *sb, enum freeze_holder who)
{
struct gfs2_sbd *sdp = sb->s_fs_info;
int error;
@@ -1132,6 +1121,9 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root)
case GFS2_QUOTA_ON:
state = "on";
break;
+ case GFS2_QUOTA_QUIET:
+ state = "quiet";
+ break;
default:
state = "unknown";
break;
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
index bba58629bc45..ab9c83106932 100644
--- a/fs/gfs2/super.h
+++ b/fs/gfs2/super.h
@@ -36,6 +36,7 @@ extern int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename,
extern int gfs2_make_fs_rw(struct gfs2_sbd *sdp);
extern void gfs2_make_fs_ro(struct gfs2_sbd *sdp);
extern void gfs2_online_uevent(struct gfs2_sbd *sdp);
+extern void gfs2_destroy_threads(struct gfs2_sbd *sdp);
extern int gfs2_statfs_init(struct gfs2_sbd *sdp);
extern void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free,
s64 dinodes);
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 2dfbe2f188dd..60a0206890c5 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -98,7 +98,10 @@ static ssize_t status_show(struct gfs2_sbd *sdp, char *buf)
"sd_log_flush_head: %d\n"
"sd_log_flush_tail: %d\n"
"sd_log_blks_reserved: %d\n"
- "sd_log_revokes_available: %d\n",
+ "sd_log_revokes_available: %d\n"
+ "sd_log_pinned: %d\n"
+ "sd_log_thresh1: %d\n"
+ "sd_log_thresh2: %d\n",
test_bit(SDF_JOURNAL_CHECKED, &f),
test_bit(SDF_JOURNAL_LIVE, &f),
(sdp->sd_jdesc ? sdp->sd_jdesc->jd_jid : 0),
@@ -118,7 +121,7 @@ static ssize_t status_show(struct gfs2_sbd *sdp, char *buf)
test_bit(SDF_WITHDRAW_IN_PROG, &f),
test_bit(SDF_REMOTE_WITHDRAW, &f),
test_bit(SDF_WITHDRAW_RECOVERY, &f),
- test_bit(SDF_DEACTIVATING, &f),
+ test_bit(SDF_KILL, &f),
sdp->sd_log_error,
rwsem_is_locked(&sdp->sd_log_flush_lock),
sdp->sd_log_num_revoke,
@@ -128,7 +131,10 @@ static ssize_t status_show(struct gfs2_sbd *sdp, char *buf)
sdp->sd_log_flush_head,
sdp->sd_log_flush_tail,
sdp->sd_log_blks_reserved,
- atomic_read(&sdp->sd_log_revokes_available));
+ atomic_read(&sdp->sd_log_revokes_available),
+ atomic_read(&sdp->sd_log_pinned),
+ atomic_read(&sdp->sd_log_thresh1),
+ atomic_read(&sdp->sd_log_thresh2));
return s;
}
@@ -168,10 +174,10 @@ static ssize_t freeze_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
switch (n) {
case 0:
- error = thaw_super(sdp->sd_vfs);
+ error = thaw_super(sdp->sd_vfs, FREEZE_HOLDER_USERSPACE);
break;
case 1:
- error = freeze_super(sdp->sd_vfs);
+ error = freeze_super(sdp->sd_vfs, FREEZE_HOLDER_USERSPACE);
break;
default:
return -EINVAL;
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
index dac22b1c1a2e..da29fafb6272 100644
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -9,6 +9,7 @@
#include <linux/spinlock.h>
#include <linux/completion.h>
#include <linux/buffer_head.h>
+#include <linux/kthread.h>
#include <linux/crc32.h>
#include <linux/gfs2_ondisk.h>
#include <linux/delay.h>
@@ -150,7 +151,14 @@ static void signal_our_withdraw(struct gfs2_sbd *sdp)
if (!sb_rdonly(sdp->sd_vfs)) {
bool locked = mutex_trylock(&sdp->sd_freeze_mutex);
- gfs2_make_fs_ro(sdp);
+ wake_up(&sdp->sd_logd_waitq);
+ wake_up(&sdp->sd_quota_wait);
+
+ wait_event_timeout(sdp->sd_log_waitq,
+ gfs2_log_is_empty(sdp),
+ HZ * 5);
+
+ sdp->sd_vfs->s_flags |= SB_RDONLY;
if (locked)
mutex_unlock(&sdp->sd_freeze_mutex);
@@ -315,19 +323,19 @@ int gfs2_withdraw(struct gfs2_sbd *sdp)
struct lm_lockstruct *ls = &sdp->sd_lockstruct;
const struct lm_lockops *lm = ls->ls_ops;
- if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW &&
- test_and_set_bit(SDF_WITHDRAWN, &sdp->sd_flags)) {
- if (!test_bit(SDF_WITHDRAW_IN_PROG, &sdp->sd_flags))
- return -1;
-
- wait_on_bit(&sdp->sd_flags, SDF_WITHDRAW_IN_PROG,
- TASK_UNINTERRUPTIBLE);
- return -1;
- }
-
- set_bit(SDF_WITHDRAW_IN_PROG, &sdp->sd_flags);
-
if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW) {
+ unsigned long old = READ_ONCE(sdp->sd_flags), new;
+
+ do {
+ if (old & BIT(SDF_WITHDRAWN)) {
+ wait_on_bit(&sdp->sd_flags,
+ SDF_WITHDRAW_IN_PROG,
+ TASK_UNINTERRUPTIBLE);
+ return -1;
+ }
+ new = old | BIT(SDF_WITHDRAWN) | BIT(SDF_WITHDRAW_IN_PROG);
+ } while (unlikely(!try_cmpxchg(&sdp->sd_flags, &old, new)));
+
fs_err(sdp, "about to withdraw this file system\n");
BUG_ON(sdp->sd_args.ar_debug);
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index 93b36d026bb4..4fea70c0fe3d 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -311,7 +311,7 @@ static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
ea->ea_num_ptrs = 0;
}
- ip->i_inode.i_ctime = current_time(&ip->i_inode);
+ inode_set_ctime_current(&ip->i_inode);
__mark_inode_dirty(&ip->i_inode, I_DIRTY_DATASYNC);
gfs2_trans_end(sdp);
@@ -763,7 +763,7 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
if (error)
goto out_end_trans;
- ip->i_inode.i_ctime = current_time(&ip->i_inode);
+ inode_set_ctime_current(&ip->i_inode);
__mark_inode_dirty(&ip->i_inode, I_DIRTY_DATASYNC);
out_end_trans:
@@ -888,7 +888,7 @@ static int ea_set_simple_noalloc(struct gfs2_inode *ip, struct buffer_head *bh,
if (es->es_el)
ea_set_remove_stuffed(ip, es->es_el);
- ip->i_inode.i_ctime = current_time(&ip->i_inode);
+ inode_set_ctime_current(&ip->i_inode);
__mark_inode_dirty(&ip->i_inode, I_DIRTY_DATASYNC);
gfs2_trans_end(GFS2_SB(&ip->i_inode));
@@ -1106,7 +1106,7 @@ static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el)
ea->ea_type = GFS2_EATYPE_UNUSED;
}
- ip->i_inode.i_ctime = current_time(&ip->i_inode);
+ inode_set_ctime_current(&ip->i_inode);
__mark_inode_dirty(&ip->i_inode, I_DIRTY_DATASYNC);
gfs2_trans_end(GFS2_SB(&ip->i_inode));
diff --git a/fs/hfs/Kconfig b/fs/hfs/Kconfig
index d985066006d5..5ea5cd8ecea9 100644
--- a/fs/hfs/Kconfig
+++ b/fs/hfs/Kconfig
@@ -2,6 +2,7 @@
config HFS_FS
tristate "Apple Macintosh file system support"
depends on BLOCK
+ select BUFFER_HEAD
select NLS
select LEGACY_DIRECT_IO
help
diff --git a/fs/hfs/catalog.c b/fs/hfs/catalog.c
index d365bf0b8c77..632c226a3972 100644
--- a/fs/hfs/catalog.c
+++ b/fs/hfs/catalog.c
@@ -133,7 +133,7 @@ int hfs_cat_create(u32 cnid, struct inode *dir, const struct qstr *str, struct i
goto err1;
dir->i_size++;
- dir->i_mtime = dir->i_ctime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
mark_inode_dirty(dir);
hfs_find_exit(&fd);
return 0;
@@ -269,7 +269,7 @@ int hfs_cat_delete(u32 cnid, struct inode *dir, const struct qstr *str)
}
dir->i_size--;
- dir->i_mtime = dir->i_ctime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
mark_inode_dirty(dir);
res = 0;
out:
@@ -337,7 +337,7 @@ int hfs_cat_move(u32 cnid, struct inode *src_dir, const struct qstr *src_name,
if (err)
goto out;
dst_dir->i_size++;
- dst_dir->i_mtime = dst_dir->i_ctime = current_time(dst_dir);
+ dst_dir->i_mtime = inode_set_ctime_current(dst_dir);
mark_inode_dirty(dst_dir);
/* finally remove the old entry */
@@ -349,7 +349,7 @@ int hfs_cat_move(u32 cnid, struct inode *src_dir, const struct qstr *src_name,
if (err)
goto out;
src_dir->i_size--;
- src_dir->i_mtime = src_dir->i_ctime = current_time(src_dir);
+ src_dir->i_mtime = inode_set_ctime_current(src_dir);
mark_inode_dirty(src_dir);
type = entry.type;
diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c
index 3e1e3dcf0b48..b75c26045df4 100644
--- a/fs/hfs/dir.c
+++ b/fs/hfs/dir.c
@@ -263,7 +263,7 @@ static int hfs_remove(struct inode *dir, struct dentry *dentry)
if (res)
return res;
clear_nlink(inode);
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
hfs_delete_inode(inode);
mark_inode_dirty(inode);
return 0;
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 441d7fc952e3..ee349b72cfb3 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -200,7 +200,7 @@ struct inode *hfs_new_inode(struct inode *dir, const struct qstr *name, umode_t
inode->i_uid = current_fsuid();
inode->i_gid = current_fsgid();
set_nlink(inode, 1);
- inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
HFS_I(inode)->flags = 0;
HFS_I(inode)->rsrc_inode = NULL;
HFS_I(inode)->fs_blocks = 0;
@@ -355,8 +355,8 @@ static int hfs_read_inode(struct inode *inode, void *data)
inode->i_mode |= S_IWUGO;
inode->i_mode &= ~hsb->s_file_umask;
inode->i_mode |= S_IFREG;
- inode->i_ctime = inode->i_atime = inode->i_mtime =
- hfs_m_to_utime(rec->file.MdDat);
+ inode->i_atime = inode->i_mtime = inode_set_ctime_to_ts(inode,
+ hfs_m_to_utime(rec->file.MdDat));
inode->i_op = &hfs_file_inode_operations;
inode->i_fop = &hfs_file_operations;
inode->i_mapping->a_ops = &hfs_aops;
@@ -366,8 +366,8 @@ static int hfs_read_inode(struct inode *inode, void *data)
inode->i_size = be16_to_cpu(rec->dir.Val) + 2;
HFS_I(inode)->fs_blocks = 0;
inode->i_mode = S_IFDIR | (S_IRWXUGO & ~hsb->s_dir_umask);
- inode->i_ctime = inode->i_atime = inode->i_mtime =
- hfs_m_to_utime(rec->dir.MdDat);
+ inode->i_atime = inode->i_mtime = inode_set_ctime_to_ts(inode,
+ hfs_m_to_utime(rec->dir.MdDat));
inode->i_op = &hfs_dir_inode_operations;
inode->i_fop = &hfs_dir_operations;
break;
@@ -654,8 +654,7 @@ int hfs_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
truncate_setsize(inode, attr->ia_size);
hfs_file_truncate(inode);
- inode->i_atime = inode->i_mtime = inode->i_ctime =
- current_time(inode);
+ inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
}
setattr_copy(&nop_mnt_idmap, inode, attr);
diff --git a/fs/hfs/sysdep.c b/fs/hfs/sysdep.c
index 2875961fdc10..dc27d418fbcd 100644
--- a/fs/hfs/sysdep.c
+++ b/fs/hfs/sysdep.c
@@ -28,7 +28,9 @@ static int hfs_revalidate_dentry(struct dentry *dentry, unsigned int flags)
/* fix up inode on a timezone change */
diff = sys_tz.tz_minuteswest * 60 - HFS_I(inode)->tz_secondswest;
if (diff) {
- inode->i_ctime.tv_sec += diff;
+ struct timespec64 ctime = inode_get_ctime(inode);
+
+ inode_set_ctime(inode, ctime.tv_sec + diff, ctime.tv_nsec);
inode->i_atime.tv_sec += diff;
inode->i_mtime.tv_sec += diff;
HFS_I(inode)->tz_secondswest += diff;
diff --git a/fs/hfsplus/Kconfig b/fs/hfsplus/Kconfig
index 8034e7827a69..8ce4a33a9ac7 100644
--- a/fs/hfsplus/Kconfig
+++ b/fs/hfsplus/Kconfig
@@ -2,6 +2,7 @@
config HFSPLUS_FS
tristate "Apple Extended HFS file system support"
depends on BLOCK
+ select BUFFER_HEAD
select NLS
select NLS_UTF8
select LEGACY_DIRECT_IO
diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c
index 35472cba750e..e71ae2537eaa 100644
--- a/fs/hfsplus/catalog.c
+++ b/fs/hfsplus/catalog.c
@@ -312,7 +312,7 @@ int hfsplus_create_cat(u32 cnid, struct inode *dir,
dir->i_size++;
if (S_ISDIR(inode->i_mode))
hfsplus_subfolders_inc(dir);
- dir->i_mtime = dir->i_ctime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
hfsplus_mark_inode_dirty(dir, HFSPLUS_I_CAT_DIRTY);
hfs_find_exit(&fd);
@@ -417,7 +417,7 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, const struct qstr *str)
dir->i_size--;
if (type == HFSPLUS_FOLDER)
hfsplus_subfolders_dec(dir);
- dir->i_mtime = dir->i_ctime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
hfsplus_mark_inode_dirty(dir, HFSPLUS_I_CAT_DIRTY);
if (type == HFSPLUS_FILE || type == HFSPLUS_FOLDER) {
@@ -494,7 +494,7 @@ int hfsplus_rename_cat(u32 cnid,
dst_dir->i_size++;
if (type == HFSPLUS_FOLDER)
hfsplus_subfolders_inc(dst_dir);
- dst_dir->i_mtime = dst_dir->i_ctime = current_time(dst_dir);
+ dst_dir->i_mtime = inode_set_ctime_current(dst_dir);
/* finally remove the old entry */
err = hfsplus_cat_build_key(sb, src_fd.search_key,
@@ -511,7 +511,7 @@ int hfsplus_rename_cat(u32 cnid,
src_dir->i_size--;
if (type == HFSPLUS_FOLDER)
hfsplus_subfolders_dec(src_dir);
- src_dir->i_mtime = src_dir->i_ctime = current_time(src_dir);
+ src_dir->i_mtime = inode_set_ctime_current(src_dir);
/* remove old thread entry */
hfsplus_cat_build_key_with_cnid(sb, src_fd.search_key, cnid);
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index 56fb5f1312e7..f5c4b3e31a1c 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -346,7 +346,7 @@ static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir,
inc_nlink(inode);
hfsplus_instantiate(dst_dentry, inode, cnid);
ihold(inode);
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
mark_inode_dirty(inode);
sbi->file_count++;
hfsplus_mark_mdb_dirty(dst_dir->i_sb);
@@ -405,7 +405,7 @@ static int hfsplus_unlink(struct inode *dir, struct dentry *dentry)
hfsplus_delete_inode(inode);
} else
sbi->file_count--;
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
mark_inode_dirty(inode);
out:
mutex_unlock(&sbi->vh_mutex);
@@ -426,7 +426,7 @@ static int hfsplus_rmdir(struct inode *dir, struct dentry *dentry)
if (res)
goto out;
clear_nlink(inode);
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
hfsplus_delete_inode(inode);
mark_inode_dirty(inode);
out:
diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c
index 7a542f3dbe50..3c572e44f2ad 100644
--- a/fs/hfsplus/extents.c
+++ b/fs/hfsplus/extents.c
@@ -448,9 +448,9 @@ int hfsplus_file_extend(struct inode *inode, bool zeroout)
if (sbi->alloc_file->i_size * 8 <
sbi->total_blocks - sbi->free_blocks + 8) {
/* extend alloc file */
- pr_err("extend alloc file! (%llu,%u,%u)\n",
- sbi->alloc_file->i_size * 8,
- sbi->total_blocks, sbi->free_blocks);
+ pr_err_ratelimited("extend alloc file! (%llu,%u,%u)\n",
+ sbi->alloc_file->i_size * 8,
+ sbi->total_blocks, sbi->free_blocks);
return -ENOSPC;
}
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 7d1a675e037d..c65c8c4b03dd 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -267,7 +267,7 @@ static int hfsplus_setattr(struct mnt_idmap *idmap,
}
truncate_setsize(inode, attr->ia_size);
hfsplus_file_truncate(inode);
- inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
}
setattr_copy(&nop_mnt_idmap, inode, attr);
@@ -298,7 +298,7 @@ int hfsplus_getattr(struct mnt_idmap *idmap, const struct path *path,
stat->attributes_mask |= STATX_ATTR_APPEND | STATX_ATTR_IMMUTABLE |
STATX_ATTR_NODUMP;
- generic_fillattr(&nop_mnt_idmap, inode, stat);
+ generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
return 0;
}
@@ -392,7 +392,7 @@ struct inode *hfsplus_new_inode(struct super_block *sb, struct inode *dir,
inode->i_ino = sbi->next_cnid++;
inode_init_owner(&nop_mnt_idmap, inode, dir, mode);
set_nlink(inode, 1);
- inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
hip = HFSPLUS_I(inode);
INIT_LIST_HEAD(&hip->open_dir_list);
@@ -523,7 +523,8 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
inode->i_size = 2 + be32_to_cpu(folder->valence);
inode->i_atime = hfsp_mt2ut(folder->access_date);
inode->i_mtime = hfsp_mt2ut(folder->content_mod_date);
- inode->i_ctime = hfsp_mt2ut(folder->attribute_mod_date);
+ inode_set_ctime_to_ts(inode,
+ hfsp_mt2ut(folder->attribute_mod_date));
HFSPLUS_I(inode)->create_date = folder->create_date;
HFSPLUS_I(inode)->fs_blocks = 0;
if (folder->flags & cpu_to_be16(HFSPLUS_HAS_FOLDER_COUNT)) {
@@ -564,7 +565,8 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
}
inode->i_atime = hfsp_mt2ut(file->access_date);
inode->i_mtime = hfsp_mt2ut(file->content_mod_date);
- inode->i_ctime = hfsp_mt2ut(file->attribute_mod_date);
+ inode_set_ctime_to_ts(inode,
+ hfsp_mt2ut(file->attribute_mod_date));
HFSPLUS_I(inode)->create_date = file->create_date;
} else {
pr_err("bad catalog entry used to create inode\n");
@@ -609,7 +611,7 @@ int hfsplus_cat_write_inode(struct inode *inode)
hfsplus_cat_set_perms(inode, &folder->permissions);
folder->access_date = hfsp_ut2mt(inode->i_atime);
folder->content_mod_date = hfsp_ut2mt(inode->i_mtime);
- folder->attribute_mod_date = hfsp_ut2mt(inode->i_ctime);
+ folder->attribute_mod_date = hfsp_ut2mt(inode_get_ctime(inode));
folder->valence = cpu_to_be32(inode->i_size - 2);
if (folder->flags & cpu_to_be16(HFSPLUS_HAS_FOLDER_COUNT)) {
folder->subfolders =
@@ -644,7 +646,7 @@ int hfsplus_cat_write_inode(struct inode *inode)
file->flags &= cpu_to_be16(~HFSPLUS_FILE_LOCKED);
file->access_date = hfsp_ut2mt(inode->i_atime);
file->content_mod_date = hfsp_ut2mt(inode->i_mtime);
- file->attribute_mod_date = hfsp_ut2mt(inode->i_ctime);
+ file->attribute_mod_date = hfsp_ut2mt(inode_get_ctime(inode));
hfs_bnode_write(fd.bnode, &entry, fd.entryoffset,
sizeof(struct hfsplus_cat_file));
}
@@ -700,7 +702,7 @@ int hfsplus_fileattr_set(struct mnt_idmap *idmap,
else
hip->userflags &= ~HFSPLUS_FLG_NODUMP;
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
mark_inode_dirty(inode);
return 0;
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 46387090eb76..dc5a5cea5fae 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -517,8 +517,7 @@ static int hostfs_inode_update(struct inode *ino, const struct hostfs_stat *st)
(struct timespec64){ st->atime.tv_sec, st->atime.tv_nsec };
ino->i_mtime =
(struct timespec64){ st->mtime.tv_sec, st->mtime.tv_nsec };
- ino->i_ctime =
- (struct timespec64){ st->ctime.tv_sec, st->ctime.tv_nsec };
+ inode_set_ctime(ino, st->ctime.tv_sec, st->ctime.tv_nsec);
ino->i_size = st->size;
ino->i_blocks = st->blocks;
return 0;
diff --git a/fs/hpfs/Kconfig b/fs/hpfs/Kconfig
index ec975f466877..ac1e9318e65a 100644
--- a/fs/hpfs/Kconfig
+++ b/fs/hpfs/Kconfig
@@ -2,6 +2,7 @@
config HPFS_FS
tristate "OS/2 HPFS file system support"
depends on BLOCK
+ select BUFFER_HEAD
select FS_IOMAP
help
OS/2 is IBM's operating system for PC's, the same as Warp, and HPFS
diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c
index f32f15669996..f36566d61215 100644
--- a/fs/hpfs/dir.c
+++ b/fs/hpfs/dir.c
@@ -277,10 +277,10 @@ struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, unsigned in
* inode.
*/
- if (!result->i_ctime.tv_sec) {
- if (!(result->i_ctime.tv_sec = local_to_gmt(dir->i_sb, le32_to_cpu(de->creation_date))))
- result->i_ctime.tv_sec = 1;
- result->i_ctime.tv_nsec = 0;
+ if (!inode_get_ctime(result).tv_sec) {
+ time64_t csec = local_to_gmt(dir->i_sb, le32_to_cpu(de->creation_date));
+
+ inode_set_ctime(result, csec ? csec : 1, 0);
result->i_mtime.tv_sec = local_to_gmt(dir->i_sb, le32_to_cpu(de->write_date));
result->i_mtime.tv_nsec = 0;
result->i_atime.tv_sec = local_to_gmt(dir->i_sb, le32_to_cpu(de->read_date));
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index e50e92a42432..479166378bae 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -36,7 +36,7 @@ void hpfs_init_inode(struct inode *i)
hpfs_inode->i_rddir_off = NULL;
hpfs_inode->i_dirty = 0;
- i->i_ctime.tv_sec = i->i_ctime.tv_nsec = 0;
+ inode_set_ctime(i, 0, 0);
i->i_mtime.tv_sec = i->i_mtime.tv_nsec = 0;
i->i_atime.tv_sec = i->i_atime.tv_nsec = 0;
}
@@ -232,7 +232,7 @@ void hpfs_write_inode_nolock(struct inode *i)
if (de) {
de->write_date = cpu_to_le32(gmt_to_local(i->i_sb, i->i_mtime.tv_sec));
de->read_date = cpu_to_le32(gmt_to_local(i->i_sb, i->i_atime.tv_sec));
- de->creation_date = cpu_to_le32(gmt_to_local(i->i_sb, i->i_ctime.tv_sec));
+ de->creation_date = cpu_to_le32(gmt_to_local(i->i_sb, inode_get_ctime(i).tv_sec));
de->read_only = !(i->i_mode & 0222);
de->ea_size = cpu_to_le32(hpfs_inode->i_ea_size);
hpfs_mark_4buffers_dirty(&qbh);
@@ -242,7 +242,7 @@ void hpfs_write_inode_nolock(struct inode *i)
if ((de = map_dirent(i, hpfs_inode->i_dno, "\001\001", 2, NULL, &qbh))) {
de->write_date = cpu_to_le32(gmt_to_local(i->i_sb, i->i_mtime.tv_sec));
de->read_date = cpu_to_le32(gmt_to_local(i->i_sb, i->i_atime.tv_sec));
- de->creation_date = cpu_to_le32(gmt_to_local(i->i_sb, i->i_ctime.tv_sec));
+ de->creation_date = cpu_to_le32(gmt_to_local(i->i_sb, inode_get_ctime(i).tv_sec));
de->read_only = !(i->i_mode & 0222);
de->ea_size = cpu_to_le32(/*hpfs_inode->i_ea_size*/0);
de->file_size = cpu_to_le32(0);
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index 69fb40b2c99a..f4eb8d6f5989 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -13,10 +13,9 @@ static void hpfs_update_directory_times(struct inode *dir)
{
time64_t t = local_to_gmt(dir->i_sb, local_get_seconds(dir->i_sb));
if (t == dir->i_mtime.tv_sec &&
- t == dir->i_ctime.tv_sec)
+ t == inode_get_ctime(dir).tv_sec)
return;
- dir->i_mtime.tv_sec = dir->i_ctime.tv_sec = t;
- dir->i_mtime.tv_nsec = dir->i_ctime.tv_nsec = 0;
+ dir->i_mtime = inode_set_ctime(dir, t, 0);
hpfs_write_inode_nolock(dir);
}
@@ -59,10 +58,8 @@ static int hpfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
result->i_ino = fno;
hpfs_i(result)->i_parent_dir = dir->i_ino;
hpfs_i(result)->i_dno = dno;
- result->i_ctime.tv_sec = result->i_mtime.tv_sec = result->i_atime.tv_sec = local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date));
- result->i_ctime.tv_nsec = 0;
- result->i_mtime.tv_nsec = 0;
- result->i_atime.tv_nsec = 0;
+ result->i_mtime = result->i_atime =
+ inode_set_ctime(result, local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)), 0);
hpfs_i(result)->i_ea_size = 0;
result->i_mode |= S_IFDIR;
result->i_op = &hpfs_dir_iops;
@@ -167,10 +164,8 @@ static int hpfs_create(struct mnt_idmap *idmap, struct inode *dir,
result->i_fop = &hpfs_file_ops;
set_nlink(result, 1);
hpfs_i(result)->i_parent_dir = dir->i_ino;
- result->i_ctime.tv_sec = result->i_mtime.tv_sec = result->i_atime.tv_sec = local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date));
- result->i_ctime.tv_nsec = 0;
- result->i_mtime.tv_nsec = 0;
- result->i_atime.tv_nsec = 0;
+ result->i_mtime = result->i_atime =
+ inode_set_ctime(result, local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)), 0);
hpfs_i(result)->i_ea_size = 0;
if (dee.read_only)
result->i_mode &= ~0222;
@@ -250,10 +245,8 @@ static int hpfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
hpfs_init_inode(result);
result->i_ino = fno;
hpfs_i(result)->i_parent_dir = dir->i_ino;
- result->i_ctime.tv_sec = result->i_mtime.tv_sec = result->i_atime.tv_sec = local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date));
- result->i_ctime.tv_nsec = 0;
- result->i_mtime.tv_nsec = 0;
- result->i_atime.tv_nsec = 0;
+ result->i_mtime = result->i_atime =
+ inode_set_ctime(result, local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)), 0);
hpfs_i(result)->i_ea_size = 0;
result->i_uid = current_fsuid();
result->i_gid = current_fsgid();
@@ -326,10 +319,8 @@ static int hpfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
result->i_ino = fno;
hpfs_init_inode(result);
hpfs_i(result)->i_parent_dir = dir->i_ino;
- result->i_ctime.tv_sec = result->i_mtime.tv_sec = result->i_atime.tv_sec = local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date));
- result->i_ctime.tv_nsec = 0;
- result->i_mtime.tv_nsec = 0;
- result->i_atime.tv_nsec = 0;
+ result->i_mtime = result->i_atime =
+ inode_set_ctime(result, local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)), 0);
hpfs_i(result)->i_ea_size = 0;
result->i_mode = S_IFLNK | 0777;
result->i_uid = current_fsuid();
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index 1cb89595b875..758a51564124 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -729,8 +729,9 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
root->i_atime.tv_nsec = 0;
root->i_mtime.tv_sec = local_to_gmt(s, le32_to_cpu(de->write_date));
root->i_mtime.tv_nsec = 0;
- root->i_ctime.tv_sec = local_to_gmt(s, le32_to_cpu(de->creation_date));
- root->i_ctime.tv_nsec = 0;
+ inode_set_ctime(root,
+ local_to_gmt(s, le32_to_cpu(de->creation_date)),
+ 0);
hpfs_i(root)->i_ea_size = le32_to_cpu(de->ea_size);
hpfs_i(root)->i_parent_dir = root->i_ino;
if (root->i_size == -1)
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 7b17ccfa039d..316c4cebd3f3 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -283,6 +283,41 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
#endif
/*
+ * Someone wants to read @bytes from a HWPOISON hugetlb @page from @offset.
+ * Returns the maximum number of bytes one can read without touching the 1st raw
+ * HWPOISON subpage.
+ *
+ * The implementation borrows the iteration logic from copy_page_to_iter*.
+ */
+static size_t adjust_range_hwpoison(struct page *page, size_t offset, size_t bytes)
+{
+ size_t n = 0;
+ size_t res = 0;
+
+ /* First subpage to start the loop. */
+ page += offset / PAGE_SIZE;
+ offset %= PAGE_SIZE;
+ while (1) {
+ if (is_raw_hwpoison_page_in_hugepage(page))
+ break;
+
+ /* Safe to read n bytes without touching HWPOISON subpage. */
+ n = min(bytes, (size_t)PAGE_SIZE - offset);
+ res += n;
+ bytes -= n;
+ if (!bytes || !n)
+ break;
+ offset += n;
+ if (offset == PAGE_SIZE) {
+ page++;
+ offset = 0;
+ }
+ }
+
+ return res;
+}
+
+/*
* Support for read() - Find the page attached to f_mapping and copy out the
* data. This provides functionality similar to filemap_read().
*/
@@ -300,7 +335,7 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
while (iov_iter_count(to)) {
struct page *page;
- size_t nr, copied;
+ size_t nr, copied, want;
/* nr is the maximum number of bytes to copy from this page */
nr = huge_page_size(h);
@@ -328,16 +363,26 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
} else {
unlock_page(page);
- if (PageHWPoison(page)) {
- put_page(page);
- retval = -EIO;
- break;
+ if (!PageHWPoison(page))
+ want = nr;
+ else {
+ /*
+ * Adjust how many bytes safe to read without
+ * touching the 1st raw HWPOISON subpage after
+ * offset.
+ */
+ want = adjust_range_hwpoison(page, offset, nr);
+ if (want == 0) {
+ put_page(page);
+ retval = -EIO;
+ break;
+ }
}
/*
* We have the page, copy it to user space buffer.
*/
- copied = copy_page_to_iter(page, offset, nr, to);
+ copied = copy_page_to_iter(page, offset, want, to);
put_page(page);
}
offset += copied;
@@ -887,7 +932,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
i_size_write(inode, offset + len);
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
out:
inode_unlock(inode);
return error;
@@ -935,7 +980,7 @@ static struct inode *hugetlbfs_get_root(struct super_block *sb,
inode->i_mode = S_IFDIR | ctx->mode;
inode->i_uid = ctx->uid;
inode->i_gid = ctx->gid;
- inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
inode->i_op = &hugetlbfs_dir_inode_operations;
inode->i_fop = &simple_dir_operations;
/* directory inodes start off with i_nlink == 2 (for "." entry) */
@@ -979,7 +1024,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb,
lockdep_set_class(&inode->i_mapping->i_mmap_rwsem,
&hugetlbfs_i_mmap_rwsem_key);
inode->i_mapping->a_ops = &hugetlbfs_aops;
- inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
inode->i_mapping->private_data = resv_map;
info->seals = F_SEAL_SEAL;
switch (mode & S_IFMT) {
@@ -1022,7 +1067,7 @@ static int hugetlbfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
inode = hugetlbfs_get_inode(dir->i_sb, dir, mode, dev);
if (!inode)
return -ENOSPC;
- dir->i_ctime = dir->i_mtime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
d_instantiate(dentry, inode);
dget(dentry);/* Extra count - pin the dentry in core */
return 0;
@@ -1054,7 +1099,7 @@ static int hugetlbfs_tmpfile(struct mnt_idmap *idmap,
inode = hugetlbfs_get_inode(dir->i_sb, dir, mode | S_IFREG, 0);
if (!inode)
return -ENOSPC;
- dir->i_ctime = dir->i_mtime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
d_tmpfile(file, inode);
return finish_open_simple(file, 0);
}
@@ -1076,7 +1121,7 @@ static int hugetlbfs_symlink(struct mnt_idmap *idmap,
} else
iput(inode);
}
- dir->i_ctime = dir->i_mtime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
return error;
}
diff --git a/fs/inode.c b/fs/inode.c
index 67611a360031..84bc3c76e5cc 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -751,16 +751,11 @@ EXPORT_SYMBOL_GPL(evict_inodes);
/**
* invalidate_inodes - attempt to free all inodes on a superblock
* @sb: superblock to operate on
- * @kill_dirty: flag to guide handling of dirty inodes
*
- * Attempts to free all inodes for a given superblock. If there were any
- * busy inodes return a non-zero value, else zero.
- * If @kill_dirty is set, discard dirty inodes too, otherwise treat
- * them as busy.
+ * Attempts to free all inodes (including dirty inodes) for a given superblock.
*/
-int invalidate_inodes(struct super_block *sb, bool kill_dirty)
+void invalidate_inodes(struct super_block *sb)
{
- int busy = 0;
struct inode *inode, *next;
LIST_HEAD(dispose);
@@ -772,14 +767,8 @@ again:
spin_unlock(&inode->i_lock);
continue;
}
- if (inode->i_state & I_DIRTY_ALL && !kill_dirty) {
- spin_unlock(&inode->i_lock);
- busy = 1;
- continue;
- }
if (atomic_read(&inode->i_count)) {
spin_unlock(&inode->i_lock);
- busy = 1;
continue;
}
@@ -797,8 +786,6 @@ again:
spin_unlock(&sb->s_inode_list_lock);
dispose_list(&dispose);
-
- return busy;
}
/*
@@ -1850,6 +1837,7 @@ EXPORT_SYMBOL(bmap);
static int relatime_need_update(struct vfsmount *mnt, struct inode *inode,
struct timespec64 now)
{
+ struct timespec64 ctime;
if (!(mnt->mnt_flags & MNT_RELATIME))
return 1;
@@ -1861,7 +1849,8 @@ static int relatime_need_update(struct vfsmount *mnt, struct inode *inode,
/*
* Is ctime younger than or equal to atime? If yes, update atime:
*/
- if (timespec64_compare(&inode->i_ctime, &inode->i_atime) >= 0)
+ ctime = inode_get_ctime(inode);
+ if (timespec64_compare(&ctime, &inode->i_atime) >= 0)
return 1;
/*
@@ -1876,29 +1865,76 @@ static int relatime_need_update(struct vfsmount *mnt, struct inode *inode,
return 0;
}
-int generic_update_time(struct inode *inode, struct timespec64 *time, int flags)
+/**
+ * inode_update_timestamps - update the timestamps on the inode
+ * @inode: inode to be updated
+ * @flags: S_* flags that needed to be updated
+ *
+ * The update_time function is called when an inode's timestamps need to be
+ * updated for a read or write operation. This function handles updating the
+ * actual timestamps. It's up to the caller to ensure that the inode is marked
+ * dirty appropriately.
+ *
+ * In the case where any of S_MTIME, S_CTIME, or S_VERSION need to be updated,
+ * attempt to update all three of them. S_ATIME updates can be handled
+ * independently of the rest.
+ *
+ * Returns a set of S_* flags indicating which values changed.
+ */
+int inode_update_timestamps(struct inode *inode, int flags)
{
- int dirty_flags = 0;
+ int updated = 0;
+ struct timespec64 now;
+
+ if (flags & (S_MTIME|S_CTIME|S_VERSION)) {
+ struct timespec64 ctime = inode_get_ctime(inode);
- if (flags & (S_ATIME | S_CTIME | S_MTIME)) {
- if (flags & S_ATIME)
- inode->i_atime = *time;
- if (flags & S_CTIME)
- inode->i_ctime = *time;
- if (flags & S_MTIME)
- inode->i_mtime = *time;
-
- if (inode->i_sb->s_flags & SB_LAZYTIME)
- dirty_flags |= I_DIRTY_TIME;
- else
- dirty_flags |= I_DIRTY_SYNC;
+ now = inode_set_ctime_current(inode);
+ if (!timespec64_equal(&now, &ctime))
+ updated |= S_CTIME;
+ if (!timespec64_equal(&now, &inode->i_mtime)) {
+ inode->i_mtime = now;
+ updated |= S_MTIME;
+ }
+ if (IS_I_VERSION(inode) && inode_maybe_inc_iversion(inode, updated))
+ updated |= S_VERSION;
+ } else {
+ now = current_time(inode);
}
- if ((flags & S_VERSION) && inode_maybe_inc_iversion(inode, false))
- dirty_flags |= I_DIRTY_SYNC;
+ if (flags & S_ATIME) {
+ if (!timespec64_equal(&now, &inode->i_atime)) {
+ inode->i_atime = now;
+ updated |= S_ATIME;
+ }
+ }
+ return updated;
+}
+EXPORT_SYMBOL(inode_update_timestamps);
+/**
+ * generic_update_time - update the timestamps on the inode
+ * @inode: inode to be updated
+ * @flags: S_* flags that needed to be updated
+ *
+ * The update_time function is called when an inode's timestamps need to be
+ * updated for a read or write operation. In the case where any of S_MTIME, S_CTIME,
+ * or S_VERSION need to be updated we attempt to update all three of them. S_ATIME
+ * updates can be handled done independently of the rest.
+ *
+ * Returns a S_* mask indicating which fields were updated.
+ */
+int generic_update_time(struct inode *inode, int flags)
+{
+ int updated = inode_update_timestamps(inode, flags);
+ int dirty_flags = 0;
+
+ if (updated & (S_ATIME|S_MTIME|S_CTIME))
+ dirty_flags = inode->i_sb->s_flags & SB_LAZYTIME ? I_DIRTY_TIME : I_DIRTY_SYNC;
+ if (updated & S_VERSION)
+ dirty_flags |= I_DIRTY_SYNC;
__mark_inode_dirty(inode, dirty_flags);
- return 0;
+ return updated;
}
EXPORT_SYMBOL(generic_update_time);
@@ -1906,11 +1942,12 @@ EXPORT_SYMBOL(generic_update_time);
* This does the actual work of updating an inodes time or version. Must have
* had called mnt_want_write() before calling this.
*/
-int inode_update_time(struct inode *inode, struct timespec64 *time, int flags)
+int inode_update_time(struct inode *inode, int flags)
{
if (inode->i_op->update_time)
- return inode->i_op->update_time(inode, time, flags);
- return generic_update_time(inode, time, flags);
+ return inode->i_op->update_time(inode, flags);
+ generic_update_time(inode, flags);
+ return 0;
}
EXPORT_SYMBOL(inode_update_time);
@@ -1962,7 +1999,6 @@ void touch_atime(const struct path *path)
{
struct vfsmount *mnt = path->mnt;
struct inode *inode = d_inode(path->dentry);
- struct timespec64 now;
if (!atime_needs_update(path, inode))
return;
@@ -1981,8 +2017,7 @@ void touch_atime(const struct path *path)
* We may also fail on filesystems that have the ability to make parts
* of the fs read only, e.g. subvolumes in Btrfs.
*/
- now = current_time(inode);
- inode_update_time(inode, &now, S_ATIME);
+ inode_update_time(inode, S_ATIME);
__mnt_drop_write(mnt);
skip_update:
sb_end_write(inode->i_sb);
@@ -2067,18 +2102,21 @@ int file_remove_privs(struct file *file)
}
EXPORT_SYMBOL(file_remove_privs);
-static int inode_needs_update_time(struct inode *inode, struct timespec64 *now)
+static int inode_needs_update_time(struct inode *inode)
{
int sync_it = 0;
+ struct timespec64 now = current_time(inode);
+ struct timespec64 ctime;
/* First try to exhaust all avenues to not sync */
if (IS_NOCMTIME(inode))
return 0;
- if (!timespec64_equal(&inode->i_mtime, now))
+ if (!timespec64_equal(&inode->i_mtime, &now))
sync_it = S_MTIME;
- if (!timespec64_equal(&inode->i_ctime, now))
+ ctime = inode_get_ctime(inode);
+ if (!timespec64_equal(&ctime, &now))
sync_it |= S_CTIME;
if (IS_I_VERSION(inode) && inode_iversion_need_inc(inode))
@@ -2087,15 +2125,14 @@ static int inode_needs_update_time(struct inode *inode, struct timespec64 *now)
return sync_it;
}
-static int __file_update_time(struct file *file, struct timespec64 *now,
- int sync_mode)
+static int __file_update_time(struct file *file, int sync_mode)
{
int ret = 0;
struct inode *inode = file_inode(file);
/* try to update time settings */
if (!__mnt_want_write_file(file)) {
- ret = inode_update_time(inode, now, sync_mode);
+ ret = inode_update_time(inode, sync_mode);
__mnt_drop_write_file(file);
}
@@ -2120,13 +2157,12 @@ int file_update_time(struct file *file)
{
int ret;
struct inode *inode = file_inode(file);
- struct timespec64 now = current_time(inode);
- ret = inode_needs_update_time(inode, &now);
+ ret = inode_needs_update_time(inode);
if (ret <= 0)
return ret;
- return __file_update_time(file, &now, ret);
+ return __file_update_time(file, ret);
}
EXPORT_SYMBOL(file_update_time);
@@ -2149,7 +2185,6 @@ static int file_modified_flags(struct file *file, int flags)
{
int ret;
struct inode *inode = file_inode(file);
- struct timespec64 now = current_time(inode);
/*
* Clear the security bits if the process is not being run by root.
@@ -2162,13 +2197,13 @@ static int file_modified_flags(struct file *file, int flags)
if (unlikely(file->f_mode & FMODE_NOCMTIME))
return 0;
- ret = inode_needs_update_time(inode, &now);
+ ret = inode_needs_update_time(inode);
if (ret <= 0)
return ret;
if (flags & IOCB_NOWAIT)
return -EAGAIN;
- return __file_update_time(file, &now, ret);
+ return __file_update_time(file, ret);
}
/**
@@ -2488,17 +2523,27 @@ struct timespec64 current_time(struct inode *inode)
struct timespec64 now;
ktime_get_coarse_real_ts64(&now);
-
- if (unlikely(!inode->i_sb)) {
- WARN(1, "current_time() called with uninitialized super_block in the inode");
- return now;
- }
-
return timestamp_truncate(now, inode);
}
EXPORT_SYMBOL(current_time);
/**
+ * inode_set_ctime_current - set the ctime to current_time
+ * @inode: inode
+ *
+ * Set the inode->i_ctime to the current value for the inode. Returns
+ * the current value that was assigned to i_ctime.
+ */
+struct timespec64 inode_set_ctime_current(struct inode *inode)
+{
+ struct timespec64 now = current_time(inode);
+
+ inode_set_ctime(inode, now.tv_sec, now.tv_nsec);
+ return now;
+}
+EXPORT_SYMBOL(inode_set_ctime_current);
+
+/**
* in_group_or_capable - check whether caller is CAP_FSETID privileged
* @idmap: idmap of the mount @inode was found from
* @inode: inode to check
diff --git a/fs/internal.h b/fs/internal.h
index f7a3dc111026..d64ae03998cc 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -23,16 +23,10 @@ struct mnt_idmap;
*/
#ifdef CONFIG_BLOCK
extern void __init bdev_cache_init(void);
-
-void emergency_thaw_bdev(struct super_block *sb);
#else
static inline void bdev_cache_init(void)
{
}
-static inline int emergency_thaw_bdev(struct super_block *sb)
-{
- return 0;
-}
#endif /* CONFIG_BLOCK */
/*
@@ -115,7 +109,7 @@ static inline void put_file_access(struct file *file)
* super.c
*/
extern int reconfigure_super(struct fs_context *);
-extern bool trylock_super(struct super_block *sb);
+extern bool super_trylock_shared(struct super_block *sb);
struct super_block *user_get_super(dev_t, bool excl);
void put_super(struct super_block *sb);
extern bool mount_capable(struct fs_context *);
@@ -201,7 +195,7 @@ void lock_two_inodes(struct inode *inode1, struct inode *inode2,
* fs-writeback.c
*/
extern long get_nr_dirty_inodes(void);
-extern int invalidate_inodes(struct super_block *, bool);
+void invalidate_inodes(struct super_block *sb);
/*
* dcache.c
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 5b2481cd4750..f5fd99d6b0d4 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -109,9 +109,6 @@ static int ioctl_fibmap(struct file *filp, int __user *p)
* Returns 0 on success, -errno on error, 1 if this was the last
* extent that will fit in user array.
*/
-#define SET_UNKNOWN_FLAGS (FIEMAP_EXTENT_DELALLOC)
-#define SET_NO_UNMOUNTED_IO_FLAGS (FIEMAP_EXTENT_DATA_ENCRYPTED)
-#define SET_NOT_ALIGNED_FLAGS (FIEMAP_EXTENT_DATA_TAIL|FIEMAP_EXTENT_DATA_INLINE)
int fiemap_fill_next_extent(struct fiemap_extent_info *fieinfo, u64 logical,
u64 phys, u64 len, u32 flags)
{
@@ -127,6 +124,10 @@ int fiemap_fill_next_extent(struct fiemap_extent_info *fieinfo, u64 logical,
if (fieinfo->fi_extents_mapped >= fieinfo->fi_extents_max)
return 1;
+#define SET_UNKNOWN_FLAGS (FIEMAP_EXTENT_DELALLOC)
+#define SET_NO_UNMOUNTED_IO_FLAGS (FIEMAP_EXTENT_DATA_ENCRYPTED)
+#define SET_NOT_ALIGNED_FLAGS (FIEMAP_EXTENT_DATA_TAIL|FIEMAP_EXTENT_DATA_INLINE)
+
if (flags & SET_UNKNOWN_FLAGS)
flags |= FIEMAP_EXTENT_UNKNOWN;
if (flags & SET_NO_UNMOUNTED_IO_FLAGS)
@@ -396,8 +397,8 @@ static int ioctl_fsfreeze(struct file *filp)
/* Freeze */
if (sb->s_op->freeze_super)
- return sb->s_op->freeze_super(sb);
- return freeze_super(sb);
+ return sb->s_op->freeze_super(sb, FREEZE_HOLDER_USERSPACE);
+ return freeze_super(sb, FREEZE_HOLDER_USERSPACE);
}
static int ioctl_fsthaw(struct file *filp)
@@ -409,8 +410,8 @@ static int ioctl_fsthaw(struct file *filp)
/* Thaw */
if (sb->s_op->thaw_super)
- return sb->s_op->thaw_super(sb);
- return thaw_super(sb);
+ return sb->s_op->thaw_super(sb, FREEZE_HOLDER_USERSPACE);
+ return thaw_super(sb, FREEZE_HOLDER_USERSPACE);
}
static int ioctl_file_dedupe_range(struct file *file,
@@ -877,6 +878,9 @@ out:
#ifdef CONFIG_COMPAT
/**
* compat_ptr_ioctl - generic implementation of .compat_ioctl file operation
+ * @file: The file to operate on.
+ * @cmd: The ioctl command number.
+ * @arg: The argument to the ioctl.
*
* This is not normally called as a function, but instead set in struct
* file_operations as
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index aa8967cca1a3..2bc0aa23fde3 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -23,65 +23,169 @@
#define IOEND_BATCH_SIZE 4096
+typedef int (*iomap_punch_t)(struct inode *inode, loff_t offset, loff_t length);
/*
- * Structure allocated for each folio when block size < folio size
- * to track sub-folio uptodate status and I/O completions.
+ * Structure allocated for each folio to track per-block uptodate, dirty state
+ * and I/O completions.
*/
-struct iomap_page {
+struct iomap_folio_state {
atomic_t read_bytes_pending;
atomic_t write_bytes_pending;
- spinlock_t uptodate_lock;
- unsigned long uptodate[];
+ spinlock_t state_lock;
+
+ /*
+ * Each block has two bits in this bitmap:
+ * Bits [0..blocks_per_folio) has the uptodate status.
+ * Bits [b_p_f...(2*b_p_f)) has the dirty status.
+ */
+ unsigned long state[];
};
-static inline struct iomap_page *to_iomap_page(struct folio *folio)
+static struct bio_set iomap_ioend_bioset;
+
+static inline bool ifs_is_fully_uptodate(struct folio *folio,
+ struct iomap_folio_state *ifs)
{
- if (folio_test_private(folio))
- return folio_get_private(folio);
- return NULL;
+ struct inode *inode = folio->mapping->host;
+
+ return bitmap_full(ifs->state, i_blocks_per_folio(inode, folio));
}
-static struct bio_set iomap_ioend_bioset;
+static inline bool ifs_block_is_uptodate(struct iomap_folio_state *ifs,
+ unsigned int block)
+{
+ return test_bit(block, ifs->state);
+}
-static struct iomap_page *
-iomap_page_create(struct inode *inode, struct folio *folio, unsigned int flags)
+static void ifs_set_range_uptodate(struct folio *folio,
+ struct iomap_folio_state *ifs, size_t off, size_t len)
{
- struct iomap_page *iop = to_iomap_page(folio);
+ struct inode *inode = folio->mapping->host;
+ unsigned int first_blk = off >> inode->i_blkbits;
+ unsigned int last_blk = (off + len - 1) >> inode->i_blkbits;
+ unsigned int nr_blks = last_blk - first_blk + 1;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ifs->state_lock, flags);
+ bitmap_set(ifs->state, first_blk, nr_blks);
+ if (ifs_is_fully_uptodate(folio, ifs))
+ folio_mark_uptodate(folio);
+ spin_unlock_irqrestore(&ifs->state_lock, flags);
+}
+
+static void iomap_set_range_uptodate(struct folio *folio, size_t off,
+ size_t len)
+{
+ struct iomap_folio_state *ifs = folio->private;
+
+ if (ifs)
+ ifs_set_range_uptodate(folio, ifs, off, len);
+ else
+ folio_mark_uptodate(folio);
+}
+
+static inline bool ifs_block_is_dirty(struct folio *folio,
+ struct iomap_folio_state *ifs, int block)
+{
+ struct inode *inode = folio->mapping->host;
+ unsigned int blks_per_folio = i_blocks_per_folio(inode, folio);
+
+ return test_bit(block + blks_per_folio, ifs->state);
+}
+
+static void ifs_clear_range_dirty(struct folio *folio,
+ struct iomap_folio_state *ifs, size_t off, size_t len)
+{
+ struct inode *inode = folio->mapping->host;
+ unsigned int blks_per_folio = i_blocks_per_folio(inode, folio);
+ unsigned int first_blk = (off >> inode->i_blkbits);
+ unsigned int last_blk = (off + len - 1) >> inode->i_blkbits;
+ unsigned int nr_blks = last_blk - first_blk + 1;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ifs->state_lock, flags);
+ bitmap_clear(ifs->state, first_blk + blks_per_folio, nr_blks);
+ spin_unlock_irqrestore(&ifs->state_lock, flags);
+}
+
+static void iomap_clear_range_dirty(struct folio *folio, size_t off, size_t len)
+{
+ struct iomap_folio_state *ifs = folio->private;
+
+ if (ifs)
+ ifs_clear_range_dirty(folio, ifs, off, len);
+}
+
+static void ifs_set_range_dirty(struct folio *folio,
+ struct iomap_folio_state *ifs, size_t off, size_t len)
+{
+ struct inode *inode = folio->mapping->host;
+ unsigned int blks_per_folio = i_blocks_per_folio(inode, folio);
+ unsigned int first_blk = (off >> inode->i_blkbits);
+ unsigned int last_blk = (off + len - 1) >> inode->i_blkbits;
+ unsigned int nr_blks = last_blk - first_blk + 1;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ifs->state_lock, flags);
+ bitmap_set(ifs->state, first_blk + blks_per_folio, nr_blks);
+ spin_unlock_irqrestore(&ifs->state_lock, flags);
+}
+
+static void iomap_set_range_dirty(struct folio *folio, size_t off, size_t len)
+{
+ struct iomap_folio_state *ifs = folio->private;
+
+ if (ifs)
+ ifs_set_range_dirty(folio, ifs, off, len);
+}
+
+static struct iomap_folio_state *ifs_alloc(struct inode *inode,
+ struct folio *folio, unsigned int flags)
+{
+ struct iomap_folio_state *ifs = folio->private;
unsigned int nr_blocks = i_blocks_per_folio(inode, folio);
gfp_t gfp;
- if (iop || nr_blocks <= 1)
- return iop;
+ if (ifs || nr_blocks <= 1)
+ return ifs;
if (flags & IOMAP_NOWAIT)
gfp = GFP_NOWAIT;
else
gfp = GFP_NOFS | __GFP_NOFAIL;
- iop = kzalloc(struct_size(iop, uptodate, BITS_TO_LONGS(nr_blocks)),
- gfp);
- if (iop) {
- spin_lock_init(&iop->uptodate_lock);
- if (folio_test_uptodate(folio))
- bitmap_fill(iop->uptodate, nr_blocks);
- folio_attach_private(folio, iop);
- }
- return iop;
+ /*
+ * ifs->state tracks two sets of state flags when the
+ * filesystem block size is smaller than the folio size.
+ * The first state tracks per-block uptodate and the
+ * second tracks per-block dirty state.
+ */
+ ifs = kzalloc(struct_size(ifs, state,
+ BITS_TO_LONGS(2 * nr_blocks)), gfp);
+ if (!ifs)
+ return ifs;
+
+ spin_lock_init(&ifs->state_lock);
+ if (folio_test_uptodate(folio))
+ bitmap_set(ifs->state, 0, nr_blocks);
+ if (folio_test_dirty(folio))
+ bitmap_set(ifs->state, nr_blocks, nr_blocks);
+ folio_attach_private(folio, ifs);
+
+ return ifs;
}
-static void iomap_page_release(struct folio *folio)
+static void ifs_free(struct folio *folio)
{
- struct iomap_page *iop = folio_detach_private(folio);
- struct inode *inode = folio->mapping->host;
- unsigned int nr_blocks = i_blocks_per_folio(inode, folio);
+ struct iomap_folio_state *ifs = folio_detach_private(folio);
- if (!iop)
+ if (!ifs)
return;
- WARN_ON_ONCE(atomic_read(&iop->read_bytes_pending));
- WARN_ON_ONCE(atomic_read(&iop->write_bytes_pending));
- WARN_ON_ONCE(bitmap_full(iop->uptodate, nr_blocks) !=
+ WARN_ON_ONCE(atomic_read(&ifs->read_bytes_pending));
+ WARN_ON_ONCE(atomic_read(&ifs->write_bytes_pending));
+ WARN_ON_ONCE(ifs_is_fully_uptodate(folio, ifs) !=
folio_test_uptodate(folio));
- kfree(iop);
+ kfree(ifs);
}
/*
@@ -90,7 +194,7 @@ static void iomap_page_release(struct folio *folio)
static void iomap_adjust_read_range(struct inode *inode, struct folio *folio,
loff_t *pos, loff_t length, size_t *offp, size_t *lenp)
{
- struct iomap_page *iop = to_iomap_page(folio);
+ struct iomap_folio_state *ifs = folio->private;
loff_t orig_pos = *pos;
loff_t isize = i_size_read(inode);
unsigned block_bits = inode->i_blkbits;
@@ -105,12 +209,12 @@ static void iomap_adjust_read_range(struct inode *inode, struct folio *folio,
* per-block uptodate status and adjust the offset and length if needed
* to avoid reading in already uptodate ranges.
*/
- if (iop) {
+ if (ifs) {
unsigned int i;
/* move forward for each leading block marked uptodate */
for (i = first; i <= last; i++) {
- if (!test_bit(i, iop->uptodate))
+ if (!ifs_block_is_uptodate(ifs, i))
break;
*pos += block_size;
poff += block_size;
@@ -120,7 +224,7 @@ static void iomap_adjust_read_range(struct inode *inode, struct folio *folio,
/* truncate len if we find any trailing uptodate block(s) */
for ( ; i <= last; i++) {
- if (test_bit(i, iop->uptodate)) {
+ if (ifs_block_is_uptodate(ifs, i)) {
plen -= (last - i + 1) * block_size;
last = i - 1;
break;
@@ -144,43 +248,19 @@ static void iomap_adjust_read_range(struct inode *inode, struct folio *folio,
*lenp = plen;
}
-static void iomap_iop_set_range_uptodate(struct folio *folio,
- struct iomap_page *iop, size_t off, size_t len)
-{
- struct inode *inode = folio->mapping->host;
- unsigned first = off >> inode->i_blkbits;
- unsigned last = (off + len - 1) >> inode->i_blkbits;
- unsigned long flags;
-
- spin_lock_irqsave(&iop->uptodate_lock, flags);
- bitmap_set(iop->uptodate, first, last - first + 1);
- if (bitmap_full(iop->uptodate, i_blocks_per_folio(inode, folio)))
- folio_mark_uptodate(folio);
- spin_unlock_irqrestore(&iop->uptodate_lock, flags);
-}
-
-static void iomap_set_range_uptodate(struct folio *folio,
- struct iomap_page *iop, size_t off, size_t len)
-{
- if (iop)
- iomap_iop_set_range_uptodate(folio, iop, off, len);
- else
- folio_mark_uptodate(folio);
-}
-
static void iomap_finish_folio_read(struct folio *folio, size_t offset,
size_t len, int error)
{
- struct iomap_page *iop = to_iomap_page(folio);
+ struct iomap_folio_state *ifs = folio->private;
if (unlikely(error)) {
folio_clear_uptodate(folio);
folio_set_error(folio);
} else {
- iomap_set_range_uptodate(folio, iop, offset, len);
+ iomap_set_range_uptodate(folio, offset, len);
}
- if (!iop || atomic_sub_and_test(len, &iop->read_bytes_pending))
+ if (!ifs || atomic_sub_and_test(len, &ifs->read_bytes_pending))
folio_unlock(folio);
}
@@ -213,7 +293,6 @@ struct iomap_readpage_ctx {
static int iomap_read_inline_data(const struct iomap_iter *iter,
struct folio *folio)
{
- struct iomap_page *iop;
const struct iomap *iomap = iomap_iter_srcmap(iter);
size_t size = i_size_read(iter->inode) - iomap->offset;
size_t poff = offset_in_page(iomap->offset);
@@ -231,15 +310,13 @@ static int iomap_read_inline_data(const struct iomap_iter *iter,
if (WARN_ON_ONCE(size > iomap->length))
return -EIO;
if (offset > 0)
- iop = iomap_page_create(iter->inode, folio, iter->flags);
- else
- iop = to_iomap_page(folio);
+ ifs_alloc(iter->inode, folio, iter->flags);
addr = kmap_local_folio(folio, offset);
memcpy(addr, iomap->inline_data, size);
memset(addr + size, 0, PAGE_SIZE - poff - size);
kunmap_local(addr);
- iomap_set_range_uptodate(folio, iop, offset, PAGE_SIZE - poff);
+ iomap_set_range_uptodate(folio, offset, PAGE_SIZE - poff);
return 0;
}
@@ -260,7 +337,7 @@ static loff_t iomap_readpage_iter(const struct iomap_iter *iter,
loff_t pos = iter->pos + offset;
loff_t length = iomap_length(iter) - offset;
struct folio *folio = ctx->cur_folio;
- struct iomap_page *iop;
+ struct iomap_folio_state *ifs;
loff_t orig_pos = pos;
size_t poff, plen;
sector_t sector;
@@ -269,20 +346,20 @@ static loff_t iomap_readpage_iter(const struct iomap_iter *iter,
return iomap_read_inline_data(iter, folio);
/* zero post-eof blocks as the page may be mapped */
- iop = iomap_page_create(iter->inode, folio, iter->flags);
+ ifs = ifs_alloc(iter->inode, folio, iter->flags);
iomap_adjust_read_range(iter->inode, folio, &pos, length, &poff, &plen);
if (plen == 0)
goto done;
if (iomap_block_needs_zeroing(iter, pos)) {
folio_zero_range(folio, poff, plen);
- iomap_set_range_uptodate(folio, iop, poff, plen);
+ iomap_set_range_uptodate(folio, poff, plen);
goto done;
}
ctx->cur_folio_in_bio = true;
- if (iop)
- atomic_add(plen, &iop->read_bytes_pending);
+ if (ifs)
+ atomic_add(plen, &ifs->read_bytes_pending);
sector = iomap_sector(iomap, pos);
if (!ctx->bio ||
@@ -436,11 +513,11 @@ EXPORT_SYMBOL_GPL(iomap_readahead);
*/
bool iomap_is_partially_uptodate(struct folio *folio, size_t from, size_t count)
{
- struct iomap_page *iop = to_iomap_page(folio);
+ struct iomap_folio_state *ifs = folio->private;
struct inode *inode = folio->mapping->host;
unsigned first, last, i;
- if (!iop)
+ if (!ifs)
return false;
/* Caller's range may extend past the end of this folio */
@@ -451,7 +528,7 @@ bool iomap_is_partially_uptodate(struct folio *folio, size_t from, size_t count)
last = (from + count - 1) >> inode->i_blkbits;
for (i = first; i <= last; i++)
- if (!test_bit(i, iop->uptodate))
+ if (!ifs_block_is_uptodate(ifs, i))
return false;
return true;
}
@@ -461,16 +538,18 @@ EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate);
* iomap_get_folio - get a folio reference for writing
* @iter: iteration structure
* @pos: start offset of write
+ * @len: Suggested size of folio to create.
*
* Returns a locked reference to the folio at @pos, or an error pointer if the
* folio could not be obtained.
*/
-struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos)
+struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos, size_t len)
{
- unsigned fgp = FGP_WRITEBEGIN | FGP_NOFS;
+ fgf_t fgp = FGP_WRITEBEGIN | FGP_NOFS;
if (iter->flags & IOMAP_NOWAIT)
fgp |= FGP_NOWAIT;
+ fgp |= fgf_set_order(len);
return __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT,
fgp, mapping_gfp_mask(iter->inode->i_mapping));
@@ -483,14 +562,13 @@ bool iomap_release_folio(struct folio *folio, gfp_t gfp_flags)
folio_size(folio));
/*
- * mm accommodates an old ext3 case where clean folios might
- * not have had the dirty bit cleared. Thus, it can send actual
- * dirty folios to ->release_folio() via shrink_active_list();
- * skip those here.
+ * If the folio is dirty, we refuse to release our metadata because
+ * it may be partially dirty. Once we track per-block dirty state,
+ * we can release the metadata if every block is dirty.
*/
- if (folio_test_dirty(folio) || folio_test_writeback(folio))
+ if (folio_test_dirty(folio))
return false;
- iomap_page_release(folio);
+ ifs_free(folio);
return true;
}
EXPORT_SYMBOL_GPL(iomap_release_folio);
@@ -507,16 +585,22 @@ void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len)
if (offset == 0 && len == folio_size(folio)) {
WARN_ON_ONCE(folio_test_writeback(folio));
folio_cancel_dirty(folio);
- iomap_page_release(folio);
- } else if (folio_test_large(folio)) {
- /* Must release the iop so the page can be split */
- WARN_ON_ONCE(!folio_test_uptodate(folio) &&
- folio_test_dirty(folio));
- iomap_page_release(folio);
+ ifs_free(folio);
}
}
EXPORT_SYMBOL_GPL(iomap_invalidate_folio);
+bool iomap_dirty_folio(struct address_space *mapping, struct folio *folio)
+{
+ struct inode *inode = mapping->host;
+ size_t len = folio_size(folio);
+
+ ifs_alloc(inode, folio, 0);
+ iomap_set_range_dirty(folio, 0, len);
+ return filemap_dirty_folio(mapping, folio);
+}
+EXPORT_SYMBOL_GPL(iomap_dirty_folio);
+
static void
iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
{
@@ -547,7 +631,7 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
size_t len, struct folio *folio)
{
const struct iomap *srcmap = iomap_iter_srcmap(iter);
- struct iomap_page *iop;
+ struct iomap_folio_state *ifs;
loff_t block_size = i_blocksize(iter->inode);
loff_t block_start = round_down(pos, block_size);
loff_t block_end = round_up(pos + len, block_size);
@@ -555,14 +639,25 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
size_t from = offset_in_folio(folio, pos), to = from + len;
size_t poff, plen;
- if (folio_test_uptodate(folio))
+ /*
+ * If the write or zeroing completely overlaps the current folio, then
+ * entire folio will be dirtied so there is no need for
+ * per-block state tracking structures to be attached to this folio.
+ * For the unshare case, we must read in the ondisk contents because we
+ * are not changing pagecache contents.
+ */
+ if (!(iter->flags & IOMAP_UNSHARE) && pos <= folio_pos(folio) &&
+ pos + len >= folio_pos(folio) + folio_size(folio))
return 0;
- folio_clear_error(folio);
- iop = iomap_page_create(iter->inode, folio, iter->flags);
- if ((iter->flags & IOMAP_NOWAIT) && !iop && nr_blocks > 1)
+ ifs = ifs_alloc(iter->inode, folio, iter->flags);
+ if ((iter->flags & IOMAP_NOWAIT) && !ifs && nr_blocks > 1)
return -EAGAIN;
+ if (folio_test_uptodate(folio))
+ return 0;
+ folio_clear_error(folio);
+
do {
iomap_adjust_read_range(iter->inode, folio, &block_start,
block_end - block_start, &poff, &plen);
@@ -589,7 +684,7 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
if (status)
return status;
}
- iomap_set_range_uptodate(folio, iop, poff, plen);
+ iomap_set_range_uptodate(folio, poff, plen);
} while ((block_start += plen) < block_end);
return 0;
@@ -603,7 +698,7 @@ static struct folio *__iomap_get_folio(struct iomap_iter *iter, loff_t pos,
if (folio_ops && folio_ops->get_folio)
return folio_ops->get_folio(iter, pos, len);
else
- return iomap_get_folio(iter, pos);
+ return iomap_get_folio(iter, pos, len);
}
static void __iomap_put_folio(struct iomap_iter *iter, loff_t pos, size_t ret,
@@ -696,7 +791,6 @@ out_unlock:
static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len,
size_t copied, struct folio *folio)
{
- struct iomap_page *iop = to_iomap_page(folio);
flush_dcache_folio(folio);
/*
@@ -712,7 +806,8 @@ static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len,
*/
if (unlikely(copied < len && !folio_test_uptodate(folio)))
return 0;
- iomap_set_range_uptodate(folio, iop, offset_in_folio(folio, pos), len);
+ iomap_set_range_uptodate(folio, offset_in_folio(folio, pos), len);
+ iomap_set_range_dirty(folio, offset_in_folio(folio, pos), copied);
filemap_dirty_folio(inode->i_mapping, folio);
return copied;
}
@@ -773,6 +868,7 @@ static size_t iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len,
static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
{
loff_t length = iomap_length(iter);
+ size_t chunk = PAGE_SIZE << MAX_PAGECACHE_ORDER;
loff_t pos = iter->pos;
ssize_t written = 0;
long status = 0;
@@ -781,15 +877,14 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
do {
struct folio *folio;
- struct page *page;
- unsigned long offset; /* Offset into pagecache page */
- unsigned long bytes; /* Bytes to write to page */
+ size_t offset; /* Offset into folio */
+ size_t bytes; /* Bytes to write to folio */
size_t copied; /* Bytes copied from user */
- offset = offset_in_page(pos);
- bytes = min_t(unsigned long, PAGE_SIZE - offset,
- iov_iter_count(i));
-again:
+ bytes = iov_iter_count(i);
+retry:
+ offset = pos & (chunk - 1);
+ bytes = min(chunk - offset, bytes);
status = balance_dirty_pages_ratelimited_flags(mapping,
bdp_flags);
if (unlikely(status))
@@ -819,12 +914,14 @@ again:
if (iter->iomap.flags & IOMAP_F_STALE)
break;
- page = folio_file_page(folio, pos >> PAGE_SHIFT);
- if (mapping_writably_mapped(mapping))
- flush_dcache_page(page);
+ offset = offset_in_folio(folio, pos);
+ if (bytes > folio_size(folio) - offset)
+ bytes = folio_size(folio) - offset;
- copied = copy_page_from_iter_atomic(page, offset, bytes, i);
+ if (mapping_writably_mapped(mapping))
+ flush_dcache_folio(folio);
+ copied = copy_folio_from_iter_atomic(folio, offset, bytes, i);
status = iomap_write_end(iter, pos, bytes, copied, folio);
if (unlikely(copied != status))
@@ -838,13 +935,17 @@ again:
* halfway through, might be a race with munmap,
* might be severe memory pressure.
*/
- if (copied)
+ if (chunk > PAGE_SIZE)
+ chunk /= 2;
+ if (copied) {
bytes = copied;
- goto again;
+ goto retry;
+ }
+ } else {
+ pos += status;
+ written += status;
+ length -= status;
}
- pos += status;
- written += status;
- length -= status;
} while (iov_iter_count(i) && length);
if (status == -EAGAIN) {
@@ -880,9 +981,79 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i,
}
EXPORT_SYMBOL_GPL(iomap_file_buffered_write);
+static int iomap_write_delalloc_ifs_punch(struct inode *inode,
+ struct folio *folio, loff_t start_byte, loff_t end_byte,
+ iomap_punch_t punch)
+{
+ unsigned int first_blk, last_blk, i;
+ loff_t last_byte;
+ u8 blkbits = inode->i_blkbits;
+ struct iomap_folio_state *ifs;
+ int ret = 0;
+
+ /*
+ * When we have per-block dirty tracking, there can be
+ * blocks within a folio which are marked uptodate
+ * but not dirty. In that case it is necessary to punch
+ * out such blocks to avoid leaking any delalloc blocks.
+ */
+ ifs = folio->private;
+ if (!ifs)
+ return ret;
+
+ last_byte = min_t(loff_t, end_byte - 1,
+ folio_pos(folio) + folio_size(folio) - 1);
+ first_blk = offset_in_folio(folio, start_byte) >> blkbits;
+ last_blk = offset_in_folio(folio, last_byte) >> blkbits;
+ for (i = first_blk; i <= last_blk; i++) {
+ if (!ifs_block_is_dirty(folio, ifs, i)) {
+ ret = punch(inode, folio_pos(folio) + (i << blkbits),
+ 1 << blkbits);
+ if (ret)
+ return ret;
+ }
+ }
+
+ return ret;
+}
+
+
+static int iomap_write_delalloc_punch(struct inode *inode, struct folio *folio,
+ loff_t *punch_start_byte, loff_t start_byte, loff_t end_byte,
+ iomap_punch_t punch)
+{
+ int ret = 0;
+
+ if (!folio_test_dirty(folio))
+ return ret;
+
+ /* if dirty, punch up to offset */
+ if (start_byte > *punch_start_byte) {
+ ret = punch(inode, *punch_start_byte,
+ start_byte - *punch_start_byte);
+ if (ret)
+ return ret;
+ }
+
+ /* Punch non-dirty blocks within folio */
+ ret = iomap_write_delalloc_ifs_punch(inode, folio, start_byte,
+ end_byte, punch);
+ if (ret)
+ return ret;
+
+ /*
+ * Make sure the next punch start is correctly bound to
+ * the end of this data range, not the end of the folio.
+ */
+ *punch_start_byte = min_t(loff_t, end_byte,
+ folio_pos(folio) + folio_size(folio));
+
+ return ret;
+}
+
/*
* Scan the data range passed to us for dirty page cache folios. If we find a
- * dirty folio, punch out the preceeding range and update the offset from which
+ * dirty folio, punch out the preceding range and update the offset from which
* the next punch will start from.
*
* We can punch out storage reservations under clean pages because they either
@@ -899,10 +1070,11 @@ EXPORT_SYMBOL_GPL(iomap_file_buffered_write);
*/
static int iomap_write_delalloc_scan(struct inode *inode,
loff_t *punch_start_byte, loff_t start_byte, loff_t end_byte,
- int (*punch)(struct inode *inode, loff_t offset, loff_t length))
+ iomap_punch_t punch)
{
while (start_byte < end_byte) {
struct folio *folio;
+ int ret;
/* grab locked page */
folio = filemap_lock_folio(inode->i_mapping,
@@ -913,26 +1085,12 @@ static int iomap_write_delalloc_scan(struct inode *inode,
continue;
}
- /* if dirty, punch up to offset */
- if (folio_test_dirty(folio)) {
- if (start_byte > *punch_start_byte) {
- int error;
-
- error = punch(inode, *punch_start_byte,
- start_byte - *punch_start_byte);
- if (error) {
- folio_unlock(folio);
- folio_put(folio);
- return error;
- }
- }
-
- /*
- * Make sure the next punch start is correctly bound to
- * the end of this data range, not the end of the folio.
- */
- *punch_start_byte = min_t(loff_t, end_byte,
- folio_next_index(folio) << PAGE_SHIFT);
+ ret = iomap_write_delalloc_punch(inode, folio, punch_start_byte,
+ start_byte, end_byte, punch);
+ if (ret) {
+ folio_unlock(folio);
+ folio_put(folio);
+ return ret;
}
/* move offset to start of next folio in range */
@@ -977,8 +1135,7 @@ static int iomap_write_delalloc_scan(struct inode *inode,
* the code to subtle off-by-one bugs....
*/
static int iomap_write_delalloc_release(struct inode *inode,
- loff_t start_byte, loff_t end_byte,
- int (*punch)(struct inode *inode, loff_t pos, loff_t length))
+ loff_t start_byte, loff_t end_byte, iomap_punch_t punch)
{
loff_t punch_start_byte = start_byte;
loff_t scan_end_byte = min(i_size_read(inode), end_byte);
@@ -1071,8 +1228,7 @@ out_unlock:
*/
int iomap_file_buffered_write_punch_delalloc(struct inode *inode,
struct iomap *iomap, loff_t pos, loff_t length,
- ssize_t written,
- int (*punch)(struct inode *inode, loff_t pos, loff_t length))
+ ssize_t written, iomap_punch_t punch)
{
loff_t start_byte;
loff_t end_byte;
@@ -1111,7 +1267,6 @@ static loff_t iomap_unshare_iter(struct iomap_iter *iter)
const struct iomap *srcmap = iomap_iter_srcmap(iter);
loff_t pos = iter->pos;
loff_t length = iomap_length(iter);
- long status = 0;
loff_t written = 0;
/* don't bother with blocks that are not shared to start with */
@@ -1122,28 +1277,33 @@ static loff_t iomap_unshare_iter(struct iomap_iter *iter)
return length;
do {
- unsigned long offset = offset_in_page(pos);
- unsigned long bytes = min_t(loff_t, PAGE_SIZE - offset, length);
struct folio *folio;
+ int status;
+ size_t offset;
+ size_t bytes = min_t(u64, SIZE_MAX, length);
status = iomap_write_begin(iter, pos, bytes, &folio);
if (unlikely(status))
return status;
- if (iter->iomap.flags & IOMAP_F_STALE)
+ if (iomap->flags & IOMAP_F_STALE)
break;
- status = iomap_write_end(iter, pos, bytes, bytes, folio);
- if (WARN_ON_ONCE(status == 0))
+ offset = offset_in_folio(folio, pos);
+ if (bytes > folio_size(folio) - offset)
+ bytes = folio_size(folio) - offset;
+
+ bytes = iomap_write_end(iter, pos, bytes, bytes, folio);
+ if (WARN_ON_ONCE(bytes == 0))
return -EIO;
cond_resched();
- pos += status;
- written += status;
- length -= status;
+ pos += bytes;
+ written += bytes;
+ length -= bytes;
balance_dirty_pages_ratelimited(iter->inode->i_mapping);
- } while (length);
+ } while (length > 0);
return written;
}
@@ -1286,24 +1446,24 @@ vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops)
return VM_FAULT_LOCKED;
out_unlock:
folio_unlock(folio);
- return block_page_mkwrite_return(ret);
+ return vmf_fs_error(ret);
}
EXPORT_SYMBOL_GPL(iomap_page_mkwrite);
static void iomap_finish_folio_write(struct inode *inode, struct folio *folio,
size_t len, int error)
{
- struct iomap_page *iop = to_iomap_page(folio);
+ struct iomap_folio_state *ifs = folio->private;
if (error) {
folio_set_error(folio);
mapping_set_error(inode->i_mapping, error);
}
- WARN_ON_ONCE(i_blocks_per_folio(inode, folio) > 1 && !iop);
- WARN_ON_ONCE(iop && atomic_read(&iop->write_bytes_pending) <= 0);
+ WARN_ON_ONCE(i_blocks_per_folio(inode, folio) > 1 && !ifs);
+ WARN_ON_ONCE(ifs && atomic_read(&ifs->write_bytes_pending) <= 0);
- if (!iop || atomic_sub_and_test(len, &iop->write_bytes_pending))
+ if (!ifs || atomic_sub_and_test(len, &ifs->write_bytes_pending))
folio_end_writeback(folio);
}
@@ -1570,7 +1730,7 @@ iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t offset,
*/
static void
iomap_add_to_ioend(struct inode *inode, loff_t pos, struct folio *folio,
- struct iomap_page *iop, struct iomap_writepage_ctx *wpc,
+ struct iomap_folio_state *ifs, struct iomap_writepage_ctx *wpc,
struct writeback_control *wbc, struct list_head *iolist)
{
sector_t sector = iomap_sector(&wpc->iomap, pos);
@@ -1588,8 +1748,8 @@ iomap_add_to_ioend(struct inode *inode, loff_t pos, struct folio *folio,
bio_add_folio_nofail(wpc->ioend->io_bio, folio, len, poff);
}
- if (iop)
- atomic_add(len, &iop->write_bytes_pending);
+ if (ifs)
+ atomic_add(len, &ifs->write_bytes_pending);
wpc->ioend->io_size += len;
wbc_account_cgroup_owner(wbc, &folio->page, len);
}
@@ -1615,7 +1775,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
struct writeback_control *wbc, struct inode *inode,
struct folio *folio, u64 end_pos)
{
- struct iomap_page *iop = iomap_page_create(inode, folio, 0);
+ struct iomap_folio_state *ifs = folio->private;
struct iomap_ioend *ioend, *next;
unsigned len = i_blocksize(inode);
unsigned nblocks = i_blocks_per_folio(inode, folio);
@@ -1623,7 +1783,14 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
int error = 0, count = 0, i;
LIST_HEAD(submit_list);
- WARN_ON_ONCE(iop && atomic_read(&iop->write_bytes_pending) != 0);
+ WARN_ON_ONCE(end_pos <= pos);
+
+ if (!ifs && nblocks > 1) {
+ ifs = ifs_alloc(inode, folio, 0);
+ iomap_set_range_dirty(folio, 0, end_pos - pos);
+ }
+
+ WARN_ON_ONCE(ifs && atomic_read(&ifs->write_bytes_pending) != 0);
/*
* Walk through the folio to find areas to write back. If we
@@ -1631,7 +1798,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
* invalid, grab a new one.
*/
for (i = 0; i < nblocks && pos < end_pos; i++, pos += len) {
- if (iop && !test_bit(i, iop->uptodate))
+ if (ifs && !ifs_block_is_dirty(folio, ifs, i))
continue;
error = wpc->ops->map_blocks(wpc, inode, pos);
@@ -1642,7 +1809,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
continue;
if (wpc->iomap.type == IOMAP_HOLE)
continue;
- iomap_add_to_ioend(inode, pos, folio, iop, wpc, wbc,
+ iomap_add_to_ioend(inode, pos, folio, ifs, wpc, wbc,
&submit_list);
count++;
}
@@ -1675,6 +1842,12 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
}
}
+ /*
+ * We can have dirty bits set past end of file in page_mkwrite path
+ * while mapping the last partial folio. Hence it's better to clear
+ * all the dirty bits in the folio here.
+ */
+ iomap_clear_range_dirty(folio, 0, folio_size(folio));
folio_start_writeback(folio);
folio_unlock(folio);
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index ea3b868c8355..bcd3f8cf5ea4 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -20,10 +20,12 @@
* Private flags for iomap_dio, must not overlap with the public ones in
* iomap.h:
*/
-#define IOMAP_DIO_WRITE_FUA (1 << 28)
-#define IOMAP_DIO_NEED_SYNC (1 << 29)
-#define IOMAP_DIO_WRITE (1 << 30)
-#define IOMAP_DIO_DIRTY (1 << 31)
+#define IOMAP_DIO_CALLER_COMP (1U << 26)
+#define IOMAP_DIO_INLINE_COMP (1U << 27)
+#define IOMAP_DIO_WRITE_THROUGH (1U << 28)
+#define IOMAP_DIO_NEED_SYNC (1U << 29)
+#define IOMAP_DIO_WRITE (1U << 30)
+#define IOMAP_DIO_DIRTY (1U << 31)
struct iomap_dio {
struct kiocb *iocb;
@@ -41,7 +43,6 @@ struct iomap_dio {
struct {
struct iov_iter *iter;
struct task_struct *waiter;
- struct bio *poll_bio;
} submit;
/* used for aio completion: */
@@ -63,12 +64,14 @@ static struct bio *iomap_dio_alloc_bio(const struct iomap_iter *iter,
static void iomap_dio_submit_bio(const struct iomap_iter *iter,
struct iomap_dio *dio, struct bio *bio, loff_t pos)
{
+ struct kiocb *iocb = dio->iocb;
+
atomic_inc(&dio->ref);
/* Sync dio can't be polled reliably */
- if ((dio->iocb->ki_flags & IOCB_HIPRI) && !is_sync_kiocb(dio->iocb)) {
- bio_set_polled(bio, dio->iocb);
- dio->submit.poll_bio = bio;
+ if ((iocb->ki_flags & IOCB_HIPRI) && !is_sync_kiocb(iocb)) {
+ bio_set_polled(bio, iocb);
+ WRITE_ONCE(iocb->private, bio);
}
if (dio->dops && dio->dops->submit_io)
@@ -130,6 +133,11 @@ ssize_t iomap_dio_complete(struct iomap_dio *dio)
}
EXPORT_SYMBOL_GPL(iomap_dio_complete);
+static ssize_t iomap_dio_deferred_complete(void *data)
+{
+ return iomap_dio_complete(data);
+}
+
static void iomap_dio_complete_work(struct work_struct *work)
{
struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work);
@@ -152,27 +160,69 @@ void iomap_dio_bio_end_io(struct bio *bio)
{
struct iomap_dio *dio = bio->bi_private;
bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);
+ struct kiocb *iocb = dio->iocb;
if (bio->bi_status)
iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status));
+ if (!atomic_dec_and_test(&dio->ref))
+ goto release_bio;
- if (atomic_dec_and_test(&dio->ref)) {
- if (dio->wait_for_completion) {
- struct task_struct *waiter = dio->submit.waiter;
- WRITE_ONCE(dio->submit.waiter, NULL);
- blk_wake_io_task(waiter);
- } else if (dio->flags & IOMAP_DIO_WRITE) {
- struct inode *inode = file_inode(dio->iocb->ki_filp);
-
- WRITE_ONCE(dio->iocb->private, NULL);
- INIT_WORK(&dio->aio.work, iomap_dio_complete_work);
- queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work);
- } else {
- WRITE_ONCE(dio->iocb->private, NULL);
- iomap_dio_complete_work(&dio->aio.work);
- }
+ /*
+ * Synchronous dio, task itself will handle any completion work
+ * that needs after IO. All we need to do is wake the task.
+ */
+ if (dio->wait_for_completion) {
+ struct task_struct *waiter = dio->submit.waiter;
+
+ WRITE_ONCE(dio->submit.waiter, NULL);
+ blk_wake_io_task(waiter);
+ goto release_bio;
+ }
+
+ /*
+ * Flagged with IOMAP_DIO_INLINE_COMP, we can complete it inline
+ */
+ if (dio->flags & IOMAP_DIO_INLINE_COMP) {
+ WRITE_ONCE(iocb->private, NULL);
+ iomap_dio_complete_work(&dio->aio.work);
+ goto release_bio;
+ }
+
+ /*
+ * If this dio is flagged with IOMAP_DIO_CALLER_COMP, then schedule
+ * our completion that way to avoid an async punt to a workqueue.
+ */
+ if (dio->flags & IOMAP_DIO_CALLER_COMP) {
+ /* only polled IO cares about private cleared */
+ iocb->private = dio;
+ iocb->dio_complete = iomap_dio_deferred_complete;
+
+ /*
+ * Invoke ->ki_complete() directly. We've assigned our
+ * dio_complete callback handler, and since the issuer set
+ * IOCB_DIO_CALLER_COMP, we know their ki_complete handler will
+ * notice ->dio_complete being set and will defer calling that
+ * handler until it can be done from a safe task context.
+ *
+ * Note that the 'res' being passed in here is not important
+ * for this case. The actual completion value of the request
+ * will be gotten from dio_complete when that is run by the
+ * issuer.
+ */
+ iocb->ki_complete(iocb, 0);
+ goto release_bio;
}
+ /*
+ * Async DIO completion that requires filesystem level completion work
+ * gets punted to a work queue to complete as the operation may require
+ * more IO to be issued to finalise filesystem metadata changes or
+ * guarantee data integrity.
+ */
+ INIT_WORK(&dio->aio.work, iomap_dio_complete_work);
+ queue_work(file_inode(iocb->ki_filp)->i_sb->s_dio_done_wq,
+ &dio->aio.work);
+release_bio:
if (should_dirty) {
bio_check_pages_dirty(bio);
} else {
@@ -203,7 +253,7 @@ static void iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
/*
* Figure out the bio's operation flags from the dio request, the
* mapping, and whether or not we want FUA. Note that we can end up
- * clearing the WRITE_FUA flag in the dio request.
+ * clearing the WRITE_THROUGH flag in the dio request.
*/
static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
const struct iomap *iomap, bool use_fua)
@@ -217,7 +267,7 @@ static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
if (use_fua)
opflags |= REQ_FUA;
else
- dio->flags &= ~IOMAP_DIO_WRITE_FUA;
+ dio->flags &= ~IOMAP_DIO_WRITE_THROUGH;
return opflags;
}
@@ -257,12 +307,19 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
* Use a FUA write if we need datasync semantics, this is a pure
* data IO that doesn't require any metadata updates (including
* after IO completion such as unwritten extent conversion) and
- * the underlying device supports FUA. This allows us to avoid
- * cache flushes on IO completion.
+ * the underlying device either supports FUA or doesn't have
+ * a volatile write cache. This allows us to avoid cache flushes
+ * on IO completion. If we can't use writethrough and need to
+ * sync, disable in-task completions as dio completion will
+ * need to call generic_write_sync() which will do a blocking
+ * fsync / cache flush call.
*/
if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) &&
- (dio->flags & IOMAP_DIO_WRITE_FUA) && bdev_fua(iomap->bdev))
+ (dio->flags & IOMAP_DIO_WRITE_THROUGH) &&
+ (bdev_fua(iomap->bdev) || !bdev_write_cache(iomap->bdev)))
use_fua = true;
+ else if (dio->flags & IOMAP_DIO_NEED_SYNC)
+ dio->flags &= ~IOMAP_DIO_CALLER_COMP;
}
/*
@@ -277,10 +334,23 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
goto out;
/*
- * We can only poll for single bio I/Os.
+ * We can only do deferred completion for pure overwrites that
+ * don't require additional IO at completion. This rules out
+ * writes that need zeroing or extent conversion, extend
+ * the file size, or issue journal IO or cache flushes
+ * during completion processing.
*/
if (need_zeroout ||
+ ((dio->flags & IOMAP_DIO_NEED_SYNC) && !use_fua) ||
((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode)))
+ dio->flags &= ~IOMAP_DIO_CALLER_COMP;
+
+ /*
+ * The rules for polled IO completions follow the guidelines as the
+ * ones we set for inline and deferred completions. If none of those
+ * are available for this IO, clear the polled flag.
+ */
+ if (!(dio->flags & (IOMAP_DIO_INLINE_COMP|IOMAP_DIO_CALLER_COMP)))
dio->iocb->ki_flags &= ~IOCB_HIPRI;
if (need_zeroout) {
@@ -505,12 +575,14 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
dio->submit.iter = iter;
dio->submit.waiter = current;
- dio->submit.poll_bio = NULL;
if (iocb->ki_flags & IOCB_NOWAIT)
iomi.flags |= IOMAP_NOWAIT;
if (iov_iter_rw(iter) == READ) {
+ /* reads can always complete inline */
+ dio->flags |= IOMAP_DIO_INLINE_COMP;
+
if (iomi.pos >= dio->i_size)
goto out_free_dio;
@@ -524,6 +596,15 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
iomi.flags |= IOMAP_WRITE;
dio->flags |= IOMAP_DIO_WRITE;
+ /*
+ * Flag as supporting deferred completions, if the issuer
+ * groks it. This can avoid a workqueue punt for writes.
+ * We may later clear this flag if we need to do other IO
+ * as part of this IO completion.
+ */
+ if (iocb->ki_flags & IOCB_DIO_CALLER_COMP)
+ dio->flags |= IOMAP_DIO_CALLER_COMP;
+
if (dio_flags & IOMAP_DIO_OVERWRITE_ONLY) {
ret = -EAGAIN;
if (iomi.pos >= dio->i_size ||
@@ -537,13 +618,16 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
dio->flags |= IOMAP_DIO_NEED_SYNC;
/*
- * For datasync only writes, we optimistically try
- * using FUA for this IO. Any non-FUA write that
- * occurs will clear this flag, hence we know before
- * completion whether a cache flush is necessary.
+ * For datasync only writes, we optimistically try using
+ * WRITE_THROUGH for this IO. This flag requires either
+ * FUA writes through the device's write cache, or a
+ * normal write to a device without a volatile write
+ * cache. For the former, Any non-FUA write that occurs
+ * will clear this flag, hence we know before completion
+ * whether a cache flush is necessary.
*/
if (!(iocb->ki_flags & IOCB_SYNC))
- dio->flags |= IOMAP_DIO_WRITE_FUA;
+ dio->flags |= IOMAP_DIO_WRITE_THROUGH;
}
/*
@@ -605,14 +689,13 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
iomap_dio_set_error(dio, ret);
/*
- * If all the writes we issued were FUA, we don't need to flush the
- * cache on IO completion. Clear the sync flag for this case.
+ * If all the writes we issued were already written through to the
+ * media, we don't need to flush the cache on IO completion. Clear the
+ * sync flag for this case.
*/
- if (dio->flags & IOMAP_DIO_WRITE_FUA)
+ if (dio->flags & IOMAP_DIO_WRITE_THROUGH)
dio->flags &= ~IOMAP_DIO_NEED_SYNC;
- WRITE_ONCE(iocb->private, dio->submit.poll_bio);
-
/*
* We are about to drop our additional submission reference, which
* might be the last reference to the dio. There are three different
diff --git a/fs/isofs/Kconfig b/fs/isofs/Kconfig
index 08ffd37b9bb8..51434f2a471b 100644
--- a/fs/isofs/Kconfig
+++ b/fs/isofs/Kconfig
@@ -1,6 +1,7 @@
# SPDX-License-Identifier: GPL-2.0-only
config ISO9660_FS
tristate "ISO 9660 CDROM file system support"
+ select BUFFER_HEAD
help
This is the standard file system used on CD-ROMs. It was previously
known as "High Sierra File System" and is called "hsfs" on other
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index df9d70588b60..2ee21286ac8f 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -1422,13 +1422,8 @@ static int isofs_read_inode(struct inode *inode, int relocated)
inode->i_ino, de->flags[-high_sierra]);
}
#endif
-
- inode->i_mtime.tv_sec =
- inode->i_atime.tv_sec =
- inode->i_ctime.tv_sec = iso_date(de->date, high_sierra);
- inode->i_mtime.tv_nsec =
- inode->i_atime.tv_nsec =
- inode->i_ctime.tv_nsec = 0;
+ inode->i_mtime = inode->i_atime =
+ inode_set_ctime(inode, iso_date(de->date, high_sierra), 0);
ei->i_first_extent = (isonum_733(de->extent) +
isonum_711(de->ext_attr_length));
diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c
index 48f58c6c9e69..348783a70f57 100644
--- a/fs/isofs/rock.c
+++ b/fs/isofs/rock.c
@@ -421,10 +421,9 @@ repeat:
/* Rock ridge never appears on a High Sierra disk */
cnt = 0;
if (rr->u.TF.flags & TF_CREATE) {
- inode->i_ctime.tv_sec =
- iso_date(rr->u.TF.times[cnt++].time,
- 0);
- inode->i_ctime.tv_nsec = 0;
+ inode_set_ctime(inode,
+ iso_date(rr->u.TF.times[cnt++].time, 0),
+ 0);
}
if (rr->u.TF.flags & TF_MODIFY) {
inode->i_mtime.tv_sec =
@@ -439,10 +438,9 @@ repeat:
inode->i_atime.tv_nsec = 0;
}
if (rr->u.TF.flags & TF_ATTRIBUTES) {
- inode->i_ctime.tv_sec =
- iso_date(rr->u.TF.times[cnt++].time,
- 0);
- inode->i_ctime.tv_nsec = 0;
+ inode_set_ctime(inode,
+ iso_date(rr->u.TF.times[cnt++].time, 0),
+ 0);
}
break;
case SIG('S', 'L'):
@@ -534,7 +532,7 @@ repeat:
inode->i_size = reloc->i_size;
inode->i_blocks = reloc->i_blocks;
inode->i_atime = reloc->i_atime;
- inode->i_ctime = reloc->i_ctime;
+ inode_set_ctime_to_ts(inode, inode_get_ctime(reloc));
inode->i_mtime = reloc->i_mtime;
iput(reloc);
break;
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 9ec91017a7f3..118699fff2f9 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -41,18 +41,6 @@ static inline void __buffer_unlink(struct journal_head *jh)
}
/*
- * Check a checkpoint buffer could be release or not.
- *
- * Requires j_list_lock
- */
-static inline bool __cp_buffer_busy(struct journal_head *jh)
-{
- struct buffer_head *bh = jh2bh(jh);
-
- return (jh->b_transaction || buffer_locked(bh) || buffer_dirty(bh));
-}
-
-/*
* __jbd2_log_wait_for_space: wait until there is space in the journal.
*
* Called under j-state_lock *only*. It will be unlocked if we have to wait
@@ -349,6 +337,8 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
/* Checkpoint list management */
+enum shrink_type {SHRINK_DESTROY, SHRINK_BUSY_STOP, SHRINK_BUSY_SKIP};
+
/*
* journal_shrink_one_cp_list
*
@@ -360,7 +350,8 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
* Called with j_list_lock held.
*/
static unsigned long journal_shrink_one_cp_list(struct journal_head *jh,
- bool destroy, bool *released)
+ enum shrink_type type,
+ bool *released)
{
struct journal_head *last_jh;
struct journal_head *next_jh = jh;
@@ -376,12 +367,15 @@ static unsigned long journal_shrink_one_cp_list(struct journal_head *jh,
jh = next_jh;
next_jh = jh->b_cpnext;
- if (destroy) {
+ if (type == SHRINK_DESTROY) {
ret = __jbd2_journal_remove_checkpoint(jh);
} else {
ret = jbd2_journal_try_remove_checkpoint(jh);
- if (ret < 0)
- continue;
+ if (ret < 0) {
+ if (type == SHRINK_BUSY_SKIP)
+ continue;
+ break;
+ }
}
nr_freed++;
@@ -445,7 +439,7 @@ again:
tid = transaction->t_tid;
freed = journal_shrink_one_cp_list(transaction->t_checkpoint_list,
- false, &released);
+ SHRINK_BUSY_SKIP, &released);
nr_freed += freed;
(*nr_to_scan) -= min(*nr_to_scan, freed);
if (*nr_to_scan == 0)
@@ -485,19 +479,21 @@ out:
void __jbd2_journal_clean_checkpoint_list(journal_t *journal, bool destroy)
{
transaction_t *transaction, *last_transaction, *next_transaction;
+ enum shrink_type type;
bool released;
transaction = journal->j_checkpoint_transactions;
if (!transaction)
return;
+ type = destroy ? SHRINK_DESTROY : SHRINK_BUSY_STOP;
last_transaction = transaction->t_cpprev;
next_transaction = transaction;
do {
transaction = next_transaction;
next_transaction = transaction->t_cpnext;
journal_shrink_one_cp_list(transaction->t_checkpoint_list,
- destroy, &released);
+ type, &released);
/*
* This function only frees up some memory if possible so we
* dont have an obligation to finish processing. Bail out if
@@ -631,6 +627,8 @@ int jbd2_journal_try_remove_checkpoint(struct journal_head *jh)
{
struct buffer_head *bh = jh2bh(jh);
+ if (jh->b_transaction)
+ return -EBUSY;
if (!trylock_buffer(bh))
return -EBUSY;
if (buffer_dirty(bh)) {
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 1073259902a6..8d6f934c3d95 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -298,14 +298,12 @@ static int journal_finish_inode_data_buffers(journal_t *journal,
static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
{
- struct page *page = bh->b_page;
char *addr;
__u32 checksum;
- addr = kmap_atomic(page);
- checksum = crc32_be(crc32_sum,
- (void *)(addr + offset_in_page(bh->b_data)), bh->b_size);
- kunmap_atomic(addr);
+ addr = kmap_local_folio(bh->b_folio, bh_offset(bh));
+ checksum = crc32_be(crc32_sum, addr, bh->b_size);
+ kunmap_local(addr);
return checksum;
}
@@ -322,7 +320,6 @@ static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag,
struct buffer_head *bh, __u32 sequence)
{
journal_block_tag3_t *tag3 = (journal_block_tag3_t *)tag;
- struct page *page = bh->b_page;
__u8 *addr;
__u32 csum32;
__be32 seq;
@@ -331,11 +328,10 @@ static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag,
return;
seq = cpu_to_be32(sequence);
- addr = kmap_atomic(page);
+ addr = kmap_local_folio(bh->b_folio, bh_offset(bh));
csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq));
- csum32 = jbd2_chksum(j, csum32, addr + offset_in_page(bh->b_data),
- bh->b_size);
- kunmap_atomic(addr);
+ csum32 = jbd2_chksum(j, csum32, addr, bh->b_size);
+ kunmap_local(addr);
if (jbd2_has_feature_csum3(j))
tag3->t_checksum = cpu_to_be32(csum32);
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index fbce16fedaa4..30dec2bd2ecc 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -115,14 +115,6 @@ void __jbd2_debug(int level, const char *file, const char *func,
#endif
/* Checksumming functions */
-static int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb)
-{
- if (!jbd2_journal_has_csum_v2or3_feature(j))
- return 1;
-
- return sb->s_checksum_type == JBD2_CRC32C_CHKSUM;
-}
-
static __be32 jbd2_superblock_csum(journal_t *j, journal_superblock_t *sb)
{
__u32 csum;
@@ -341,7 +333,7 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
int do_escape = 0;
char *mapped_data;
struct buffer_head *new_bh;
- struct page *new_page;
+ struct folio *new_folio;
unsigned int new_offset;
struct buffer_head *bh_in = jh2bh(jh_in);
journal_t *journal = transaction->t_journal;
@@ -370,14 +362,14 @@ repeat:
*/
if (jh_in->b_frozen_data) {
done_copy_out = 1;
- new_page = virt_to_page(jh_in->b_frozen_data);
- new_offset = offset_in_page(jh_in->b_frozen_data);
+ new_folio = virt_to_folio(jh_in->b_frozen_data);
+ new_offset = offset_in_folio(new_folio, jh_in->b_frozen_data);
} else {
- new_page = jh2bh(jh_in)->b_page;
- new_offset = offset_in_page(jh2bh(jh_in)->b_data);
+ new_folio = jh2bh(jh_in)->b_folio;
+ new_offset = offset_in_folio(new_folio, jh2bh(jh_in)->b_data);
}
- mapped_data = kmap_atomic(new_page);
+ mapped_data = kmap_local_folio(new_folio, new_offset);
/*
* Fire data frozen trigger if data already wasn't frozen. Do this
* before checking for escaping, as the trigger may modify the magic
@@ -385,18 +377,17 @@ repeat:
* data in the buffer.
*/
if (!done_copy_out)
- jbd2_buffer_frozen_trigger(jh_in, mapped_data + new_offset,
+ jbd2_buffer_frozen_trigger(jh_in, mapped_data,
jh_in->b_triggers);
/*
* Check for escaping
*/
- if (*((__be32 *)(mapped_data + new_offset)) ==
- cpu_to_be32(JBD2_MAGIC_NUMBER)) {
+ if (*((__be32 *)mapped_data) == cpu_to_be32(JBD2_MAGIC_NUMBER)) {
need_copy_out = 1;
do_escape = 1;
}
- kunmap_atomic(mapped_data);
+ kunmap_local(mapped_data);
/*
* Do we need to do a data copy?
@@ -417,12 +408,10 @@ repeat:
}
jh_in->b_frozen_data = tmp;
- mapped_data = kmap_atomic(new_page);
- memcpy(tmp, mapped_data + new_offset, bh_in->b_size);
- kunmap_atomic(mapped_data);
+ memcpy_from_folio(tmp, new_folio, new_offset, bh_in->b_size);
- new_page = virt_to_page(tmp);
- new_offset = offset_in_page(tmp);
+ new_folio = virt_to_folio(tmp);
+ new_offset = offset_in_folio(new_folio, tmp);
done_copy_out = 1;
/*
@@ -438,12 +427,12 @@ repeat:
* copying, we can finally do so.
*/
if (do_escape) {
- mapped_data = kmap_atomic(new_page);
- *((unsigned int *)(mapped_data + new_offset)) = 0;
- kunmap_atomic(mapped_data);
+ mapped_data = kmap_local_folio(new_folio, new_offset);
+ *((unsigned int *)mapped_data) = 0;
+ kunmap_local(mapped_data);
}
- set_bh_page(new_bh, new_page, new_offset);
+ folio_set_bh(new_bh, new_folio, new_offset);
new_bh->b_size = bh_in->b_size;
new_bh->b_bdev = journal->j_dev;
new_bh->b_blocknr = blocknr;
@@ -1337,6 +1326,189 @@ static unsigned long jbd2_journal_shrink_count(struct shrinker *shrink,
}
/*
+ * If the journal init or create aborts, we need to mark the journal
+ * superblock as being NULL to prevent the journal destroy from writing
+ * back a bogus superblock.
+ */
+static void journal_fail_superblock(journal_t *journal)
+{
+ struct buffer_head *bh = journal->j_sb_buffer;
+ brelse(bh);
+ journal->j_sb_buffer = NULL;
+}
+
+/*
+ * Check the superblock for a given journal, performing initial
+ * validation of the format.
+ */
+static int journal_check_superblock(journal_t *journal)
+{
+ journal_superblock_t *sb = journal->j_superblock;
+ int num_fc_blks;
+ int err = -EINVAL;
+
+ if (sb->s_header.h_magic != cpu_to_be32(JBD2_MAGIC_NUMBER) ||
+ sb->s_blocksize != cpu_to_be32(journal->j_blocksize)) {
+ printk(KERN_WARNING "JBD2: no valid journal superblock found\n");
+ return err;
+ }
+
+ if (be32_to_cpu(sb->s_header.h_blocktype) != JBD2_SUPERBLOCK_V1 &&
+ be32_to_cpu(sb->s_header.h_blocktype) != JBD2_SUPERBLOCK_V2) {
+ printk(KERN_WARNING "JBD2: unrecognised superblock format ID\n");
+ return err;
+ }
+
+ if (be32_to_cpu(sb->s_maxlen) > journal->j_total_len) {
+ printk(KERN_WARNING "JBD2: journal file too short\n");
+ return err;
+ }
+
+ if (be32_to_cpu(sb->s_first) == 0 ||
+ be32_to_cpu(sb->s_first) >= journal->j_total_len) {
+ printk(KERN_WARNING
+ "JBD2: Invalid start block of journal: %u\n",
+ be32_to_cpu(sb->s_first));
+ return err;
+ }
+
+ /*
+ * If this is a V2 superblock, then we have to check the
+ * features flags on it.
+ */
+ if (!jbd2_format_support_feature(journal))
+ return 0;
+
+ if ((sb->s_feature_ro_compat &
+ ~cpu_to_be32(JBD2_KNOWN_ROCOMPAT_FEATURES)) ||
+ (sb->s_feature_incompat &
+ ~cpu_to_be32(JBD2_KNOWN_INCOMPAT_FEATURES))) {
+ printk(KERN_WARNING "JBD2: Unrecognised features on journal\n");
+ return err;
+ }
+
+ num_fc_blks = jbd2_has_feature_fast_commit(journal) ?
+ jbd2_journal_get_num_fc_blks(sb) : 0;
+ if (be32_to_cpu(sb->s_maxlen) < JBD2_MIN_JOURNAL_BLOCKS ||
+ be32_to_cpu(sb->s_maxlen) - JBD2_MIN_JOURNAL_BLOCKS < num_fc_blks) {
+ printk(KERN_ERR "JBD2: journal file too short %u,%d\n",
+ be32_to_cpu(sb->s_maxlen), num_fc_blks);
+ return err;
+ }
+
+ if (jbd2_has_feature_csum2(journal) &&
+ jbd2_has_feature_csum3(journal)) {
+ /* Can't have checksum v2 and v3 at the same time! */
+ printk(KERN_ERR "JBD2: Can't enable checksumming v2 and v3 "
+ "at the same time!\n");
+ return err;
+ }
+
+ if (jbd2_journal_has_csum_v2or3_feature(journal) &&
+ jbd2_has_feature_checksum(journal)) {
+ /* Can't have checksum v1 and v2 on at the same time! */
+ printk(KERN_ERR "JBD2: Can't enable checksumming v1 and v2/3 "
+ "at the same time!\n");
+ return err;
+ }
+
+ /* Load the checksum driver */
+ if (jbd2_journal_has_csum_v2or3_feature(journal)) {
+ if (sb->s_checksum_type != JBD2_CRC32C_CHKSUM) {
+ printk(KERN_ERR "JBD2: Unknown checksum type\n");
+ return err;
+ }
+
+ journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
+ if (IS_ERR(journal->j_chksum_driver)) {
+ printk(KERN_ERR "JBD2: Cannot load crc32c driver.\n");
+ err = PTR_ERR(journal->j_chksum_driver);
+ journal->j_chksum_driver = NULL;
+ return err;
+ }
+ /* Check superblock checksum */
+ if (sb->s_checksum != jbd2_superblock_csum(journal, sb)) {
+ printk(KERN_ERR "JBD2: journal checksum error\n");
+ err = -EFSBADCRC;
+ return err;
+ }
+ }
+
+ return 0;
+}
+
+static int journal_revoke_records_per_block(journal_t *journal)
+{
+ int record_size;
+ int space = journal->j_blocksize - sizeof(jbd2_journal_revoke_header_t);
+
+ if (jbd2_has_feature_64bit(journal))
+ record_size = 8;
+ else
+ record_size = 4;
+
+ if (jbd2_journal_has_csum_v2or3(journal))
+ space -= sizeof(struct jbd2_journal_block_tail);
+ return space / record_size;
+}
+
+/*
+ * Load the on-disk journal superblock and read the key fields into the
+ * journal_t.
+ */
+static int journal_load_superblock(journal_t *journal)
+{
+ int err;
+ struct buffer_head *bh;
+ journal_superblock_t *sb;
+
+ bh = getblk_unmovable(journal->j_dev, journal->j_blk_offset,
+ journal->j_blocksize);
+ if (bh)
+ err = bh_read(bh, 0);
+ if (!bh || err < 0) {
+ pr_err("%s: Cannot read journal superblock\n", __func__);
+ brelse(bh);
+ return -EIO;
+ }
+
+ journal->j_sb_buffer = bh;
+ sb = (journal_superblock_t *)bh->b_data;
+ journal->j_superblock = sb;
+ err = journal_check_superblock(journal);
+ if (err) {
+ journal_fail_superblock(journal);
+ return err;
+ }
+
+ journal->j_tail_sequence = be32_to_cpu(sb->s_sequence);
+ journal->j_tail = be32_to_cpu(sb->s_start);
+ journal->j_first = be32_to_cpu(sb->s_first);
+ journal->j_errno = be32_to_cpu(sb->s_errno);
+ journal->j_last = be32_to_cpu(sb->s_maxlen);
+
+ if (be32_to_cpu(sb->s_maxlen) < journal->j_total_len)
+ journal->j_total_len = be32_to_cpu(sb->s_maxlen);
+ /* Precompute checksum seed for all metadata */
+ if (jbd2_journal_has_csum_v2or3(journal))
+ journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid,
+ sizeof(sb->s_uuid));
+ journal->j_revoke_records_per_block =
+ journal_revoke_records_per_block(journal);
+
+ if (jbd2_has_feature_fast_commit(journal)) {
+ journal->j_fc_last = be32_to_cpu(sb->s_maxlen);
+ journal->j_last = journal->j_fc_last -
+ jbd2_journal_get_num_fc_blks(sb);
+ journal->j_fc_first = journal->j_last + 1;
+ journal->j_fc_off = 0;
+ }
+
+ return 0;
+}
+
+
+/*
* Management for journal control blocks: functions to create and
* destroy journal_t structures, and to initialise and read existing
* journal blocks from disk. */
@@ -1352,12 +1524,21 @@ static journal_t *journal_init_common(struct block_device *bdev,
static struct lock_class_key jbd2_trans_commit_key;
journal_t *journal;
int err;
- struct buffer_head *bh;
int n;
journal = kzalloc(sizeof(*journal), GFP_KERNEL);
if (!journal)
- return NULL;
+ return ERR_PTR(-ENOMEM);
+
+ journal->j_blocksize = blocksize;
+ journal->j_dev = bdev;
+ journal->j_fs_dev = fs_dev;
+ journal->j_blk_offset = start;
+ journal->j_total_len = len;
+
+ err = journal_load_superblock(journal);
+ if (err)
+ goto err_cleanup;
init_waitqueue_head(&journal->j_wait_transaction_locked);
init_waitqueue_head(&journal->j_wait_done_commit);
@@ -1370,12 +1551,15 @@ static journal_t *journal_init_common(struct block_device *bdev,
mutex_init(&journal->j_checkpoint_mutex);
spin_lock_init(&journal->j_revoke_lock);
spin_lock_init(&journal->j_list_lock);
+ spin_lock_init(&journal->j_history_lock);
rwlock_init(&journal->j_state_lock);
journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE);
journal->j_min_batch_time = 0;
journal->j_max_batch_time = 15000; /* 15ms */
atomic_set(&journal->j_reserved_credits, 0);
+ lockdep_init_map(&journal->j_trans_commit_map, "jbd2_handle",
+ &jbd2_trans_commit_key, 0);
/* The journal is marked for error until we succeed with recovery! */
journal->j_flags = JBD2_ABORT;
@@ -1385,18 +1569,11 @@ static journal_t *journal_init_common(struct block_device *bdev,
if (err)
goto err_cleanup;
- spin_lock_init(&journal->j_history_lock);
-
- lockdep_init_map(&journal->j_trans_commit_map, "jbd2_handle",
- &jbd2_trans_commit_key, 0);
-
- /* journal descriptor can store up to n blocks -bzzz */
- journal->j_blocksize = blocksize;
- journal->j_dev = bdev;
- journal->j_fs_dev = fs_dev;
- journal->j_blk_offset = start;
- journal->j_total_len = len;
- /* We need enough buffers to write out full descriptor block. */
+ /*
+ * journal descriptor can store up to n blocks, we need enough
+ * buffers to write out full descriptor block.
+ */
+ err = -ENOMEM;
n = journal->j_blocksize / jbd2_min_tag_size();
journal->j_wbufsize = n;
journal->j_fc_wbuf = NULL;
@@ -1405,37 +1582,32 @@ static journal_t *journal_init_common(struct block_device *bdev,
if (!journal->j_wbuf)
goto err_cleanup;
- bh = getblk_unmovable(journal->j_dev, start, journal->j_blocksize);
- if (!bh) {
- pr_err("%s: Cannot get buffer for journal superblock\n",
- __func__);
+ err = percpu_counter_init(&journal->j_checkpoint_jh_count, 0,
+ GFP_KERNEL);
+ if (err)
goto err_cleanup;
- }
- journal->j_sb_buffer = bh;
- journal->j_superblock = (journal_superblock_t *)bh->b_data;
journal->j_shrink_transaction = NULL;
journal->j_shrinker.scan_objects = jbd2_journal_shrink_scan;
journal->j_shrinker.count_objects = jbd2_journal_shrink_count;
journal->j_shrinker.seeks = DEFAULT_SEEKS;
journal->j_shrinker.batch = journal->j_max_transaction_buffers;
-
- if (percpu_counter_init(&journal->j_checkpoint_jh_count, 0, GFP_KERNEL))
+ err = register_shrinker(&journal->j_shrinker, "jbd2-journal:(%u:%u)",
+ MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev));
+ if (err)
goto err_cleanup;
- if (register_shrinker(&journal->j_shrinker, "jbd2-journal:(%u:%u)",
- MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev))) {
- percpu_counter_destroy(&journal->j_checkpoint_jh_count);
- goto err_cleanup;
- }
return journal;
err_cleanup:
- brelse(journal->j_sb_buffer);
+ percpu_counter_destroy(&journal->j_checkpoint_jh_count);
+ if (journal->j_chksum_driver)
+ crypto_free_shash(journal->j_chksum_driver);
kfree(journal->j_wbuf);
jbd2_journal_destroy_revoke(journal);
+ journal_fail_superblock(journal);
kfree(journal);
- return NULL;
+ return ERR_PTR(err);
}
/* jbd2_journal_init_dev and jbd2_journal_init_inode:
@@ -1468,8 +1640,8 @@ journal_t *jbd2_journal_init_dev(struct block_device *bdev,
journal_t *journal;
journal = journal_init_common(bdev, fs_dev, start, len, blocksize);
- if (!journal)
- return NULL;
+ if (IS_ERR(journal))
+ return ERR_CAST(journal);
snprintf(journal->j_devname, sizeof(journal->j_devname),
"%pg", journal->j_dev);
@@ -1495,11 +1667,9 @@ journal_t *jbd2_journal_init_inode(struct inode *inode)
blocknr = 0;
err = bmap(inode, &blocknr);
-
if (err || !blocknr) {
- pr_err("%s: Cannot locate journal superblock\n",
- __func__);
- return NULL;
+ pr_err("%s: Cannot locate journal superblock\n", __func__);
+ return err ? ERR_PTR(err) : ERR_PTR(-EINVAL);
}
jbd2_debug(1, "JBD2: inode %s/%ld, size %lld, bits %d, blksize %ld\n",
@@ -1509,8 +1679,8 @@ journal_t *jbd2_journal_init_inode(struct inode *inode)
journal = journal_init_common(inode->i_sb->s_bdev, inode->i_sb->s_bdev,
blocknr, inode->i_size >> inode->i_sb->s_blocksize_bits,
inode->i_sb->s_blocksize);
- if (!journal)
- return NULL;
+ if (IS_ERR(journal))
+ return ERR_CAST(journal);
journal->j_inode = inode;
snprintf(journal->j_devname, sizeof(journal->j_devname),
@@ -1522,18 +1692,6 @@ journal_t *jbd2_journal_init_inode(struct inode *inode)
}
/*
- * If the journal init or create aborts, we need to mark the journal
- * superblock as being NULL to prevent the journal destroy from writing
- * back a bogus superblock.
- */
-static void journal_fail_superblock(journal_t *journal)
-{
- struct buffer_head *bh = journal->j_sb_buffer;
- brelse(bh);
- journal->j_sb_buffer = NULL;
-}
-
-/*
* Given a journal_t structure, initialise the various fields for
* startup of a new journaling session. We use this both when creating
* a journal, and after recovering an old journal to reset it for
@@ -1889,163 +2047,6 @@ void jbd2_journal_update_sb_errno(journal_t *journal)
}
EXPORT_SYMBOL(jbd2_journal_update_sb_errno);
-static int journal_revoke_records_per_block(journal_t *journal)
-{
- int record_size;
- int space = journal->j_blocksize - sizeof(jbd2_journal_revoke_header_t);
-
- if (jbd2_has_feature_64bit(journal))
- record_size = 8;
- else
- record_size = 4;
-
- if (jbd2_journal_has_csum_v2or3(journal))
- space -= sizeof(struct jbd2_journal_block_tail);
- return space / record_size;
-}
-
-/*
- * Read the superblock for a given journal, performing initial
- * validation of the format.
- */
-static int journal_get_superblock(journal_t *journal)
-{
- struct buffer_head *bh;
- journal_superblock_t *sb;
- int err;
-
- bh = journal->j_sb_buffer;
-
- J_ASSERT(bh != NULL);
- if (buffer_verified(bh))
- return 0;
-
- err = bh_read(bh, 0);
- if (err < 0) {
- printk(KERN_ERR
- "JBD2: IO error reading journal superblock\n");
- goto out;
- }
-
- sb = journal->j_superblock;
-
- err = -EINVAL;
-
- if (sb->s_header.h_magic != cpu_to_be32(JBD2_MAGIC_NUMBER) ||
- sb->s_blocksize != cpu_to_be32(journal->j_blocksize)) {
- printk(KERN_WARNING "JBD2: no valid journal superblock found\n");
- goto out;
- }
-
- if (be32_to_cpu(sb->s_header.h_blocktype) != JBD2_SUPERBLOCK_V1 &&
- be32_to_cpu(sb->s_header.h_blocktype) != JBD2_SUPERBLOCK_V2) {
- printk(KERN_WARNING "JBD2: unrecognised superblock format ID\n");
- goto out;
- }
-
- if (be32_to_cpu(sb->s_maxlen) > journal->j_total_len) {
- printk(KERN_WARNING "JBD2: journal file too short\n");
- goto out;
- }
-
- if (be32_to_cpu(sb->s_first) == 0 ||
- be32_to_cpu(sb->s_first) >= journal->j_total_len) {
- printk(KERN_WARNING
- "JBD2: Invalid start block of journal: %u\n",
- be32_to_cpu(sb->s_first));
- goto out;
- }
-
- if (jbd2_has_feature_csum2(journal) &&
- jbd2_has_feature_csum3(journal)) {
- /* Can't have checksum v2 and v3 at the same time! */
- printk(KERN_ERR "JBD2: Can't enable checksumming v2 and v3 "
- "at the same time!\n");
- goto out;
- }
-
- if (jbd2_journal_has_csum_v2or3_feature(journal) &&
- jbd2_has_feature_checksum(journal)) {
- /* Can't have checksum v1 and v2 on at the same time! */
- printk(KERN_ERR "JBD2: Can't enable checksumming v1 and v2/3 "
- "at the same time!\n");
- goto out;
- }
-
- if (!jbd2_verify_csum_type(journal, sb)) {
- printk(KERN_ERR "JBD2: Unknown checksum type\n");
- goto out;
- }
-
- /* Load the checksum driver */
- if (jbd2_journal_has_csum_v2or3_feature(journal)) {
- journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
- if (IS_ERR(journal->j_chksum_driver)) {
- printk(KERN_ERR "JBD2: Cannot load crc32c driver.\n");
- err = PTR_ERR(journal->j_chksum_driver);
- journal->j_chksum_driver = NULL;
- goto out;
- }
- /* Check superblock checksum */
- if (sb->s_checksum != jbd2_superblock_csum(journal, sb)) {
- printk(KERN_ERR "JBD2: journal checksum error\n");
- err = -EFSBADCRC;
- goto out;
- }
- }
- set_buffer_verified(bh);
- return 0;
-
-out:
- journal_fail_superblock(journal);
- return err;
-}
-
-/*
- * Load the on-disk journal superblock and read the key fields into the
- * journal_t.
- */
-
-static int load_superblock(journal_t *journal)
-{
- int err;
- journal_superblock_t *sb;
- int num_fc_blocks;
-
- err = journal_get_superblock(journal);
- if (err)
- return err;
-
- sb = journal->j_superblock;
-
- journal->j_tail_sequence = be32_to_cpu(sb->s_sequence);
- journal->j_tail = be32_to_cpu(sb->s_start);
- journal->j_first = be32_to_cpu(sb->s_first);
- journal->j_errno = be32_to_cpu(sb->s_errno);
- journal->j_last = be32_to_cpu(sb->s_maxlen);
-
- if (be32_to_cpu(sb->s_maxlen) < journal->j_total_len)
- journal->j_total_len = be32_to_cpu(sb->s_maxlen);
- /* Precompute checksum seed for all metadata */
- if (jbd2_journal_has_csum_v2or3(journal))
- journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid,
- sizeof(sb->s_uuid));
- journal->j_revoke_records_per_block =
- journal_revoke_records_per_block(journal);
-
- if (jbd2_has_feature_fast_commit(journal)) {
- journal->j_fc_last = be32_to_cpu(sb->s_maxlen);
- num_fc_blocks = jbd2_journal_get_num_fc_blks(sb);
- if (journal->j_last - num_fc_blocks >= JBD2_MIN_JOURNAL_BLOCKS)
- journal->j_last = journal->j_fc_last - num_fc_blocks;
- journal->j_fc_first = journal->j_last + 1;
- journal->j_fc_off = 0;
- }
-
- return 0;
-}
-
-
/**
* jbd2_journal_load() - Read journal from disk.
* @journal: Journal to act on.
@@ -2057,28 +2058,7 @@ static int load_superblock(journal_t *journal)
int jbd2_journal_load(journal_t *journal)
{
int err;
- journal_superblock_t *sb;
-
- err = load_superblock(journal);
- if (err)
- return err;
-
- sb = journal->j_superblock;
-
- /*
- * If this is a V2 superblock, then we have to check the
- * features flags on it.
- */
- if (jbd2_format_support_feature(journal)) {
- if ((sb->s_feature_ro_compat &
- ~cpu_to_be32(JBD2_KNOWN_ROCOMPAT_FEATURES)) ||
- (sb->s_feature_incompat &
- ~cpu_to_be32(JBD2_KNOWN_INCOMPAT_FEATURES))) {
- printk(KERN_WARNING
- "JBD2: Unrecognised features on journal\n");
- return -EINVAL;
- }
- }
+ journal_superblock_t *sb = journal->j_superblock;
/*
* Create a slab for this blocksize
@@ -2089,8 +2069,11 @@ int jbd2_journal_load(journal_t *journal)
/* Let the recovery code check whether it needs to recover any
* data from the journal. */
- if (jbd2_journal_recover(journal))
- goto recovery_error;
+ err = jbd2_journal_recover(journal);
+ if (err) {
+ pr_warn("JBD2: journal recovery failed\n");
+ return err;
+ }
if (journal->j_failed_commit) {
printk(KERN_ERR "JBD2: journal transaction %u on %s "
@@ -2107,15 +2090,14 @@ int jbd2_journal_load(journal_t *journal)
/* OK, we've finished with the dynamic journal bits:
* reinitialise the dynamic contents of the superblock in memory
* and reset them on disk. */
- if (journal_reset(journal))
- goto recovery_error;
+ err = journal_reset(journal);
+ if (err) {
+ pr_warn("JBD2: journal reset failed\n");
+ return err;
+ }
journal->j_flags |= JBD2_LOADED;
return 0;
-
-recovery_error:
- printk(KERN_WARNING "JBD2: recovery failed\n");
- return -EIO;
}
/**
@@ -2227,8 +2209,6 @@ int jbd2_journal_check_used_features(journal_t *journal, unsigned long compat,
if (!compat && !ro && !incompat)
return 1;
- if (journal_get_superblock(journal))
- return 0;
if (!jbd2_format_support_feature(journal))
return 0;
@@ -2518,16 +2498,12 @@ out:
int jbd2_journal_wipe(journal_t *journal, int write)
{
- int err = 0;
+ int err;
J_ASSERT (!(journal->j_flags & JBD2_LOADED));
- err = load_superblock(journal);
- if (err)
- return err;
-
if (!journal->j_tail)
- goto no_recovery;
+ return 0;
printk(KERN_WARNING "JBD2: %s recovery information on journal\n",
write ? "Clearing" : "Ignoring");
@@ -2540,7 +2516,6 @@ int jbd2_journal_wipe(journal_t *journal, int write)
mutex_unlock(&journal->j_checkpoint_mutex);
}
- no_recovery:
return err;
}
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 0184931d47f7..c269a7d29a46 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -230,12 +230,8 @@ static int count_tags(journal_t *journal, struct buffer_head *bh)
/* Make sure we wrap around the log correctly! */
#define wrap(journal, var) \
do { \
- unsigned long _wrap_last = \
- jbd2_has_feature_fast_commit(journal) ? \
- (journal)->j_fc_last : (journal)->j_last; \
- \
- if (var >= _wrap_last) \
- var -= (_wrap_last - (journal)->j_first); \
+ if (var >= (journal)->j_last) \
+ var -= ((journal)->j_last - (journal)->j_first); \
} while (0)
static int fc_do_one_pass(journal_t *journal,
@@ -524,9 +520,7 @@ static int do_one_pass(journal_t *journal,
break;
jbd2_debug(2, "Scanning for sequence ID %u at %lu/%lu\n",
- next_commit_ID, next_log_block,
- jbd2_has_feature_fast_commit(journal) ?
- journal->j_fc_last : journal->j_last);
+ next_commit_ID, next_log_block, journal->j_last);
/* Skip over each chunk of the transaction looking
* either the next descriptor block or the final commit
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 4d1fda1f7143..5f08b5fd105a 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -935,19 +935,15 @@ static void warn_dirty_buffer(struct buffer_head *bh)
/* Call t_frozen trigger and copy buffer data into jh->b_frozen_data. */
static void jbd2_freeze_jh_data(struct journal_head *jh)
{
- struct page *page;
- int offset;
char *source;
struct buffer_head *bh = jh2bh(jh);
J_EXPECT_JH(jh, buffer_uptodate(bh), "Possible IO failure.\n");
- page = bh->b_page;
- offset = offset_in_page(bh->b_data);
- source = kmap_atomic(page);
+ source = kmap_local_folio(bh->b_folio, bh_offset(bh));
/* Fire data frozen trigger just before we copy the data */
- jbd2_buffer_frozen_trigger(jh, source + offset, jh->b_triggers);
- memcpy(jh->b_frozen_data, source + offset, bh->b_size);
- kunmap_atomic(source);
+ jbd2_buffer_frozen_trigger(jh, source, jh->b_triggers);
+ memcpy(jh->b_frozen_data, source, bh->b_size);
+ kunmap_local(source);
/*
* Now that the frozen data is saved off, we need to store any matching
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index 5075a0a6d594..091ab0eaabbe 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -204,7 +204,8 @@ static int jffs2_create(struct mnt_idmap *idmap, struct inode *dir_i,
if (ret)
goto fail;
- dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(ri->ctime));
+ dir_i->i_mtime = inode_set_ctime_to_ts(dir_i,
+ ITIME(je32_to_cpu(ri->ctime)));
jffs2_free_raw_inode(ri);
@@ -237,7 +238,7 @@ static int jffs2_unlink(struct inode *dir_i, struct dentry *dentry)
if (dead_f->inocache)
set_nlink(d_inode(dentry), dead_f->inocache->pino_nlink);
if (!ret)
- dir_i->i_mtime = dir_i->i_ctime = ITIME(now);
+ dir_i->i_mtime = inode_set_ctime_to_ts(dir_i, ITIME(now));
return ret;
}
/***********************************************************************/
@@ -271,7 +272,7 @@ static int jffs2_link (struct dentry *old_dentry, struct inode *dir_i, struct de
set_nlink(d_inode(old_dentry), ++f->inocache->pino_nlink);
mutex_unlock(&f->sem);
d_instantiate(dentry, d_inode(old_dentry));
- dir_i->i_mtime = dir_i->i_ctime = ITIME(now);
+ dir_i->i_mtime = inode_set_ctime_to_ts(dir_i, ITIME(now));
ihold(d_inode(old_dentry));
}
return ret;
@@ -422,7 +423,8 @@ static int jffs2_symlink (struct mnt_idmap *idmap, struct inode *dir_i,
goto fail;
}
- dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(rd->mctime));
+ dir_i->i_mtime = inode_set_ctime_to_ts(dir_i,
+ ITIME(je32_to_cpu(rd->mctime)));
jffs2_free_raw_dirent(rd);
@@ -566,7 +568,8 @@ static int jffs2_mkdir (struct mnt_idmap *idmap, struct inode *dir_i,
goto fail;
}
- dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(rd->mctime));
+ dir_i->i_mtime = inode_set_ctime_to_ts(dir_i,
+ ITIME(je32_to_cpu(rd->mctime)));
inc_nlink(dir_i);
jffs2_free_raw_dirent(rd);
@@ -607,7 +610,7 @@ static int jffs2_rmdir (struct inode *dir_i, struct dentry *dentry)
ret = jffs2_do_unlink(c, dir_f, dentry->d_name.name,
dentry->d_name.len, f, now);
if (!ret) {
- dir_i->i_mtime = dir_i->i_ctime = ITIME(now);
+ dir_i->i_mtime = inode_set_ctime_to_ts(dir_i, ITIME(now));
clear_nlink(d_inode(dentry));
drop_nlink(dir_i);
}
@@ -743,7 +746,8 @@ static int jffs2_mknod (struct mnt_idmap *idmap, struct inode *dir_i,
goto fail;
}
- dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(rd->mctime));
+ dir_i->i_mtime = inode_set_ctime_to_ts(dir_i,
+ ITIME(je32_to_cpu(rd->mctime)));
jffs2_free_raw_dirent(rd);
@@ -864,14 +868,16 @@ static int jffs2_rename (struct mnt_idmap *idmap,
* caller won't do it on its own since we are returning an error.
*/
d_invalidate(new_dentry);
- new_dir_i->i_mtime = new_dir_i->i_ctime = ITIME(now);
+ new_dir_i->i_mtime = inode_set_ctime_to_ts(new_dir_i,
+ ITIME(now));
return ret;
}
if (d_is_dir(old_dentry))
drop_nlink(old_dir_i);
- new_dir_i->i_mtime = new_dir_i->i_ctime = old_dir_i->i_mtime = old_dir_i->i_ctime = ITIME(now);
+ old_dir_i->i_mtime = inode_set_ctime_to_ts(old_dir_i, ITIME(now));
+ new_dir_i->i_mtime = inode_set_ctime_to_ts(new_dir_i, ITIME(now));
return 0;
}
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index 2345ca3f09ee..11c66793960e 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -317,7 +317,8 @@ static int jffs2_write_end(struct file *filp, struct address_space *mapping,
inode->i_size = pos + writtenlen;
inode->i_blocks = (inode->i_size + 511) >> 9;
- inode->i_ctime = inode->i_mtime = ITIME(je32_to_cpu(ri->ctime));
+ inode->i_mtime = inode_set_ctime_to_ts(inode,
+ ITIME(je32_to_cpu(ri->ctime)));
}
}
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 038516bee1ab..0403efab4089 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -115,7 +115,7 @@ int jffs2_do_setattr (struct inode *inode, struct iattr *iattr)
ri->isize = cpu_to_je32((ivalid & ATTR_SIZE)?iattr->ia_size:inode->i_size);
ri->atime = cpu_to_je32(I_SEC((ivalid & ATTR_ATIME)?iattr->ia_atime:inode->i_atime));
ri->mtime = cpu_to_je32(I_SEC((ivalid & ATTR_MTIME)?iattr->ia_mtime:inode->i_mtime));
- ri->ctime = cpu_to_je32(I_SEC((ivalid & ATTR_CTIME)?iattr->ia_ctime:inode->i_ctime));
+ ri->ctime = cpu_to_je32(I_SEC((ivalid & ATTR_CTIME)?iattr->ia_ctime:inode_get_ctime(inode)));
ri->offset = cpu_to_je32(0);
ri->csize = ri->dsize = cpu_to_je32(mdatalen);
@@ -148,7 +148,7 @@ int jffs2_do_setattr (struct inode *inode, struct iattr *iattr)
}
/* It worked. Update the inode */
inode->i_atime = ITIME(je32_to_cpu(ri->atime));
- inode->i_ctime = ITIME(je32_to_cpu(ri->ctime));
+ inode_set_ctime_to_ts(inode, ITIME(je32_to_cpu(ri->ctime)));
inode->i_mtime = ITIME(je32_to_cpu(ri->mtime));
inode->i_mode = jemode_to_cpu(ri->mode);
i_uid_write(inode, je16_to_cpu(ri->uid));
@@ -284,7 +284,7 @@ struct inode *jffs2_iget(struct super_block *sb, unsigned long ino)
inode->i_size = je32_to_cpu(latest_node.isize);
inode->i_atime = ITIME(je32_to_cpu(latest_node.atime));
inode->i_mtime = ITIME(je32_to_cpu(latest_node.mtime));
- inode->i_ctime = ITIME(je32_to_cpu(latest_node.ctime));
+ inode_set_ctime_to_ts(inode, ITIME(je32_to_cpu(latest_node.ctime)));
set_nlink(inode, f->inocache->pino_nlink);
@@ -388,7 +388,7 @@ void jffs2_dirty_inode(struct inode *inode, int flags)
iattr.ia_gid = inode->i_gid;
iattr.ia_atime = inode->i_atime;
iattr.ia_mtime = inode->i_mtime;
- iattr.ia_ctime = inode->i_ctime;
+ iattr.ia_ctime = inode_get_ctime(inode);
jffs2_do_setattr(inode, &iattr);
}
@@ -475,7 +475,7 @@ struct inode *jffs2_new_inode (struct inode *dir_i, umode_t mode, struct jffs2_r
inode->i_mode = jemode_to_cpu(ri->mode);
i_gid_write(inode, je16_to_cpu(ri->gid));
i_uid_write(inode, je16_to_cpu(ri->uid));
- inode->i_atime = inode->i_ctime = inode->i_mtime = current_time(inode);
+ inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
ri->atime = ri->mtime = ri->ctime = cpu_to_je32(I_SEC(inode->i_mtime));
inode->i_blocks = 0;
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h
index 8da19766c101..50727a1ff931 100644
--- a/fs/jffs2/os-linux.h
+++ b/fs/jffs2/os-linux.h
@@ -35,7 +35,7 @@ struct kvec;
#define ITIME(sec) ((struct timespec64){sec, 0})
#define JFFS2_NOW() JFFS2_CLAMP_TIME(ktime_get_real_seconds())
#define I_SEC(tv) JFFS2_CLAMP_TIME((tv).tv_sec)
-#define JFFS2_F_I_CTIME(f) I_SEC(OFNI_EDONI_2SFFJ(f)->i_ctime)
+#define JFFS2_F_I_CTIME(f) I_SEC(inode_get_ctime(OFNI_EDONI_2SFFJ(f)))
#define JFFS2_F_I_MTIME(f) I_SEC(OFNI_EDONI_2SFFJ(f)->i_mtime)
#define JFFS2_F_I_ATIME(f) I_SEC(OFNI_EDONI_2SFFJ(f)->i_atime)
#define sleep_on_spinunlock(wq, s) \
diff --git a/fs/jfs/Kconfig b/fs/jfs/Kconfig
index 51e856f0e4b8..3728cf4d944d 100644
--- a/fs/jfs/Kconfig
+++ b/fs/jfs/Kconfig
@@ -1,7 +1,9 @@
# SPDX-License-Identifier: GPL-2.0-only
config JFS_FS
tristate "JFS filesystem support"
+ select BUFFER_HEAD
select NLS
+ select NLS_UCS2_UTILS
select CRC32
select LEGACY_DIRECT_IO
help
diff --git a/fs/jfs/Makefile b/fs/jfs/Makefile
index 7156d2c218c7..b769bbf8bdc2 100644
--- a/fs/jfs/Makefile
+++ b/fs/jfs/Makefile
@@ -9,7 +9,7 @@ jfs-y := super.o file.o inode.o namei.o jfs_mount.o jfs_umount.o \
jfs_xtree.o jfs_imap.o jfs_debug.o jfs_dmap.o \
jfs_unicode.o jfs_dtree.o jfs_inode.o jfs_discard.o \
jfs_extent.o symlink.o jfs_metapage.o \
- jfs_logmgr.o jfs_txnmgr.o jfs_uniupr.o \
+ jfs_logmgr.o jfs_txnmgr.o \
resize.o xattr.o ioctl.o
jfs-$(CONFIG_JFS_POSIX_ACL) += acl.o
diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c
index fb96f872d207..1de3602c98de 100644
--- a/fs/jfs/acl.c
+++ b/fs/jfs/acl.c
@@ -116,7 +116,7 @@ int jfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
if (!rc) {
if (update_mode) {
inode->i_mode = mode;
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
mark_inode_dirty(inode);
}
rc = txCommit(tid, 1, &inode, 0);
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index 8ac10e396050..920d58a1566b 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -393,7 +393,7 @@ void jfs_truncate_nolock(struct inode *ip, loff_t length)
break;
}
- ip->i_mtime = ip->i_ctime = current_time(ip);
+ ip->i_mtime = inode_set_ctime_current(ip);
mark_inode_dirty(ip);
txCommit(tid, 1, &ip, 0);
diff --git a/fs/jfs/ioctl.c b/fs/jfs/ioctl.c
index ed7989bc2db1..f7bd7e8f5be4 100644
--- a/fs/jfs/ioctl.c
+++ b/fs/jfs/ioctl.c
@@ -96,7 +96,7 @@ int jfs_fileattr_set(struct mnt_idmap *idmap,
jfs_inode->mode2 = flags;
jfs_set_inode_flags(inode);
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
mark_inode_dirty(inode);
return 0;
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index a14a0f18a4c4..88afd108c2dd 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -269,6 +269,7 @@ int dbUnmount(struct inode *ipbmap, int mounterror)
/* free the memory for the in-memory bmap. */
kfree(bmp);
+ JFS_SBI(ipbmap->i_sb)->bmap = NULL;
return (0);
}
diff --git a/fs/jfs/jfs_extent.c b/fs/jfs/jfs_extent.c
index ae99a7e232ee..63d21822d309 100644
--- a/fs/jfs/jfs_extent.c
+++ b/fs/jfs/jfs_extent.c
@@ -166,7 +166,7 @@ extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, bool abnr)
/*
* COMMIT_SyncList flags an anonymous tlock on page that is on
* sync list.
- * We need to commit the inode to get the page written disk.
+ * We need to commit the inode to get the page written to the disk.
*/
if (test_and_clear_cflag(COMMIT_Synclist,ip))
jfs_commit_inode(ip, 0);
@@ -311,6 +311,11 @@ extBalloc(struct inode *ip, s64 hint, s64 * nblocks, s64 * blkno)
* blocks in the map. in that case, we'll start off with the
* maximum free.
*/
+
+ /* give up if no space left */
+ if (bmp->db_maxfreebud == -1)
+ return -ENOSPC;
+
max = (s64) 1 << bmp->db_maxfreebud;
if (*nblocks >= max && *nblocks > nbperpage)
nb = nblks = (max > nbperpage) ? max : nbperpage;
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index 390cbfce391f..923a58422c46 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -193,6 +193,7 @@ int diUnmount(struct inode *ipimap, int mounterror)
* free in-memory control structure
*/
kfree(imap);
+ JFS_IP(ipimap)->i_imap = NULL;
return (0);
}
@@ -3064,8 +3065,8 @@ static int copy_from_dinode(struct dinode * dip, struct inode *ip)
ip->i_atime.tv_nsec = le32_to_cpu(dip->di_atime.tv_nsec);
ip->i_mtime.tv_sec = le32_to_cpu(dip->di_mtime.tv_sec);
ip->i_mtime.tv_nsec = le32_to_cpu(dip->di_mtime.tv_nsec);
- ip->i_ctime.tv_sec = le32_to_cpu(dip->di_ctime.tv_sec);
- ip->i_ctime.tv_nsec = le32_to_cpu(dip->di_ctime.tv_nsec);
+ inode_set_ctime(ip, le32_to_cpu(dip->di_ctime.tv_sec),
+ le32_to_cpu(dip->di_ctime.tv_nsec));
ip->i_blocks = LBLK2PBLK(ip->i_sb, le64_to_cpu(dip->di_nblocks));
ip->i_generation = le32_to_cpu(dip->di_gen);
@@ -3139,8 +3140,8 @@ static void copy_to_dinode(struct dinode * dip, struct inode *ip)
dip->di_atime.tv_sec = cpu_to_le32(ip->i_atime.tv_sec);
dip->di_atime.tv_nsec = cpu_to_le32(ip->i_atime.tv_nsec);
- dip->di_ctime.tv_sec = cpu_to_le32(ip->i_ctime.tv_sec);
- dip->di_ctime.tv_nsec = cpu_to_le32(ip->i_ctime.tv_nsec);
+ dip->di_ctime.tv_sec = cpu_to_le32(inode_get_ctime(ip).tv_sec);
+ dip->di_ctime.tv_nsec = cpu_to_le32(inode_get_ctime(ip).tv_nsec);
dip->di_mtime.tv_sec = cpu_to_le32(ip->i_mtime.tv_sec);
dip->di_mtime.tv_nsec = cpu_to_le32(ip->i_mtime.tv_nsec);
dip->di_ixpxd = jfs_ip->ixpxd; /* in-memory pxd's are little-endian */
diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c
index 9e1f02767201..87594efa7f7c 100644
--- a/fs/jfs/jfs_inode.c
+++ b/fs/jfs/jfs_inode.c
@@ -97,8 +97,8 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
jfs_inode->mode2 |= inode->i_mode;
inode->i_blocks = 0;
- inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
- jfs_inode->otime = inode->i_ctime.tv_sec;
+ inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
+ jfs_inode->otime = inode_get_ctime(inode).tv_sec;
inode->i_generation = JFS_SBI(sb)->gengen++;
jfs_inode->cflag = 0;
diff --git a/fs/jfs/jfs_unicode.h b/fs/jfs/jfs_unicode.h
index 9db62d047daa..b6a78d4aef1b 100644
--- a/fs/jfs/jfs_unicode.h
+++ b/fs/jfs/jfs_unicode.h
@@ -8,16 +8,9 @@
#include <linux/slab.h>
#include <asm/byteorder.h>
+#include "../nls/nls_ucs2_data.h"
#include "jfs_types.h"
-typedef struct {
- wchar_t start;
- wchar_t end;
- signed char *table;
-} UNICASERANGE;
-
-extern signed char UniUpperTable[512];
-extern UNICASERANGE UniUpperRange[];
extern int get_UCSname(struct component_name *, struct dentry *);
extern int jfs_strfromUCS_le(char *, const __le16 *, int, struct nls_table *);
@@ -107,12 +100,12 @@ static inline wchar_t *UniStrncpy_from_le(wchar_t * ucs1, const __le16 * ucs2,
*/
static inline wchar_t UniToupper(wchar_t uc)
{
- UNICASERANGE *rp;
+ const struct UniCaseRange *rp;
- if (uc < sizeof(UniUpperTable)) { /* Latin characters */
- return uc + UniUpperTable[uc]; /* Use base tables */
+ if (uc < sizeof(NlsUniUpperTable)) { /* Latin characters */
+ return uc + NlsUniUpperTable[uc]; /* Use base tables */
} else {
- rp = UniUpperRange; /* Use range tables */
+ rp = NlsUniUpperRange; /* Use range tables */
while (rp->start) {
if (uc < rp->start) /* Before start of range */
return uc; /* Uppercase = input */
diff --git a/fs/jfs/jfs_uniupr.c b/fs/jfs/jfs_uniupr.c
deleted file mode 100644
index d0b18c7befb8..000000000000
--- a/fs/jfs/jfs_uniupr.c
+++ /dev/null
@@ -1,121 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Copyright (C) International Business Machines Corp., 2000-2002
- */
-
-#include <linux/fs.h>
-#include "jfs_unicode.h"
-
-/*
- * Latin upper case
- */
-signed char UniUpperTable[512] = {
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 000-00f */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 010-01f */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 020-02f */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 030-03f */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 040-04f */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 050-05f */
- 0,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32, /* 060-06f */
- -32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32, 0, 0, 0, 0, 0, /* 070-07f */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 080-08f */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 090-09f */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0a0-0af */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0b0-0bf */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0c0-0cf */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0d0-0df */
- -32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32, /* 0e0-0ef */
- -32,-32,-32,-32,-32,-32,-32, 0,-32,-32,-32,-32,-32,-32,-32,121, /* 0f0-0ff */
- 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 100-10f */
- 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 110-11f */
- 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 120-12f */
- 0, 0, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0, /* 130-13f */
- -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, /* 140-14f */
- 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 150-15f */
- 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 160-16f */
- 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0, /* 170-17f */
- 0, 0, 0, -1, 0, -1, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0, /* 180-18f */
- 0, 0, -1, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, /* 190-19f */
- 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, 0, 0, 0, -1, 0, 0, /* 1a0-1af */
- -1, 0, 0, 0, -1, 0, -1, 0, 0, -1, 0, 0, 0, -1, 0, 0, /* 1b0-1bf */
- 0, 0, 0, 0, 0, -1, -2, 0, -1, -2, 0, -1, -2, 0, -1, 0, /* 1c0-1cf */
- -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,-79, 0, -1, /* 1d0-1df */
- 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e0-1ef */
- 0, 0, -1, -2, 0, -1, 0, 0, 0, -1, 0, -1, 0, -1, 0, -1, /* 1f0-1ff */
-};
-
-/* Upper case range - Greek */
-static signed char UniCaseRangeU03a0[47] = {
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-38,-37,-37,-37, /* 3a0-3af */
- 0,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32, /* 3b0-3bf */
- -32,-32,-31,-32,-32,-32,-32,-32,-32,-32,-32,-32,-64,-63,-63,
-};
-
-/* Upper case range - Cyrillic */
-static signed char UniCaseRangeU0430[48] = {
- -32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32, /* 430-43f */
- -32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32, /* 440-44f */
- 0,-80,-80,-80,-80,-80,-80,-80,-80,-80,-80,-80,-80, 0,-80,-80, /* 450-45f */
-};
-
-/* Upper case range - Extended cyrillic */
-static signed char UniCaseRangeU0490[61] = {
- 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 490-49f */
- 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 4a0-4af */
- 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 4b0-4bf */
- 0, 0, -1, 0, -1, 0, 0, 0, -1, 0, 0, 0, -1,
-};
-
-/* Upper case range - Extended latin and greek */
-static signed char UniCaseRangeU1e00[509] = {
- 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e00-1e0f */
- 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e10-1e1f */
- 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e20-1e2f */
- 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e30-1e3f */
- 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e40-1e4f */
- 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e50-1e5f */
- 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e60-1e6f */
- 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e70-1e7f */
- 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e80-1e8f */
- 0, -1, 0, -1, 0, -1, 0, 0, 0, 0, 0,-59, 0, -1, 0, -1, /* 1e90-1e9f */
- 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1ea0-1eaf */
- 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1eb0-1ebf */
- 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1ec0-1ecf */
- 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1ed0-1edf */
- 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1ee0-1eef */
- 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, 0, 0, 0, 0, /* 1ef0-1eff */
- 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f00-1f0f */
- 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f10-1f1f */
- 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f20-1f2f */
- 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f30-1f3f */
- 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f40-1f4f */
- 0, 8, 0, 8, 0, 8, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f50-1f5f */
- 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f60-1f6f */
- 74, 74, 86, 86, 86, 86,100,100, 0, 0,112,112,126,126, 0, 0, /* 1f70-1f7f */
- 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f80-1f8f */
- 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f90-1f9f */
- 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1fa0-1faf */
- 8, 8, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1fb0-1fbf */
- 0, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1fc0-1fcf */
- 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1fd0-1fdf */
- 8, 8, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1fe0-1fef */
- 0, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-};
-
-/* Upper case range - Wide latin */
-static signed char UniCaseRangeUff40[27] = {
- 0,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32, /* ff40-ff4f */
- -32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,
-};
-
-/*
- * Upper Case Range
- */
-UNICASERANGE UniUpperRange[] = {
- { 0x03a0, 0x03ce, UniCaseRangeU03a0 },
- { 0x0430, 0x045f, UniCaseRangeU0430 },
- { 0x0490, 0x04cc, UniCaseRangeU0490 },
- { 0x1e00, 0x1ffc, UniCaseRangeU1e00 },
- { 0xff40, 0xff5a, UniCaseRangeUff40 },
- { 0 }
-};
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index e98ddb2b1cf2..57d7a4300210 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -149,7 +149,7 @@ static int jfs_create(struct mnt_idmap *idmap, struct inode *dip,
mark_inode_dirty(ip);
- dip->i_ctime = dip->i_mtime = current_time(dip);
+ dip->i_mtime = inode_set_ctime_current(dip);
mark_inode_dirty(dip);
@@ -284,7 +284,7 @@ static int jfs_mkdir(struct mnt_idmap *idmap, struct inode *dip,
/* update parent directory inode */
inc_nlink(dip); /* for '..' from child directory */
- dip->i_ctime = dip->i_mtime = current_time(dip);
+ dip->i_mtime = inode_set_ctime_current(dip);
mark_inode_dirty(dip);
rc = txCommit(tid, 2, &iplist[0], 0);
@@ -390,7 +390,7 @@ static int jfs_rmdir(struct inode *dip, struct dentry *dentry)
/* update parent directory's link count corresponding
* to ".." entry of the target directory deleted
*/
- dip->i_ctime = dip->i_mtime = current_time(dip);
+ dip->i_mtime = inode_set_ctime_current(dip);
inode_dec_link_count(dip);
/*
@@ -512,7 +512,7 @@ static int jfs_unlink(struct inode *dip, struct dentry *dentry)
ASSERT(ip->i_nlink);
- ip->i_ctime = dip->i_ctime = dip->i_mtime = current_time(ip);
+ dip->i_mtime = inode_set_ctime_to_ts(dip, inode_set_ctime_current(ip));
mark_inode_dirty(dip);
/* update target's inode */
@@ -827,8 +827,8 @@ static int jfs_link(struct dentry *old_dentry,
/* update object inode */
inc_nlink(ip); /* for new link */
- ip->i_ctime = current_time(ip);
- dir->i_ctime = dir->i_mtime = current_time(dir);
+ inode_set_ctime_current(ip);
+ dir->i_mtime = inode_set_ctime_current(dir);
mark_inode_dirty(dir);
ihold(ip);
@@ -883,7 +883,7 @@ static int jfs_symlink(struct mnt_idmap *idmap, struct inode *dip,
struct component_name dname;
u32 ssize; /* source pathname size */
struct btstack btstack;
- struct inode *ip = d_inode(dentry);
+ struct inode *ip;
s64 xlen = 0;
int bmask = 0, xsize;
s64 xaddr;
@@ -1028,7 +1028,7 @@ static int jfs_symlink(struct mnt_idmap *idmap, struct inode *dip,
mark_inode_dirty(ip);
- dip->i_ctime = dip->i_mtime = current_time(dip);
+ dip->i_mtime = inode_set_ctime_current(dip);
mark_inode_dirty(dip);
/*
* commit update of parent directory and link object
@@ -1205,7 +1205,7 @@ static int jfs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
tblk->xflag |= COMMIT_DELETE;
tblk->u.ip = new_ip;
} else {
- new_ip->i_ctime = current_time(new_ip);
+ inode_set_ctime_current(new_ip);
mark_inode_dirty(new_ip);
}
} else {
@@ -1268,10 +1268,10 @@ static int jfs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
/*
* Update ctime on changed/moved inodes & mark dirty
*/
- old_ip->i_ctime = current_time(old_ip);
+ inode_set_ctime_current(old_ip);
mark_inode_dirty(old_ip);
- new_dir->i_ctime = new_dir->i_mtime = current_time(new_dir);
+ new_dir->i_mtime = inode_set_ctime_current(new_dir);
mark_inode_dirty(new_dir);
/* Build list of inodes modified by this transaction */
@@ -1283,7 +1283,7 @@ static int jfs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
if (old_dir != new_dir) {
iplist[ipcount++] = new_dir;
- old_dir->i_ctime = old_dir->i_mtime = current_time(old_dir);
+ old_dir->i_mtime = inode_set_ctime_current(old_dir);
mark_inode_dirty(old_dir);
}
@@ -1416,7 +1416,7 @@ static int jfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
mark_inode_dirty(ip);
- dir->i_ctime = dir->i_mtime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
mark_inode_dirty(dir);
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index d2f82cb7db1b..2e2f7f6d36a0 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -818,7 +818,7 @@ out:
}
if (inode->i_size < off+len-towrite)
i_size_write(inode, off+len-towrite);
- inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
mark_inode_dirty(inode);
inode_unlock(inode);
return len - towrite;
diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c
index 931e50018f88..8577ad494e05 100644
--- a/fs/jfs/xattr.c
+++ b/fs/jfs/xattr.c
@@ -647,7 +647,7 @@ static int ea_put(tid_t tid, struct inode *inode, struct ea_buffer *ea_buf,
if (old_blocks)
dquot_free_block(inode, old_blocks);
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
return 0;
}
diff --git a/fs/kernel_read_file.c b/fs/kernel_read_file.c
index 5d826274570c..c429c42a6867 100644
--- a/fs/kernel_read_file.c
+++ b/fs/kernel_read_file.c
@@ -8,16 +8,16 @@
/**
* kernel_read_file() - read file contents into a kernel buffer
*
- * @file file to read from
- * @offset where to start reading from (see below).
- * @buf pointer to a "void *" buffer for reading into (if
+ * @file: file to read from
+ * @offset: where to start reading from (see below).
+ * @buf: pointer to a "void *" buffer for reading into (if
* *@buf is NULL, a buffer will be allocated, and
* @buf_size will be ignored)
- * @buf_size size of buf, if already allocated. If @buf not
+ * @buf_size: size of buf, if already allocated. If @buf not
* allocated, this is the largest size to allocate.
- * @file_size if non-NULL, the full size of @file will be
+ * @file_size: if non-NULL, the full size of @file will be
* written here.
- * @id the kernel_read_file_id identifying the type of
+ * @id: the kernel_read_file_id identifying the type of
* file contents being read (for LSMs to examine)
*
* @offset must be 0 unless both @buf and @file_size are non-NULL
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index 5a1a4af9d3d2..8b2bd65d70e7 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -383,9 +383,11 @@ static int kernfs_link_sibling(struct kernfs_node *kn)
rb_insert_color(&kn->rb, &kn->parent->dir.children);
/* successfully added, account subdir number */
+ down_write(&kernfs_root(kn)->kernfs_iattr_rwsem);
if (kernfs_type(kn) == KERNFS_DIR)
kn->parent->dir.subdirs++;
kernfs_inc_rev(kn->parent);
+ up_write(&kernfs_root(kn)->kernfs_iattr_rwsem);
return 0;
}
@@ -408,9 +410,11 @@ static bool kernfs_unlink_sibling(struct kernfs_node *kn)
if (RB_EMPTY_NODE(&kn->rb))
return false;
+ down_write(&kernfs_root(kn)->kernfs_iattr_rwsem);
if (kernfs_type(kn) == KERNFS_DIR)
kn->parent->dir.subdirs--;
kernfs_inc_rev(kn->parent);
+ up_write(&kernfs_root(kn)->kernfs_iattr_rwsem);
rb_erase(&kn->rb, &kn->parent->dir.children);
RB_CLEAR_NODE(&kn->rb);
@@ -556,7 +560,7 @@ void kernfs_put(struct kernfs_node *kn)
kfree_const(kn->name);
if (kn->iattr) {
- simple_xattrs_free(&kn->iattr->xattrs);
+ simple_xattrs_free(&kn->iattr->xattrs, NULL);
kmem_cache_free(kernfs_iattrs_cache, kn->iattr);
}
spin_lock(&kernfs_idr_lock);
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
index b22b74d1a115..922719a343a7 100644
--- a/fs/kernfs/inode.c
+++ b/fs/kernfs/inode.c
@@ -151,8 +151,7 @@ ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size)
static inline void set_default_inode_attr(struct inode *inode, umode_t mode)
{
inode->i_mode = mode;
- inode->i_atime = inode->i_mtime =
- inode->i_ctime = current_time(inode);
+ inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
}
static inline void set_inode_attr(struct inode *inode,
@@ -162,7 +161,7 @@ static inline void set_inode_attr(struct inode *inode,
inode->i_gid = attrs->ia_gid;
inode->i_atime = attrs->ia_atime;
inode->i_mtime = attrs->ia_mtime;
- inode->i_ctime = attrs->ia_ctime;
+ inode_set_ctime_to_ts(inode, attrs->ia_ctime);
}
static void kernfs_refresh_inode(struct kernfs_node *kn, struct inode *inode)
@@ -191,7 +190,7 @@ int kernfs_iop_getattr(struct mnt_idmap *idmap,
down_read(&root->kernfs_iattr_rwsem);
kernfs_refresh_inode(kn, inode);
- generic_fillattr(&nop_mnt_idmap, inode, stat);
+ generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
up_read(&root->kernfs_iattr_rwsem);
return 0;
@@ -306,11 +305,17 @@ int kernfs_xattr_get(struct kernfs_node *kn, const char *name,
int kernfs_xattr_set(struct kernfs_node *kn, const char *name,
const void *value, size_t size, int flags)
{
+ struct simple_xattr *old_xattr;
struct kernfs_iattrs *attrs = kernfs_iattrs(kn);
if (!attrs)
return -ENOMEM;
- return simple_xattr_set(&attrs->xattrs, name, value, size, flags, NULL);
+ old_xattr = simple_xattr_set(&attrs->xattrs, name, value, size, flags);
+ if (IS_ERR(old_xattr))
+ return PTR_ERR(old_xattr);
+
+ simple_xattr_free(old_xattr);
+ return 0;
}
static int kernfs_vfs_xattr_get(const struct xattr_handler *handler,
@@ -342,7 +347,7 @@ static int kernfs_vfs_user_xattr_add(struct kernfs_node *kn,
{
atomic_t *sz = &kn->iattr->user_xattr_size;
atomic_t *nr = &kn->iattr->nr_user_xattrs;
- ssize_t removed_size;
+ struct simple_xattr *old_xattr;
int ret;
if (atomic_inc_return(nr) > KERNFS_MAX_USER_XATTRS) {
@@ -355,13 +360,18 @@ static int kernfs_vfs_user_xattr_add(struct kernfs_node *kn,
goto dec_size_out;
}
- ret = simple_xattr_set(xattrs, full_name, value, size, flags,
- &removed_size);
-
- if (!ret && removed_size >= 0)
- size = removed_size;
- else if (!ret)
+ old_xattr = simple_xattr_set(xattrs, full_name, value, size, flags);
+ if (!old_xattr)
return 0;
+
+ if (IS_ERR(old_xattr)) {
+ ret = PTR_ERR(old_xattr);
+ goto dec_size_out;
+ }
+
+ ret = 0;
+ size = old_xattr->size;
+ simple_xattr_free(old_xattr);
dec_size_out:
atomic_sub(size, sz);
dec_count_out:
@@ -376,18 +386,19 @@ static int kernfs_vfs_user_xattr_rm(struct kernfs_node *kn,
{
atomic_t *sz = &kn->iattr->user_xattr_size;
atomic_t *nr = &kn->iattr->nr_user_xattrs;
- ssize_t removed_size;
- int ret;
+ struct simple_xattr *old_xattr;
- ret = simple_xattr_set(xattrs, full_name, value, size, flags,
- &removed_size);
+ old_xattr = simple_xattr_set(xattrs, full_name, value, size, flags);
+ if (!old_xattr)
+ return 0;
- if (removed_size >= 0) {
- atomic_sub(removed_size, sz);
- atomic_dec(nr);
- }
+ if (IS_ERR(old_xattr))
+ return PTR_ERR(old_xattr);
- return ret;
+ atomic_sub(old_xattr->size, sz);
+ atomic_dec(nr);
+ simple_xattr_free(old_xattr);
+ return 0;
}
static int kernfs_vfs_user_xattr_set(const struct xattr_handler *handler,
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index d49606accb07..c4bf26142eec 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -16,6 +16,8 @@
#include <linux/namei.h>
#include <linux/seq_file.h>
#include <linux/exportfs.h>
+#include <linux/uuid.h>
+#include <linux/statfs.h>
#include "kernfs-internal.h"
@@ -45,8 +47,15 @@ static int kernfs_sop_show_path(struct seq_file *sf, struct dentry *dentry)
return 0;
}
+static int kernfs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+ simple_statfs(dentry, buf);
+ buf->f_fsid = uuid_to_fsid(dentry->d_sb->s_uuid.b);
+ return 0;
+}
+
const struct super_operations kernfs_sops = {
- .statfs = simple_statfs,
+ .statfs = kernfs_statfs,
.drop_inode = generic_delete_inode,
.evict_inode = kernfs_evict_inode,
@@ -351,6 +360,8 @@ int kernfs_get_tree(struct fs_context *fc)
}
sb->s_flags |= SB_ACTIVE;
+ uuid_gen(&sb->s_uuid);
+
down_write(&root->kernfs_supers_rwsem);
list_add(&info->node, &info->root->supers);
up_write(&root->kernfs_supers_rwsem);
diff --git a/fs/libfs.c b/fs/libfs.c
index 5b851315eeed..37f2d34ee090 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -33,7 +33,7 @@ int simple_getattr(struct mnt_idmap *idmap, const struct path *path,
unsigned int query_flags)
{
struct inode *inode = d_inode(path->dentry);
- generic_fillattr(&nop_mnt_idmap, inode, stat);
+ generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
stat->blocks = inode->i_mapping->nrpages << (PAGE_SHIFT - 9);
return 0;
}
@@ -239,6 +239,254 @@ const struct inode_operations simple_dir_inode_operations = {
};
EXPORT_SYMBOL(simple_dir_inode_operations);
+static void offset_set(struct dentry *dentry, u32 offset)
+{
+ dentry->d_fsdata = (void *)((uintptr_t)(offset));
+}
+
+static u32 dentry2offset(struct dentry *dentry)
+{
+ return (u32)((uintptr_t)(dentry->d_fsdata));
+}
+
+static struct lock_class_key simple_offset_xa_lock;
+
+/**
+ * simple_offset_init - initialize an offset_ctx
+ * @octx: directory offset map to be initialized
+ *
+ */
+void simple_offset_init(struct offset_ctx *octx)
+{
+ xa_init_flags(&octx->xa, XA_FLAGS_ALLOC1);
+ lockdep_set_class(&octx->xa.xa_lock, &simple_offset_xa_lock);
+
+ /* 0 is '.', 1 is '..', so always start with offset 2 */
+ octx->next_offset = 2;
+}
+
+/**
+ * simple_offset_add - Add an entry to a directory's offset map
+ * @octx: directory offset ctx to be updated
+ * @dentry: new dentry being added
+ *
+ * Returns zero on success. @so_ctx and the dentry offset are updated.
+ * Otherwise, a negative errno value is returned.
+ */
+int simple_offset_add(struct offset_ctx *octx, struct dentry *dentry)
+{
+ static const struct xa_limit limit = XA_LIMIT(2, U32_MAX);
+ u32 offset;
+ int ret;
+
+ if (dentry2offset(dentry) != 0)
+ return -EBUSY;
+
+ ret = xa_alloc_cyclic(&octx->xa, &offset, dentry, limit,
+ &octx->next_offset, GFP_KERNEL);
+ if (ret < 0)
+ return ret;
+
+ offset_set(dentry, offset);
+ return 0;
+}
+
+/**
+ * simple_offset_remove - Remove an entry to a directory's offset map
+ * @octx: directory offset ctx to be updated
+ * @dentry: dentry being removed
+ *
+ */
+void simple_offset_remove(struct offset_ctx *octx, struct dentry *dentry)
+{
+ u32 offset;
+
+ offset = dentry2offset(dentry);
+ if (offset == 0)
+ return;
+
+ xa_erase(&octx->xa, offset);
+ offset_set(dentry, 0);
+}
+
+/**
+ * simple_offset_rename_exchange - exchange rename with directory offsets
+ * @old_dir: parent of dentry being moved
+ * @old_dentry: dentry being moved
+ * @new_dir: destination parent
+ * @new_dentry: destination dentry
+ *
+ * Returns zero on success. Otherwise a negative errno is returned and the
+ * rename is rolled back.
+ */
+int simple_offset_rename_exchange(struct inode *old_dir,
+ struct dentry *old_dentry,
+ struct inode *new_dir,
+ struct dentry *new_dentry)
+{
+ struct offset_ctx *old_ctx = old_dir->i_op->get_offset_ctx(old_dir);
+ struct offset_ctx *new_ctx = new_dir->i_op->get_offset_ctx(new_dir);
+ u32 old_index = dentry2offset(old_dentry);
+ u32 new_index = dentry2offset(new_dentry);
+ int ret;
+
+ simple_offset_remove(old_ctx, old_dentry);
+ simple_offset_remove(new_ctx, new_dentry);
+
+ ret = simple_offset_add(new_ctx, old_dentry);
+ if (ret)
+ goto out_restore;
+
+ ret = simple_offset_add(old_ctx, new_dentry);
+ if (ret) {
+ simple_offset_remove(new_ctx, old_dentry);
+ goto out_restore;
+ }
+
+ ret = simple_rename_exchange(old_dir, old_dentry, new_dir, new_dentry);
+ if (ret) {
+ simple_offset_remove(new_ctx, old_dentry);
+ simple_offset_remove(old_ctx, new_dentry);
+ goto out_restore;
+ }
+ return 0;
+
+out_restore:
+ offset_set(old_dentry, old_index);
+ xa_store(&old_ctx->xa, old_index, old_dentry, GFP_KERNEL);
+ offset_set(new_dentry, new_index);
+ xa_store(&new_ctx->xa, new_index, new_dentry, GFP_KERNEL);
+ return ret;
+}
+
+/**
+ * simple_offset_destroy - Release offset map
+ * @octx: directory offset ctx that is about to be destroyed
+ *
+ * During fs teardown (eg. umount), a directory's offset map might still
+ * contain entries. xa_destroy() cleans out anything that remains.
+ */
+void simple_offset_destroy(struct offset_ctx *octx)
+{
+ xa_destroy(&octx->xa);
+}
+
+/**
+ * offset_dir_llseek - Advance the read position of a directory descriptor
+ * @file: an open directory whose position is to be updated
+ * @offset: a byte offset
+ * @whence: enumerator describing the starting position for this update
+ *
+ * SEEK_END, SEEK_DATA, and SEEK_HOLE are not supported for directories.
+ *
+ * Returns the updated read position if successful; otherwise a
+ * negative errno is returned and the read position remains unchanged.
+ */
+static loff_t offset_dir_llseek(struct file *file, loff_t offset, int whence)
+{
+ switch (whence) {
+ case SEEK_CUR:
+ offset += file->f_pos;
+ fallthrough;
+ case SEEK_SET:
+ if (offset >= 0)
+ break;
+ fallthrough;
+ default:
+ return -EINVAL;
+ }
+
+ return vfs_setpos(file, offset, U32_MAX);
+}
+
+static struct dentry *offset_find_next(struct xa_state *xas)
+{
+ struct dentry *child, *found = NULL;
+
+ rcu_read_lock();
+ child = xas_next_entry(xas, U32_MAX);
+ if (!child)
+ goto out;
+ spin_lock(&child->d_lock);
+ if (simple_positive(child))
+ found = dget_dlock(child);
+ spin_unlock(&child->d_lock);
+out:
+ rcu_read_unlock();
+ return found;
+}
+
+static bool offset_dir_emit(struct dir_context *ctx, struct dentry *dentry)
+{
+ u32 offset = dentry2offset(dentry);
+ struct inode *inode = d_inode(dentry);
+
+ return ctx->actor(ctx, dentry->d_name.name, dentry->d_name.len, offset,
+ inode->i_ino, fs_umode_to_dtype(inode->i_mode));
+}
+
+static void offset_iterate_dir(struct inode *inode, struct dir_context *ctx)
+{
+ struct offset_ctx *so_ctx = inode->i_op->get_offset_ctx(inode);
+ XA_STATE(xas, &so_ctx->xa, ctx->pos);
+ struct dentry *dentry;
+
+ while (true) {
+ dentry = offset_find_next(&xas);
+ if (!dentry)
+ break;
+
+ if (!offset_dir_emit(ctx, dentry)) {
+ dput(dentry);
+ break;
+ }
+
+ dput(dentry);
+ ctx->pos = xas.xa_index + 1;
+ }
+}
+
+/**
+ * offset_readdir - Emit entries starting at offset @ctx->pos
+ * @file: an open directory to iterate over
+ * @ctx: directory iteration context
+ *
+ * Caller must hold @file's i_rwsem to prevent insertion or removal of
+ * entries during this call.
+ *
+ * On entry, @ctx->pos contains an offset that represents the first entry
+ * to be read from the directory.
+ *
+ * The operation continues until there are no more entries to read, or
+ * until the ctx->actor indicates there is no more space in the caller's
+ * output buffer.
+ *
+ * On return, @ctx->pos contains an offset that will read the next entry
+ * in this directory when offset_readdir() is called again with @ctx.
+ *
+ * Return values:
+ * %0 - Complete
+ */
+static int offset_readdir(struct file *file, struct dir_context *ctx)
+{
+ struct dentry *dir = file->f_path.dentry;
+
+ lockdep_assert_held(&d_inode(dir)->i_rwsem);
+
+ if (!dir_emit_dots(file, ctx))
+ return 0;
+
+ offset_iterate_dir(d_inode(dir), ctx);
+ return 0;
+}
+
+const struct file_operations simple_offset_dir_operations = {
+ .llseek = offset_dir_llseek,
+ .iterate_shared = offset_readdir,
+ .read = generic_read_dir,
+ .fsync = noop_fsync,
+};
+
static struct dentry *find_next_child(struct dentry *parent, struct dentry *prev)
{
struct dentry *child = NULL;
@@ -275,7 +523,7 @@ void simple_recursive_removal(struct dentry *dentry,
while ((child = find_next_child(this, victim)) == NULL) {
// kill and ascend
// update metadata while it's still locked
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
clear_nlink(inode);
inode_unlock(inode);
victim = this;
@@ -293,8 +541,7 @@ void simple_recursive_removal(struct dentry *dentry,
dput(victim); // unpin it
}
if (victim == dentry) {
- inode->i_ctime = inode->i_mtime =
- current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
if (d_is_dir(dentry))
drop_nlink(inode);
inode_unlock(inode);
@@ -335,7 +582,7 @@ static int pseudo_fs_fill_super(struct super_block *s, struct fs_context *fc)
*/
root->i_ino = 1;
root->i_mode = S_IFDIR | S_IRUSR | S_IWUSR;
- root->i_atime = root->i_mtime = root->i_ctime = current_time(root);
+ root->i_atime = root->i_mtime = inode_set_ctime_current(root);
s->s_root = d_make_root(root);
if (!s->s_root)
return -ENOMEM;
@@ -391,7 +638,8 @@ int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *den
{
struct inode *inode = d_inode(old_dentry);
- inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
+ dir->i_mtime = inode_set_ctime_to_ts(dir,
+ inode_set_ctime_current(inode));
inc_nlink(inode);
ihold(inode);
dget(dentry);
@@ -425,7 +673,8 @@ int simple_unlink(struct inode *dir, struct dentry *dentry)
{
struct inode *inode = d_inode(dentry);
- inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
+ dir->i_mtime = inode_set_ctime_to_ts(dir,
+ inode_set_ctime_current(inode));
drop_nlink(inode);
dput(dentry);
return 0;
@@ -444,6 +693,31 @@ int simple_rmdir(struct inode *dir, struct dentry *dentry)
}
EXPORT_SYMBOL(simple_rmdir);
+/**
+ * simple_rename_timestamp - update the various inode timestamps for rename
+ * @old_dir: old parent directory
+ * @old_dentry: dentry that is being renamed
+ * @new_dir: new parent directory
+ * @new_dentry: target for rename
+ *
+ * POSIX mandates that the old and new parent directories have their ctime and
+ * mtime updated, and that inodes of @old_dentry and @new_dentry (if any), have
+ * their ctime updated.
+ */
+void simple_rename_timestamp(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry)
+{
+ struct inode *newino = d_inode(new_dentry);
+
+ old_dir->i_mtime = inode_set_ctime_current(old_dir);
+ if (new_dir != old_dir)
+ new_dir->i_mtime = inode_set_ctime_current(new_dir);
+ inode_set_ctime_current(d_inode(old_dentry));
+ if (newino)
+ inode_set_ctime_current(newino);
+}
+EXPORT_SYMBOL_GPL(simple_rename_timestamp);
+
int simple_rename_exchange(struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry)
{
@@ -459,11 +733,7 @@ int simple_rename_exchange(struct inode *old_dir, struct dentry *old_dentry,
inc_nlink(old_dir);
}
}
- old_dir->i_ctime = old_dir->i_mtime =
- new_dir->i_ctime = new_dir->i_mtime =
- d_inode(old_dentry)->i_ctime =
- d_inode(new_dentry)->i_ctime = current_time(old_dir);
-
+ simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
return 0;
}
EXPORT_SYMBOL_GPL(simple_rename_exchange);
@@ -472,7 +742,6 @@ int simple_rename(struct mnt_idmap *idmap, struct inode *old_dir,
struct dentry *old_dentry, struct inode *new_dir,
struct dentry *new_dentry, unsigned int flags)
{
- struct inode *inode = d_inode(old_dentry);
int they_are_dirs = d_is_dir(old_dentry);
if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
@@ -495,9 +764,7 @@ int simple_rename(struct mnt_idmap *idmap, struct inode *old_dir,
inc_nlink(new_dir);
}
- old_dir->i_ctime = old_dir->i_mtime = new_dir->i_ctime =
- new_dir->i_mtime = inode->i_ctime = current_time(old_dir);
-
+ simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
return 0;
}
EXPORT_SYMBOL(simple_rename);
@@ -548,21 +815,20 @@ int simple_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len,
struct page **pagep, void **fsdata)
{
- struct page *page;
- pgoff_t index;
-
- index = pos >> PAGE_SHIFT;
+ struct folio *folio;
- page = grab_cache_page_write_begin(mapping, index);
- if (!page)
- return -ENOMEM;
+ folio = __filemap_get_folio(mapping, pos / PAGE_SIZE, FGP_WRITEBEGIN,
+ mapping_gfp_mask(mapping));
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
- *pagep = page;
+ *pagep = &folio->page;
- if (!PageUptodate(page) && (len != PAGE_SIZE)) {
- unsigned from = pos & (PAGE_SIZE - 1);
+ if (!folio_test_uptodate(folio) && (len != folio_size(folio))) {
+ size_t from = offset_in_folio(folio, pos);
- zero_user_segments(page, 0, from, from + len, PAGE_SIZE);
+ folio_zero_segments(folio, 0, from,
+ from + len, folio_size(folio));
}
return 0;
}
@@ -594,17 +860,18 @@ static int simple_write_end(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata)
{
- struct inode *inode = page->mapping->host;
+ struct folio *folio = page_folio(page);
+ struct inode *inode = folio->mapping->host;
loff_t last_pos = pos + copied;
- /* zero the stale part of the page if we did a short copy */
- if (!PageUptodate(page)) {
+ /* zero the stale part of the folio if we did a short copy */
+ if (!folio_test_uptodate(folio)) {
if (copied < len) {
- unsigned from = pos & (PAGE_SIZE - 1);
+ size_t from = offset_in_folio(folio, pos);
- zero_user(page, from + copied, len - copied);
+ folio_zero_range(folio, from + copied, len - copied);
}
- SetPageUptodate(page);
+ folio_mark_uptodate(folio);
}
/*
* No need to use i_size_read() here, the i_size
@@ -613,9 +880,9 @@ static int simple_write_end(struct file *file, struct address_space *mapping,
if (last_pos > inode->i_size)
i_size_write(inode, last_pos);
- set_page_dirty(page);
- unlock_page(page);
- put_page(page);
+ folio_mark_dirty(folio);
+ folio_unlock(folio);
+ folio_put(folio);
return copied;
}
@@ -659,7 +926,7 @@ int simple_fill_super(struct super_block *s, unsigned long magic,
*/
inode->i_ino = 1;
inode->i_mode = S_IFDIR | 0755;
- inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
inode->i_op = &simple_dir_inode_operations;
inode->i_fop = &simple_dir_operations;
set_nlink(inode, 2);
@@ -685,7 +952,7 @@ int simple_fill_super(struct super_block *s, unsigned long magic,
goto out;
}
inode->i_mode = S_IFREG | files->mode;
- inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
inode->i_fop = files->ops;
inode->i_ino = i;
d_add(dentry, inode);
@@ -1253,7 +1520,7 @@ struct inode *alloc_anon_inode(struct super_block *s)
inode->i_uid = current_fsuid();
inode->i_gid = current_fsgid();
inode->i_flags |= S_PRIVATE;
- inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
return inode;
}
EXPORT_SYMBOL(alloc_anon_inode);
@@ -1269,7 +1536,7 @@ EXPORT_SYMBOL(alloc_anon_inode);
* All arguments are ignored and it just returns -EINVAL.
*/
int
-simple_nosetlease(struct file *filp, long arg, struct file_lock **flp,
+simple_nosetlease(struct file *filp, int arg, struct file_lock **flp,
void **priv)
{
return -EINVAL;
@@ -1315,7 +1582,7 @@ static int empty_dir_getattr(struct mnt_idmap *idmap,
u32 request_mask, unsigned int query_flags)
{
struct inode *inode = d_inode(path->dentry);
- generic_fillattr(&nop_mnt_idmap, inode, stat);
+ generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
return 0;
}
@@ -1381,16 +1648,6 @@ bool is_empty_dir_inode(struct inode *inode)
}
#if IS_ENABLED(CONFIG_UNICODE)
-/*
- * Determine if the name of a dentry should be casefolded.
- *
- * Return: if names will need casefolding
- */
-static bool needs_casefold(const struct inode *dir)
-{
- return IS_CASEFOLDED(dir) && dir->i_sb->s_encoding;
-}
-
/**
* generic_ci_d_compare - generic d_compare implementation for casefolding filesystems
* @dentry: dentry whose name we are checking against
@@ -1411,7 +1668,7 @@ static int generic_ci_d_compare(const struct dentry *dentry, unsigned int len,
char strbuf[DNAME_INLINE_LEN];
int ret;
- if (!dir || !needs_casefold(dir))
+ if (!dir || !IS_CASEFOLDED(dir))
goto fallback;
/*
* If the dentry name is stored in-line, then it may be concurrently
@@ -1453,7 +1710,7 @@ static int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str)
const struct unicode_map *um = sb->s_encoding;
int ret = 0;
- if (!dir || !needs_casefold(dir))
+ if (!dir || !IS_CASEFOLDED(dir))
return 0;
ret = utf8_casefold_hash(um, dentry, str);
@@ -1646,6 +1903,7 @@ ssize_t direct_write_fallback(struct kiocb *iocb, struct iov_iter *iter,
* We don't know how much we wrote, so just return the number of
* bytes which were direct-written
*/
+ iocb->ki_pos -= buffered_written;
if (direct_written)
return direct_written;
return err;
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 1d9488cf0534..87a0f207df0b 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -276,6 +276,9 @@ static struct nsm_handle *nsm_create_handle(const struct sockaddr *sap,
{
struct nsm_handle *new;
+ if (!hostname)
+ return NULL;
+
new = kzalloc(sizeof(*new) + hostname_len + 1, GFP_KERNEL);
if (unlikely(new == NULL))
return NULL;
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 22d3ff3818f5..6579948070a4 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -45,7 +45,6 @@
#define NLMDBG_FACILITY NLMDBG_SVC
#define LOCKD_BUFSIZE (1024 + NLMSVC_XDRSIZE)
-#define ALLOWED_SIGS (sigmask(SIGKILL))
static struct svc_program nlmsvc_program;
@@ -57,6 +56,12 @@ static unsigned int nlmsvc_users;
static struct svc_serv *nlmsvc_serv;
unsigned long nlmsvc_timeout;
+static void nlmsvc_request_retry(struct timer_list *tl)
+{
+ svc_wake_up(nlmsvc_serv);
+}
+DEFINE_TIMER(nlmsvc_retry, nlmsvc_request_retry);
+
unsigned int lockd_net_id;
/*
@@ -111,26 +116,12 @@ static void set_grace_period(struct net *net)
schedule_delayed_work(&ln->grace_period_end, grace_period);
}
-static void restart_grace(void)
-{
- if (nlmsvc_ops) {
- struct net *net = &init_net;
- struct lockd_net *ln = net_generic(net, lockd_net_id);
-
- cancel_delayed_work_sync(&ln->grace_period_end);
- locks_end_grace(&ln->lockd_manager);
- nlmsvc_invalidate_all();
- set_grace_period(net);
- }
-}
-
/*
* This is the lockd kernel thread
*/
static int
lockd(void *vrqstp)
{
- int err = 0;
struct svc_rqst *rqstp = vrqstp;
struct net *net = &init_net;
struct lockd_net *ln = net_generic(net, lockd_net_id);
@@ -138,9 +129,6 @@ lockd(void *vrqstp)
/* try_to_freeze() is called from svc_recv() */
set_freezable();
- /* Allow SIGKILL to tell lockd to drop all of its locks */
- allow_signal(SIGKILL);
-
dprintk("NFS locking service started (ver " LOCKD_VERSION ").\n");
/*
@@ -148,33 +136,12 @@ lockd(void *vrqstp)
* NFS mount or NFS daemon has gone away.
*/
while (!kthread_should_stop()) {
- long timeout = MAX_SCHEDULE_TIMEOUT;
- RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]);
-
/* update sv_maxconn if it has changed */
rqstp->rq_server->sv_maxconn = nlm_max_connections;
- if (signalled()) {
- flush_signals(current);
- restart_grace();
- continue;
- }
-
- timeout = nlmsvc_retry_blocked();
-
- /*
- * Find a socket with data available and call its
- * recvfrom routine.
- */
- err = svc_recv(rqstp, timeout);
- if (err == -EAGAIN || err == -EINTR)
- continue;
- dprintk("lockd: request from %s\n",
- svc_print_addr(rqstp, buf, sizeof(buf)));
-
- svc_process(rqstp);
+ nlmsvc_retry_blocked();
+ svc_recv(rqstp);
}
- flush_signals(current);
if (nlmsvc_ops)
nlmsvc_invalidate_all();
nlm_shutdown_hosts();
@@ -407,6 +374,7 @@ static void lockd_put(void)
#endif
svc_set_num_threads(nlmsvc_serv, NULL, 0);
+ timer_delete_sync(&nlmsvc_retry);
nlmsvc_serv = NULL;
dprintk("lockd_down: service destroyed\n");
}
@@ -538,7 +506,7 @@ static inline int is_callback(u32 proc)
}
-static int lockd_authenticate(struct svc_rqst *rqstp)
+static enum svc_auth_status lockd_authenticate(struct svc_rqst *rqstp)
{
rqstp->rq_client = NULL;
switch (rqstp->rq_authop->flavour) {
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index c43ccdf28ed9..43aeba9de55c 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -131,12 +131,14 @@ static void nlmsvc_insert_block(struct nlm_block *block, unsigned long when)
static inline void
nlmsvc_remove_block(struct nlm_block *block)
{
+ spin_lock(&nlm_blocked_lock);
if (!list_empty(&block->b_list)) {
- spin_lock(&nlm_blocked_lock);
list_del_init(&block->b_list);
spin_unlock(&nlm_blocked_lock);
nlmsvc_release_block(block);
+ return;
}
+ spin_unlock(&nlm_blocked_lock);
}
/*
@@ -152,6 +154,7 @@ nlmsvc_lookup_block(struct nlm_file *file, struct nlm_lock *lock)
file, lock->fl.fl_pid,
(long long)lock->fl.fl_start,
(long long)lock->fl.fl_end, lock->fl.fl_type);
+ spin_lock(&nlm_blocked_lock);
list_for_each_entry(block, &nlm_blocked, b_list) {
fl = &block->b_call->a_args.lock.fl;
dprintk("lockd: check f=%p pd=%d %Ld-%Ld ty=%d cookie=%s\n",
@@ -161,9 +164,11 @@ nlmsvc_lookup_block(struct nlm_file *file, struct nlm_lock *lock)
nlmdbg_cookie2a(&block->b_call->a_args.cookie));
if (block->b_file == file && nlm_compare_locks(fl, &lock->fl)) {
kref_get(&block->b_count);
+ spin_unlock(&nlm_blocked_lock);
return block;
}
}
+ spin_unlock(&nlm_blocked_lock);
return NULL;
}
@@ -185,16 +190,19 @@ nlmsvc_find_block(struct nlm_cookie *cookie)
{
struct nlm_block *block;
+ spin_lock(&nlm_blocked_lock);
list_for_each_entry(block, &nlm_blocked, b_list) {
if (nlm_cookie_match(&block->b_call->a_args.cookie,cookie))
goto found;
}
+ spin_unlock(&nlm_blocked_lock);
return NULL;
found:
dprintk("nlmsvc_find_block(%s): block=%p\n", nlmdbg_cookie2a(cookie), block);
kref_get(&block->b_count);
+ spin_unlock(&nlm_blocked_lock);
return block;
}
@@ -317,6 +325,7 @@ void nlmsvc_traverse_blocks(struct nlm_host *host,
restart:
mutex_lock(&file->f_mutex);
+ spin_lock(&nlm_blocked_lock);
list_for_each_entry_safe(block, next, &file->f_blocks, b_flist) {
if (!match(block->b_host, host))
continue;
@@ -325,11 +334,13 @@ restart:
if (list_empty(&block->b_list))
continue;
kref_get(&block->b_count);
+ spin_unlock(&nlm_blocked_lock);
mutex_unlock(&file->f_mutex);
nlmsvc_unlink_block(block);
nlmsvc_release_block(block);
goto restart;
}
+ spin_unlock(&nlm_blocked_lock);
mutex_unlock(&file->f_mutex);
}
@@ -1008,7 +1019,7 @@ retry_deferred_block(struct nlm_block *block)
* picks up locks that can be granted, or grant notifications that must
* be retransmitted.
*/
-unsigned long
+void
nlmsvc_retry_blocked(void)
{
unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
@@ -1038,5 +1049,6 @@ nlmsvc_retry_blocked(void)
}
spin_unlock(&nlm_blocked_lock);
- return timeout;
+ if (timeout < MAX_SCHEDULE_TIMEOUT)
+ mod_timer(&nlmsvc_retry, jiffies + timeout);
}
diff --git a/fs/locks.c b/fs/locks.c
index df8b26a42524..76ad05f8070a 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -438,7 +438,7 @@ static void flock_make_lock(struct file *filp, struct file_lock *fl, int type)
fl->fl_end = OFFSET_MAX;
}
-static int assign_type(struct file_lock *fl, long type)
+static int assign_type(struct file_lock *fl, int type)
{
switch (type) {
case F_RDLCK:
@@ -549,7 +549,7 @@ static const struct lock_manager_operations lease_manager_ops = {
/*
* Initialize a lease, use the default lock manager operations
*/
-static int lease_init(struct file *filp, long type, struct file_lock *fl)
+static int lease_init(struct file *filp, int type, struct file_lock *fl)
{
if (assign_type(fl, type) != 0)
return -EINVAL;
@@ -567,7 +567,7 @@ static int lease_init(struct file *filp, long type, struct file_lock *fl)
}
/* Allocate a file_lock initialised to this type of lease */
-static struct file_lock *lease_alloc(struct file *filp, long type)
+static struct file_lock *lease_alloc(struct file *filp, int type)
{
struct file_lock *fl = locks_alloc_lock();
int error = -ENOMEM;
@@ -868,6 +868,21 @@ static bool posix_locks_conflict(struct file_lock *caller_fl,
return locks_conflict(caller_fl, sys_fl);
}
+/* Determine if lock sys_fl blocks lock caller_fl. Used on xx_GETLK
+ * path so checks for additional GETLK-specific things like F_UNLCK.
+ */
+static bool posix_test_locks_conflict(struct file_lock *caller_fl,
+ struct file_lock *sys_fl)
+{
+ /* F_UNLCK checks any locks on the same fd. */
+ if (caller_fl->fl_type == F_UNLCK) {
+ if (!posix_same_owner(caller_fl, sys_fl))
+ return false;
+ return locks_overlap(caller_fl, sys_fl);
+ }
+ return posix_locks_conflict(caller_fl, sys_fl);
+}
+
/* Determine if lock sys_fl blocks lock caller_fl. FLOCK specific
* checking before calling the locks_conflict().
*/
@@ -901,7 +916,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl)
retry:
spin_lock(&ctx->flc_lock);
list_for_each_entry(cfl, &ctx->flc_posix, fl_list) {
- if (!posix_locks_conflict(fl, cfl))
+ if (!posix_test_locks_conflict(fl, cfl))
continue;
if (cfl->fl_lmops && cfl->fl_lmops->lm_lock_expirable
&& (*cfl->fl_lmops->lm_lock_expirable)(cfl)) {
@@ -1301,6 +1316,7 @@ retry:
out:
spin_unlock(&ctx->flc_lock);
percpu_up_read(&file_rwsem);
+ trace_posix_lock_inode(inode, request, error);
/*
* Free any unused locks.
*/
@@ -1309,7 +1325,6 @@ retry:
if (new_fl2)
locks_free_lock(new_fl2);
locks_dispose_list(&dispose);
- trace_posix_lock_inode(inode, request, error);
return error;
}
@@ -1666,7 +1681,7 @@ int fcntl_getlease(struct file *filp)
* conflict with the lease we're trying to set.
*/
static int
-check_conflicting_open(struct file *filp, const long arg, int flags)
+check_conflicting_open(struct file *filp, const int arg, int flags)
{
struct inode *inode = file_inode(filp);
int self_wcount = 0, self_rcount = 0;
@@ -1701,7 +1716,7 @@ check_conflicting_open(struct file *filp, const long arg, int flags)
}
static int
-generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **priv)
+generic_add_lease(struct file *filp, int arg, struct file_lock **flp, void **priv)
{
struct file_lock *fl, *my_fl = NULL, *lease;
struct inode *inode = file_inode(filp);
@@ -1729,13 +1744,6 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr
if (is_deleg && !inode_trylock(inode))
return -EAGAIN;
- if (is_deleg && arg == F_WRLCK) {
- /* Write delegations are not currently supported: */
- inode_unlock(inode);
- WARN_ON_ONCE(1);
- return -EINVAL;
- }
-
percpu_down_read(&file_rwsem);
spin_lock(&ctx->flc_lock);
time_out_leases(inode, &dispose);
@@ -1859,7 +1867,7 @@ static int generic_delete_lease(struct file *filp, void *owner)
* The (input) flp->fl_lmops->lm_break function is required
* by break_lease().
*/
-int generic_setlease(struct file *filp, long arg, struct file_lock **flp,
+int generic_setlease(struct file *filp, int arg, struct file_lock **flp,
void **priv)
{
struct inode *inode = file_inode(filp);
@@ -1906,7 +1914,7 @@ lease_notifier_chain_init(void)
}
static inline void
-setlease_notifier(long arg, struct file_lock *lease)
+setlease_notifier(int arg, struct file_lock *lease)
{
if (arg != F_UNLCK)
srcu_notifier_call_chain(&lease_notifier_chain, arg, lease);
@@ -1942,7 +1950,7 @@ EXPORT_SYMBOL_GPL(lease_unregister_notifier);
* may be NULL if the lm_setup operation doesn't require it.
*/
int
-vfs_setlease(struct file *filp, long arg, struct file_lock **lease, void **priv)
+vfs_setlease(struct file *filp, int arg, struct file_lock **lease, void **priv)
{
if (lease)
setlease_notifier(arg, *lease);
@@ -1953,7 +1961,7 @@ vfs_setlease(struct file *filp, long arg, struct file_lock **lease, void **priv)
}
EXPORT_SYMBOL_GPL(vfs_setlease);
-static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg)
+static int do_fcntl_add_lease(unsigned int fd, struct file *filp, int arg)
{
struct file_lock *fl;
struct fasync_struct *new;
@@ -1988,7 +1996,7 @@ static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg)
* Note that you also need to call %F_SETSIG to
* receive a signal when the lease is broken.
*/
-int fcntl_setlease(unsigned int fd, struct file *filp, long arg)
+int fcntl_setlease(unsigned int fd, struct file *filp, int arg)
{
if (arg == F_UNLCK)
return vfs_setlease(filp, F_UNLCK, NULL, (void **)&filp);
@@ -2136,7 +2144,7 @@ EXPORT_SYMBOL_GPL(vfs_test_lock);
* @fl: The file_lock who's fl_pid should be translated
* @ns: The namespace into which the pid should be translated
*
- * Used to tranlate a fl_pid into a namespace virtual pid number
+ * Used to translate a fl_pid into a namespace virtual pid number
*/
static pid_t locks_translate_pid(struct file_lock *fl, struct pid_namespace *ns)
{
@@ -2207,7 +2215,8 @@ int fcntl_getlk(struct file *filp, unsigned int cmd, struct flock *flock)
if (fl == NULL)
return -ENOMEM;
error = -EINVAL;
- if (flock->l_type != F_RDLCK && flock->l_type != F_WRLCK)
+ if (cmd != F_OFD_GETLK && flock->l_type != F_RDLCK
+ && flock->l_type != F_WRLCK)
goto out;
error = flock_to_posix_lock(filp, fl, flock);
@@ -2414,7 +2423,8 @@ int fcntl_getlk64(struct file *filp, unsigned int cmd, struct flock64 *flock)
return -ENOMEM;
error = -EINVAL;
- if (flock->l_type != F_RDLCK && flock->l_type != F_WRLCK)
+ if (cmd != F_OFD_GETLK && flock->l_type != F_RDLCK
+ && flock->l_type != F_WRLCK)
goto out;
error = flock64_to_posix_lock(filp, fl, flock);
diff --git a/fs/minix/Kconfig b/fs/minix/Kconfig
index de2003974ff0..90ddfad2a75e 100644
--- a/fs/minix/Kconfig
+++ b/fs/minix/Kconfig
@@ -2,6 +2,7 @@
config MINIX_FS
tristate "Minix file system support"
depends on BLOCK
+ select BUFFER_HEAD
help
Minix is a simple operating system used in many classes about OS's.
The minix file system (method to organize files on a hard disk
diff --git a/fs/minix/bitmap.c b/fs/minix/bitmap.c
index 870207ba23f1..25c08fbfcb9d 100644
--- a/fs/minix/bitmap.c
+++ b/fs/minix/bitmap.c
@@ -251,7 +251,7 @@ struct inode *minix_new_inode(const struct inode *dir, umode_t mode)
}
inode_init_owner(&nop_mnt_idmap, inode, dir, mode);
inode->i_ino = j;
- inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
inode->i_blocks = 0;
memset(&minix_i(inode)->u, 0, sizeof(minix_i(inode)->u));
insert_inode_hash(inode);
diff --git a/fs/minix/dir.c b/fs/minix/dir.c
index bf9858f76b6a..20f23e6e58ad 100644
--- a/fs/minix/dir.c
+++ b/fs/minix/dir.c
@@ -281,7 +281,7 @@ got_it:
de->inode = inode->i_ino;
}
dir_commit_chunk(page, pos, sbi->s_dirsize);
- dir->i_mtime = dir->i_ctime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
mark_inode_dirty(dir);
err = minix_handle_dirsync(dir);
out_put:
@@ -313,7 +313,7 @@ int minix_delete_entry(struct minix_dir_entry *de, struct page *page)
else
de->inode = 0;
dir_commit_chunk(page, pos, len);
- inode->i_ctime = inode->i_mtime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
mark_inode_dirty(inode);
return minix_handle_dirsync(inode);
}
@@ -436,7 +436,7 @@ int minix_set_link(struct minix_dir_entry *de, struct page *page,
else
de->inode = inode->i_ino;
dir_commit_chunk(page, pos, sbi->s_dirsize);
- dir->i_mtime = dir->i_ctime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
mark_inode_dirty(dir);
return minix_handle_dirsync(dir);
}
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index e9fbb5303a22..df575473c1cc 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -501,10 +501,7 @@ static struct inode *V1_minix_iget(struct inode *inode)
i_gid_write(inode, raw_inode->i_gid);
set_nlink(inode, raw_inode->i_nlinks);
inode->i_size = raw_inode->i_size;
- inode->i_mtime.tv_sec = inode->i_atime.tv_sec = inode->i_ctime.tv_sec = raw_inode->i_time;
- inode->i_mtime.tv_nsec = 0;
- inode->i_atime.tv_nsec = 0;
- inode->i_ctime.tv_nsec = 0;
+ inode->i_mtime = inode->i_atime = inode_set_ctime(inode, raw_inode->i_time, 0);
inode->i_blocks = 0;
for (i = 0; i < 9; i++)
minix_inode->u.i1_data[i] = raw_inode->i_zone[i];
@@ -543,10 +540,9 @@ static struct inode *V2_minix_iget(struct inode *inode)
inode->i_size = raw_inode->i_size;
inode->i_mtime.tv_sec = raw_inode->i_mtime;
inode->i_atime.tv_sec = raw_inode->i_atime;
- inode->i_ctime.tv_sec = raw_inode->i_ctime;
+ inode_set_ctime(inode, raw_inode->i_ctime, 0);
inode->i_mtime.tv_nsec = 0;
inode->i_atime.tv_nsec = 0;
- inode->i_ctime.tv_nsec = 0;
inode->i_blocks = 0;
for (i = 0; i < 10; i++)
minix_inode->u.i2_data[i] = raw_inode->i_zone[i];
@@ -622,7 +618,7 @@ static struct buffer_head * V2_minix_update_inode(struct inode * inode)
raw_inode->i_size = inode->i_size;
raw_inode->i_mtime = inode->i_mtime.tv_sec;
raw_inode->i_atime = inode->i_atime.tv_sec;
- raw_inode->i_ctime = inode->i_ctime.tv_sec;
+ raw_inode->i_ctime = inode_get_ctime(inode).tv_sec;
if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
raw_inode->i_zone[0] = old_encode_dev(inode->i_rdev);
else for (i = 0; i < 10; i++)
@@ -660,7 +656,7 @@ int minix_getattr(struct mnt_idmap *idmap, const struct path *path,
struct super_block *sb = path->dentry->d_sb;
struct inode *inode = d_inode(path->dentry);
- generic_fillattr(&nop_mnt_idmap, inode, stat);
+ generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
if (INODE_VERSION(inode) == MINIX_V1)
stat->blocks = (BLOCK_SIZE / 512) * V1_minix_blocks(stat->size, sb);
else
diff --git a/fs/minix/itree_common.c b/fs/minix/itree_common.c
index 446148792f41..ce18ae37c29d 100644
--- a/fs/minix/itree_common.c
+++ b/fs/minix/itree_common.c
@@ -131,7 +131,7 @@ static inline int splice_branch(struct inode *inode,
/* We are done with atomic stuff, now do the rest of housekeeping */
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
/* had we spliced it onto indirect block? */
if (where->bh)
@@ -350,7 +350,7 @@ do_indirects:
}
first_whole++;
}
- inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
mark_inode_dirty(inode);
}
diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index 956d5183828d..114084d5636a 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -98,7 +98,7 @@ static int minix_link(struct dentry * old_dentry, struct inode * dir,
{
struct inode *inode = d_inode(old_dentry);
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
inode_inc_link_count(inode);
ihold(inode);
return add_nondir(dentry, inode);
@@ -154,7 +154,7 @@ static int minix_unlink(struct inode * dir, struct dentry *dentry)
if (err)
return err;
- inode->i_ctime = dir->i_ctime;
+ inode_set_ctime_to_ts(inode, inode_get_ctime(dir));
inode_dec_link_count(inode);
return 0;
}
@@ -218,7 +218,7 @@ static int minix_rename(struct mnt_idmap *idmap,
put_page(new_page);
if (err)
goto out_dir;
- new_inode->i_ctime = current_time(new_inode);
+ inode_set_ctime_current(new_inode);
if (dir_de)
drop_nlink(new_inode);
inode_dec_link_count(new_inode);
diff --git a/fs/namei.c b/fs/namei.c
index e56ff39a79bc..94565bd7e73f 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -188,7 +188,7 @@ getname_flags(const char __user *filename, int flags, int *empty)
}
}
- result->refcnt = 1;
+ atomic_set(&result->refcnt, 1);
/* The empty path is special. */
if (unlikely(!len)) {
if (empty)
@@ -249,7 +249,7 @@ getname_kernel(const char * filename)
memcpy((char *)result->name, filename, len);
result->uptr = NULL;
result->aname = NULL;
- result->refcnt = 1;
+ atomic_set(&result->refcnt, 1);
audit_getname(result);
return result;
@@ -261,9 +261,10 @@ void putname(struct filename *name)
if (IS_ERR(name))
return;
- BUG_ON(name->refcnt <= 0);
+ if (WARN_ON_ONCE(!atomic_read(&name->refcnt)))
+ return;
- if (--name->refcnt > 0)
+ if (!atomic_dec_and_test(&name->refcnt))
return;
if (name->name != name->iname) {
@@ -643,6 +644,8 @@ static bool nd_alloc_stack(struct nameidata *nd)
/**
* path_connected - Verify that a dentry is below mnt.mnt_root
+ * @mnt: The mountpoint to check.
+ * @dentry: The dentry to check.
*
* Rename can sometimes move a file or directory outside of a bind
* mount, path_connected allows those cases to be detected.
@@ -1083,6 +1086,7 @@ fs_initcall(init_fs_namei_sysctls);
/**
* may_follow_link - Check symlink following for unsafe situations
* @nd: nameidata pathwalk data
+ * @inode: Used for idmapping.
*
* In the case of the sysctl_protected_symlinks sysctl being enabled,
* CAP_DAC_OVERRIDE needs to be specifically ignored if the symlink is
@@ -2890,7 +2894,7 @@ int path_pts(struct path *path)
dput(path->dentry);
path->dentry = parent;
child = d_hash_and_lookup(parent, &this);
- if (!child)
+ if (IS_ERR_OR_NULL(child))
return -ENOENT;
path->dentry = child;
diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c
index 3404707ddbe7..2cd3ccf4c439 100644
--- a/fs/netfs/buffered_read.c
+++ b/fs/netfs/buffered_read.c
@@ -47,12 +47,14 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq)
xas_for_each(&xas, folio, last_page) {
loff_t pg_end;
bool pg_failed = false;
+ bool folio_started;
if (xas_retry(&xas, folio))
continue;
pg_end = folio_pos(folio) + folio_size(folio) - 1;
+ folio_started = false;
for (;;) {
loff_t sreq_end;
@@ -60,8 +62,10 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq)
pg_failed = true;
break;
}
- if (test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags))
+ if (!folio_started && test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags)) {
folio_start_fscache(folio);
+ folio_started = true;
+ }
pg_failed |= subreq_failed;
sreq_end = subreq->start + subreq->len - 1;
if (pg_end < sreq_end)
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index b6fc169be1b1..7df2503cef6c 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -209,8 +209,6 @@ config NFS_DISABLE_UDP_SUPPORT
config NFS_V4_2_READ_PLUS
bool "NFS: Enable support for the NFSv4.2 READ_PLUS operation"
depends on NFS_V4_2
- default n
+ default y
help
- This is intended for developers only. The READ_PLUS operation has
- been shown to have issues under specific conditions and should not
- be used in production.
+ Choose Y here to enable use of the NFS v4.2 READ_PLUS operation.
diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c
index 70f5563a8e81..65cbb5607a5f 100644
--- a/fs/nfs/blocklayout/dev.c
+++ b/fs/nfs/blocklayout/dev.c
@@ -404,7 +404,7 @@ bl_parse_concat(struct nfs_server *server, struct pnfs_block_dev *d,
int ret, i;
d->children = kcalloc(v->concat.volumes_count,
- sizeof(struct pnfs_block_dev), GFP_KERNEL);
+ sizeof(struct pnfs_block_dev), gfp_mask);
if (!d->children)
return -ENOMEM;
@@ -433,7 +433,7 @@ bl_parse_stripe(struct nfs_server *server, struct pnfs_block_dev *d,
int ret, i;
d->children = kcalloc(v->stripe.volumes_count,
- sizeof(struct pnfs_block_dev), GFP_KERNEL);
+ sizeof(struct pnfs_block_dev), gfp_mask);
if (!d->children)
return -ENOMEM;
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 456af7d230cf..466ebf1d41b2 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -74,23 +74,12 @@ out_err:
static int
nfs4_callback_svc(void *vrqstp)
{
- int err;
struct svc_rqst *rqstp = vrqstp;
set_freezable();
- while (!kthread_freezable_should_stop(NULL)) {
-
- if (signal_pending(current))
- flush_signals(current);
- /*
- * Listen for a request on the socket
- */
- err = svc_recv(rqstp, MAX_SCHEDULE_TIMEOUT);
- if (err == -EAGAIN || err == -EINTR)
- continue;
- svc_process(rqstp);
- }
+ while (!kthread_freezable_should_stop(NULL))
+ svc_recv(rqstp);
svc_exit_thread(rqstp);
return 0;
@@ -112,11 +101,7 @@ nfs41_callback_svc(void *vrqstp)
set_freezable();
while (!kthread_freezable_should_stop(NULL)) {
-
- if (signal_pending(current))
- flush_signals(current);
-
- prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE);
+ prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_IDLE);
spin_lock_bh(&serv->sv_cb_lock);
if (!list_empty(&serv->sv_cb_list)) {
req = list_first_entry(&serv->sv_cb_list,
@@ -387,7 +372,7 @@ check_gss_callback_principal(struct nfs_client *clp, struct svc_rqst *rqstp)
* All other checking done after NFS decoding where the nfs_client can be
* found in nfs4_callback_compound
*/
-static int nfs_callback_authenticate(struct svc_rqst *rqstp)
+static enum svc_auth_status nfs_callback_authenticate(struct svc_rqst *rqstp)
{
rqstp->rq_auth_stat = rpc_autherr_badcred;
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index c1eda73254e1..6bed1394d748 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -59,7 +59,7 @@ __be32 nfs4_callback_getattr(void *argp, void *resp,
res->change_attr = delegation->change_attr;
if (nfs_have_writebacks(inode))
res->change_attr++;
- res->ctime = inode->i_ctime;
+ res->ctime = inode_get_ctime(inode);
res->mtime = inode->i_mtime;
res->bitmap[0] = (FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE) &
args->bitmap[0];
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index e4c5f193ed5e..44eca51b2808 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -517,6 +517,8 @@ int nfs_create_rpc_client(struct nfs_client *clp,
.authflavor = flavor,
.cred = cl_init->cred,
.xprtsec = cl_init->xprtsec,
+ .connect_timeout = cl_init->connect_timeout,
+ .reconnect_timeout = cl_init->reconnect_timeout,
};
if (test_bit(NFS_CS_DISCRTRY, &clp->cl_flags))
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 8f3112e71a6a..e6a51fd94fea 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1089,6 +1089,17 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc,
for (i = desc->cache_entry_index; i < array->size; i++) {
struct nfs_cache_array_entry *ent;
+ /*
+ * nfs_readdir_handle_cache_misses return force clear at
+ * (cache_misses > NFS_READDIR_CACHE_MISS_THRESHOLD) for
+ * readdir heuristic, NFS_READDIR_CACHE_MISS_THRESHOLD + 1
+ * entries need be emitted here.
+ */
+ if (first_emit && i > NFS_READDIR_CACHE_MISS_THRESHOLD + 2) {
+ desc->eob = true;
+ break;
+ }
+
ent = &array->array[i];
if (!dir_emit(desc->ctx, ent->name, ent->name_len,
nfs_compat_user_ino64(ent->ino), ent->d_type)) {
@@ -1107,10 +1118,6 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc,
desc->ctx->pos = desc->dir_cookie;
else
desc->ctx->pos++;
- if (first_emit && i > NFS_READDIR_CACHE_MISS_THRESHOLD + 1) {
- desc->eob = true;
- break;
- }
}
if (array->folio_is_eof)
desc->eof = !desc->eob;
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 9a18c5a69ace..f6c74f424691 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -93,12 +93,10 @@ nfs_direct_handle_truncated(struct nfs_direct_req *dreq,
dreq->max_count = dreq_len;
if (dreq->count > dreq_len)
dreq->count = dreq_len;
-
- if (test_bit(NFS_IOHDR_ERROR, &hdr->flags))
- dreq->error = hdr->error;
- else /* Clear outstanding error if this is EOF */
- dreq->error = 0;
}
+
+ if (test_bit(NFS_IOHDR_ERROR, &hdr->flags) && !dreq->error)
+ dreq->error = hdr->error;
}
static void
@@ -120,6 +118,18 @@ nfs_direct_count_bytes(struct nfs_direct_req *dreq,
dreq->count = dreq_len;
}
+static void nfs_direct_truncate_request(struct nfs_direct_req *dreq,
+ struct nfs_page *req)
+{
+ loff_t offs = req_offset(req);
+ size_t req_start = (size_t)(offs - dreq->io_start);
+
+ if (req_start < dreq->max_count)
+ dreq->max_count = req_start;
+ if (req_start < dreq->count)
+ dreq->count = req_start;
+}
+
/**
* nfs_swap_rw - NFS address space operation for swap I/O
* @iocb: target I/O control block
@@ -472,21 +482,47 @@ out:
return result;
}
-static void
-nfs_direct_join_group(struct list_head *list, struct inode *inode)
+static void nfs_direct_add_page_head(struct list_head *list,
+ struct nfs_page *req)
+{
+ struct nfs_page *head = req->wb_head;
+
+ if (!list_empty(&head->wb_list) || !nfs_lock_request(head))
+ return;
+ if (!list_empty(&head->wb_list)) {
+ nfs_unlock_request(head);
+ return;
+ }
+ list_add(&head->wb_list, list);
+ kref_get(&head->wb_kref);
+ kref_get(&head->wb_kref);
+}
+
+static void nfs_direct_join_group(struct list_head *list,
+ struct nfs_commit_info *cinfo,
+ struct inode *inode)
{
- struct nfs_page *req, *next;
+ struct nfs_page *req, *subreq;
list_for_each_entry(req, list, wb_list) {
- if (req->wb_head != req || req->wb_this_page == req)
+ if (req->wb_head != req) {
+ nfs_direct_add_page_head(&req->wb_list, req);
continue;
- for (next = req->wb_this_page;
- next != req->wb_head;
- next = next->wb_this_page) {
- nfs_list_remove_request(next);
- nfs_release_request(next);
}
- nfs_join_page_group(req, inode);
+ subreq = req->wb_this_page;
+ if (subreq == req)
+ continue;
+ do {
+ /*
+ * Remove subrequests from this list before freeing
+ * them in the call to nfs_join_page_group().
+ */
+ if (!list_empty(&subreq->wb_list)) {
+ nfs_list_remove_request(subreq);
+ nfs_release_request(subreq);
+ }
+ } while ((subreq = subreq->wb_this_page) != req);
+ nfs_join_page_group(req, cinfo, inode);
}
}
@@ -504,20 +540,15 @@ nfs_direct_write_scan_commit_list(struct inode *inode,
static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
{
struct nfs_pageio_descriptor desc;
- struct nfs_page *req, *tmp;
+ struct nfs_page *req;
LIST_HEAD(reqs);
struct nfs_commit_info cinfo;
- LIST_HEAD(failed);
nfs_init_cinfo_from_dreq(&cinfo, dreq);
nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo);
- nfs_direct_join_group(&reqs, dreq->inode);
+ nfs_direct_join_group(&reqs, &cinfo, dreq->inode);
- dreq->count = 0;
- dreq->max_count = 0;
- list_for_each_entry(req, &reqs, wb_list)
- dreq->max_count += req->wb_bytes;
nfs_clear_pnfs_ds_commit_verifiers(&dreq->ds_cinfo);
get_dreq(dreq);
@@ -525,27 +556,40 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
&nfs_direct_write_completion_ops);
desc.pg_dreq = dreq;
- list_for_each_entry_safe(req, tmp, &reqs, wb_list) {
+ while (!list_empty(&reqs)) {
+ req = nfs_list_entry(reqs.next);
/* Bump the transmission count */
req->wb_nio++;
if (!nfs_pageio_add_request(&desc, req)) {
- nfs_list_move_request(req, &failed);
- spin_lock(&cinfo.inode->i_lock);
- dreq->flags = 0;
- if (desc.pg_error < 0)
+ spin_lock(&dreq->lock);
+ if (dreq->error < 0) {
+ desc.pg_error = dreq->error;
+ } else if (desc.pg_error != -EAGAIN) {
+ dreq->flags = 0;
+ if (!desc.pg_error)
+ desc.pg_error = -EIO;
dreq->error = desc.pg_error;
- else
- dreq->error = -EIO;
- spin_unlock(&cinfo.inode->i_lock);
+ } else
+ dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
+ spin_unlock(&dreq->lock);
+ break;
}
nfs_release_request(req);
}
nfs_pageio_complete(&desc);
- while (!list_empty(&failed)) {
- req = nfs_list_entry(failed.next);
+ while (!list_empty(&reqs)) {
+ req = nfs_list_entry(reqs.next);
nfs_list_remove_request(req);
nfs_unlock_and_release_request(req);
+ if (desc.pg_error == -EAGAIN) {
+ nfs_mark_request_commit(req, NULL, &cinfo, 0);
+ } else {
+ spin_lock(&dreq->lock);
+ nfs_direct_truncate_request(dreq, req);
+ spin_unlock(&dreq->lock);
+ nfs_release_request(req);
+ }
}
if (put_dreq(dreq))
@@ -565,8 +609,6 @@ static void nfs_direct_commit_complete(struct nfs_commit_data *data)
if (status < 0) {
/* Errors in commit are fatal */
dreq->error = status;
- dreq->max_count = 0;
- dreq->count = 0;
dreq->flags = NFS_ODIRECT_DONE;
} else {
status = dreq->error;
@@ -577,7 +619,12 @@ static void nfs_direct_commit_complete(struct nfs_commit_data *data)
while (!list_empty(&data->pages)) {
req = nfs_list_entry(data->pages.next);
nfs_list_remove_request(req);
- if (status >= 0 && !nfs_write_match_verf(verf, req)) {
+ if (status < 0) {
+ spin_lock(&dreq->lock);
+ nfs_direct_truncate_request(dreq, req);
+ spin_unlock(&dreq->lock);
+ nfs_release_request(req);
+ } else if (!nfs_write_match_verf(verf, req)) {
dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
/*
* Despite the reboot, the write was successful,
@@ -585,7 +632,7 @@ static void nfs_direct_commit_complete(struct nfs_commit_data *data)
*/
req->wb_nio = 0;
nfs_mark_request_commit(req, NULL, &cinfo, 0);
- } else /* Error or match */
+ } else
nfs_release_request(req);
nfs_unlock_and_release_request(req);
}
@@ -638,6 +685,7 @@ static void nfs_direct_write_clear_reqs(struct nfs_direct_req *dreq)
while (!list_empty(&reqs)) {
req = nfs_list_entry(reqs.next);
nfs_list_remove_request(req);
+ nfs_direct_truncate_request(dreq, req);
nfs_release_request(req);
nfs_unlock_and_release_request(req);
}
@@ -687,7 +735,8 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
}
nfs_direct_count_bytes(dreq, hdr);
- if (test_bit(NFS_IOHDR_UNSTABLE_WRITES, &hdr->flags)) {
+ if (test_bit(NFS_IOHDR_UNSTABLE_WRITES, &hdr->flags) &&
+ !test_bit(NFS_IOHDR_ERROR, &hdr->flags)) {
if (!dreq->flags)
dreq->flags = NFS_ODIRECT_DO_COMMIT;
flags = dreq->flags;
@@ -731,18 +780,23 @@ static void nfs_write_sync_pgio_error(struct list_head *head, int error)
static void nfs_direct_write_reschedule_io(struct nfs_pgio_header *hdr)
{
struct nfs_direct_req *dreq = hdr->dreq;
+ struct nfs_page *req;
+ struct nfs_commit_info cinfo;
trace_nfs_direct_write_reschedule_io(dreq);
+ nfs_init_cinfo_from_dreq(&cinfo, dreq);
spin_lock(&dreq->lock);
- if (dreq->error == 0) {
+ if (dreq->error == 0)
dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
- /* fake unstable write to let common nfs resend pages */
- hdr->verf.committed = NFS_UNSTABLE;
- hdr->good_bytes = hdr->args.offset + hdr->args.count -
- hdr->io_start;
- }
+ set_bit(NFS_IOHDR_REDO, &hdr->flags);
spin_unlock(&dreq->lock);
+ while (!list_empty(&hdr->pages)) {
+ req = nfs_list_entry(hdr->pages.next);
+ nfs_list_remove_request(req);
+ nfs_unlock_request(req);
+ nfs_mark_request_commit(req, NULL, &cinfo, 0);
+ }
}
static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops = {
@@ -770,9 +824,11 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
{
struct nfs_pageio_descriptor desc;
struct inode *inode = dreq->inode;
+ struct nfs_commit_info cinfo;
ssize_t result = 0;
size_t requested_bytes = 0;
size_t wsize = max_t(size_t, NFS_SERVER(inode)->wsize, PAGE_SIZE);
+ bool defer = false;
trace_nfs_direct_write_schedule_iovec(dreq);
@@ -813,17 +869,37 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
break;
}
- nfs_lock_request(req);
- if (!nfs_pageio_add_request(&desc, req)) {
- result = desc.pg_error;
- nfs_unlock_and_release_request(req);
- break;
- }
pgbase = 0;
bytes -= req_len;
requested_bytes += req_len;
pos += req_len;
dreq->bytes_left -= req_len;
+
+ if (defer) {
+ nfs_mark_request_commit(req, NULL, &cinfo, 0);
+ continue;
+ }
+
+ nfs_lock_request(req);
+ if (nfs_pageio_add_request(&desc, req))
+ continue;
+
+ /* Exit on hard errors */
+ if (desc.pg_error < 0 && desc.pg_error != -EAGAIN) {
+ result = desc.pg_error;
+ nfs_unlock_and_release_request(req);
+ break;
+ }
+
+ /* If the error is soft, defer remaining requests */
+ nfs_init_cinfo_from_dreq(&cinfo, dreq);
+ spin_lock(&dreq->lock);
+ dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
+ spin_unlock(&dreq->lock);
+ nfs_unlock_request(req);
+ nfs_mark_request_commit(req, NULL, &cinfo, 0);
+ desc.pg_error = 0;
+ defer = true;
}
nfs_direct_release_pages(pagevec, npages);
kvfree(pagevec);
diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c
index 6603b5cee029..714975e5c0db 100644
--- a/fs/nfs/dns_resolve.c
+++ b/fs/nfs/dns_resolve.c
@@ -7,14 +7,16 @@
* Resolves DNS hostnames into valid ip addresses
*/
-#ifdef CONFIG_NFS_USE_KERNEL_DNS
-
#include <linux/module.h>
#include <linux/sunrpc/clnt.h>
#include <linux/sunrpc/addr.h>
-#include <linux/dns_resolver.h>
+
#include "dns_resolve.h"
+#ifdef CONFIG_NFS_USE_KERNEL_DNS
+
+#include <linux/dns_resolver.h>
+
ssize_t nfs_dns_resolve_name(struct net *net, char *name, size_t namelen,
struct sockaddr_storage *ss, size_t salen)
{
@@ -35,7 +37,6 @@ ssize_t nfs_dns_resolve_name(struct net *net, char *name, size_t namelen,
#else
-#include <linux/module.h>
#include <linux/hash.h>
#include <linux/string.h>
#include <linux/kmod.h>
@@ -43,15 +44,12 @@ ssize_t nfs_dns_resolve_name(struct net *net, char *name, size_t namelen,
#include <linux/socket.h>
#include <linux/seq_file.h>
#include <linux/inet.h>
-#include <linux/sunrpc/clnt.h>
-#include <linux/sunrpc/addr.h>
#include <linux/sunrpc/cache.h>
#include <linux/sunrpc/svcauth.h>
#include <linux/sunrpc/rpc_pipe_fs.h>
#include <linux/nfs_fs.h>
#include "nfs4_fs.h"
-#include "dns_resolve.h"
#include "cache_lib.h"
#include "netns.h"
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 79b1b3fcd3fc..3f9768810427 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -200,7 +200,7 @@ nfs_file_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe
EXPORT_SYMBOL_GPL(nfs_file_splice_read);
int
-nfs_file_mmap(struct file * file, struct vm_area_struct * vma)
+nfs_file_mmap(struct file *file, struct vm_area_struct *vma)
{
struct inode *inode = file_inode(file);
int status;
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index 7deb3cd76abe..ef817a0475ff 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -1235,6 +1235,7 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
case -EPFNOSUPPORT:
case -EPROTONOSUPPORT:
case -EOPNOTSUPP:
+ case -EINVAL:
case -ECONNREFUSED:
case -ECONNRESET:
case -EHOSTDOWN:
@@ -2519,9 +2520,9 @@ ff_layout_mirror_prepare_stats(struct pnfs_layout_hdr *lo,
return i;
}
-static int
-ff_layout_prepare_layoutstats(struct nfs42_layoutstat_args *args)
+static int ff_layout_prepare_layoutstats(struct nfs42_layoutstat_args *args)
{
+ struct pnfs_layout_hdr *lo;
struct nfs4_flexfile_layout *ff_layout;
const int dev_count = PNFS_LAYOUTSTATS_MAXDEV;
@@ -2532,11 +2533,14 @@ ff_layout_prepare_layoutstats(struct nfs42_layoutstat_args *args)
return -ENOMEM;
spin_lock(&args->inode->i_lock);
- ff_layout = FF_LAYOUT_FROM_HDR(NFS_I(args->inode)->layout);
- args->num_dev = ff_layout_mirror_prepare_stats(&ff_layout->generic_hdr,
- &args->devinfo[0],
- dev_count,
- NFS4_FF_OP_LAYOUTSTATS);
+ lo = NFS_I(args->inode)->layout;
+ if (lo && pnfs_layout_is_valid(lo)) {
+ ff_layout = FF_LAYOUT_FROM_HDR(lo);
+ args->num_dev = ff_layout_mirror_prepare_stats(
+ &ff_layout->generic_hdr, &args->devinfo[0], dev_count,
+ NFS4_FF_OP_LAYOUTSTATS);
+ } else
+ args->num_dev = 0;
spin_unlock(&args->inode->i_lock);
if (!args->num_dev) {
kfree(args->devinfo);
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index 8c35d88a84b1..b05717fe0d4e 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -180,6 +180,9 @@ void nfs_fscache_init_inode(struct inode *inode)
&auxdata, /* aux_data */
sizeof(auxdata),
i_size_read(inode));
+
+ if (netfs_inode(inode)->cache)
+ mapping_set_release_always(inode->i_mapping);
}
/*
diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h
index e1706e736c64..2dc64454492b 100644
--- a/fs/nfs/fscache.h
+++ b/fs/nfs/fscache.h
@@ -116,8 +116,8 @@ static inline void nfs_fscache_update_auxdata(struct nfs_fscache_inode_auxdata *
memset(auxdata, 0, sizeof(*auxdata));
auxdata->mtime_sec = inode->i_mtime.tv_sec;
auxdata->mtime_nsec = inode->i_mtime.tv_nsec;
- auxdata->ctime_sec = inode->i_ctime.tv_sec;
- auxdata->ctime_nsec = inode->i_ctime.tv_nsec;
+ auxdata->ctime_sec = inode_get_ctime(inode).tv_sec;
+ auxdata->ctime_nsec = inode_get_ctime(inode).tv_nsec;
if (NFS_SERVER(inode)->nfs_client->rpc_ops->version == 4)
auxdata->change_attr = inode_peek_iversion_raw(inode);
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 8172dd4135a1..e21c073158e5 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -514,7 +514,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
memset(&inode->i_atime, 0, sizeof(inode->i_atime));
memset(&inode->i_mtime, 0, sizeof(inode->i_mtime));
- memset(&inode->i_ctime, 0, sizeof(inode->i_ctime));
+ inode_set_ctime(inode, 0, 0);
inode_set_iversion_raw(inode, 0);
inode->i_size = 0;
clear_nlink(inode);
@@ -535,7 +535,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
else if (fattr_supported & NFS_ATTR_FATTR_MTIME)
nfs_set_cache_invalid(inode, NFS_INO_INVALID_MTIME);
if (fattr->valid & NFS_ATTR_FATTR_CTIME)
- inode->i_ctime = fattr->ctime;
+ inode_set_ctime_to_ts(inode, fattr->ctime);
else if (fattr_supported & NFS_ATTR_FATTR_CTIME)
nfs_set_cache_invalid(inode, NFS_INO_INVALID_CTIME);
if (fattr->valid & NFS_ATTR_FATTR_CHANGE)
@@ -731,7 +731,7 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr,
if ((attr->ia_valid & ATTR_GID) != 0)
inode->i_gid = attr->ia_gid;
if (fattr->valid & NFS_ATTR_FATTR_CTIME)
- inode->i_ctime = fattr->ctime;
+ inode_set_ctime_to_ts(inode, fattr->ctime);
else
nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE
| NFS_INO_INVALID_CTIME);
@@ -749,7 +749,7 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr,
nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATIME);
if (fattr->valid & NFS_ATTR_FATTR_CTIME)
- inode->i_ctime = fattr->ctime;
+ inode_set_ctime_to_ts(inode, fattr->ctime);
else
nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE
| NFS_INO_INVALID_CTIME);
@@ -765,7 +765,7 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr,
nfs_set_cache_invalid(inode, NFS_INO_INVALID_MTIME);
if (fattr->valid & NFS_ATTR_FATTR_CTIME)
- inode->i_ctime = fattr->ctime;
+ inode_set_ctime_to_ts(inode, fattr->ctime);
else
nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE
| NFS_INO_INVALID_CTIME);
@@ -912,7 +912,7 @@ out_no_revalidate:
/* Only return attributes that were revalidated. */
stat->result_mask = nfs_get_valid_attrmask(inode) | request_mask;
- generic_fillattr(&nop_mnt_idmap, inode, stat);
+ generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode));
stat->change_cookie = inode_peek_iversion_raw(inode);
stat->attributes_mask |= STATX_ATTR_CHANGE_MONOTONIC;
@@ -1444,11 +1444,11 @@ static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr)
nfs_set_cache_invalid(inode, NFS_INO_INVALID_XATTR);
}
/* If we have atomic WCC data, we may update some attributes */
- ts = inode->i_ctime;
+ ts = inode_get_ctime(inode);
if ((fattr->valid & NFS_ATTR_FATTR_PRECTIME)
&& (fattr->valid & NFS_ATTR_FATTR_CTIME)
&& timespec64_equal(&ts, &fattr->pre_ctime)) {
- inode->i_ctime = fattr->ctime;
+ inode_set_ctime_to_ts(inode, fattr->ctime);
}
ts = inode->i_mtime;
@@ -1510,7 +1510,7 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
if ((fattr->valid & NFS_ATTR_FATTR_MTIME) && !timespec64_equal(&ts, &fattr->mtime))
invalid |= NFS_INO_INVALID_MTIME;
- ts = inode->i_ctime;
+ ts = inode_get_ctime(inode);
if ((fattr->valid & NFS_ATTR_FATTR_CTIME) && !timespec64_equal(&ts, &fattr->ctime))
invalid |= NFS_INO_INVALID_CTIME;
@@ -1997,7 +1997,7 @@ int nfs_post_op_update_inode_force_wcc_locked(struct inode *inode, struct nfs_fa
}
if ((fattr->valid & NFS_ATTR_FATTR_CTIME) != 0 &&
(fattr->valid & NFS_ATTR_FATTR_PRECTIME) == 0) {
- fattr->pre_ctime = inode->i_ctime;
+ fattr->pre_ctime = inode_get_ctime(inode);
fattr->valid |= NFS_ATTR_FATTR_PRECTIME;
}
if ((fattr->valid & NFS_ATTR_FATTR_MTIME) != 0 &&
@@ -2190,7 +2190,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
save_cache_validity & NFS_INO_INVALID_MTIME;
if (fattr->valid & NFS_ATTR_FATTR_CTIME)
- inode->i_ctime = fattr->ctime;
+ inode_set_ctime_to_ts(inode, fattr->ctime);
else if (fattr_supported & NFS_ATTR_FATTR_CTIME)
nfsi->cache_validity |=
save_cache_validity & NFS_INO_INVALID_CTIME;
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 913c09806c7f..9c9cf764f600 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -82,6 +82,8 @@ struct nfs_client_initdata {
const struct rpc_timeout *timeparms;
const struct cred *cred;
struct xprtsec_parms xprtsec;
+ unsigned long connect_timeout;
+ unsigned long reconnect_timeout;
};
/*
@@ -493,6 +495,7 @@ extern const struct nfs_pgio_completion_ops nfs_async_read_completion_ops;
extern void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio,
struct inode *inode, bool force_mds,
const struct nfs_pgio_completion_ops *compl_ops);
+extern bool nfs_read_alloc_scratch(struct nfs_pgio_header *hdr, size_t size);
extern int nfs_read_add_folio(struct nfs_pageio_descriptor *pgio,
struct nfs_open_context *ctx,
struct folio *folio);
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index 19d51ebf842c..e7494cdd957e 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -215,7 +215,8 @@ nfs_namespace_getattr(struct mnt_idmap *idmap,
if (NFS_FH(d_inode(path->dentry))->size != 0)
return nfs_getattr(idmap, path, stat, request_mask,
query_flags);
- generic_fillattr(&nop_mnt_idmap, d_inode(path->dentry), stat);
+ generic_fillattr(&nop_mnt_idmap, request_mask, d_inode(path->dentry),
+ stat);
return 0;
}
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index 05c3b4b2b3dd..c19093814296 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -949,7 +949,7 @@ int nfs2_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
error = decode_filename_inline(xdr, &entry->name, &entry->len);
if (unlikely(error))
- return -EAGAIN;
+ return error == -ENAMETOOLONG ? -ENAMETOOLONG : -EAGAIN;
/*
* The type (size and byte order) of nfscookie isn't defined in
diff --git a/fs/nfs/nfs3client.c b/fs/nfs/nfs3client.c
index eff3802c5e03..674c012868b1 100644
--- a/fs/nfs/nfs3client.c
+++ b/fs/nfs/nfs3client.c
@@ -86,6 +86,7 @@ struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv,
int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans)
{
struct rpc_timeout ds_timeout;
+ unsigned long connect_timeout = ds_timeo * (ds_retrans + 1) * HZ / 10;
struct nfs_client *mds_clp = mds_srv->nfs_client;
struct nfs_client_initdata cl_init = {
.addr = ds_addr,
@@ -98,6 +99,8 @@ struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv,
.timeparms = &ds_timeout,
.cred = mds_srv->cred,
.xprtsec = mds_clp->cl_xprtsec,
+ .connect_timeout = connect_timeout,
+ .reconnect_timeout = connect_timeout,
};
struct nfs_client *clp;
char buf[INET6_ADDRSTRLEN + 1];
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 3b0b650c9c5a..60f032be805a 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -1991,7 +1991,7 @@ int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
error = decode_inline_filename3(xdr, &entry->name, &entry->len);
if (unlikely(error))
- return -EAGAIN;
+ return error == -ENAMETOOLONG ? -ENAMETOOLONG : -EAGAIN;
error = decode_cookie3(xdr, &new_cookie);
if (unlikely(error))
diff --git a/fs/nfs/nfs42.h b/fs/nfs/nfs42.h
index 0fe5aacbcfdf..b59876b01a1e 100644
--- a/fs/nfs/nfs42.h
+++ b/fs/nfs/nfs42.h
@@ -13,6 +13,7 @@
* more? Need to consider not to pre-alloc too much for a compound.
*/
#define PNFS_LAYOUTSTATS_MAXDEV (4)
+#define READ_PLUS_SCRATCH_SIZE (16)
/* nfs4.2proc.c */
#ifdef CONFIG_NFS_V4_2
diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index 63802d195556..28704f924612 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -81,7 +81,8 @@ static int _nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep,
if (status == 0) {
if (nfs_should_remove_suid(inode)) {
spin_lock(&inode->i_lock);
- nfs_set_cache_invalid(inode, NFS_INO_INVALID_MODE);
+ nfs_set_cache_invalid(inode,
+ NFS_INO_REVAL_FORCED | NFS_INO_INVALID_MODE);
spin_unlock(&inode->i_lock);
}
status = nfs_post_op_update_inode_force_wcc(inode,
@@ -471,8 +472,9 @@ ssize_t nfs42_proc_copy(struct file *src, loff_t pos_src,
continue;
}
break;
- } else if (err == -NFS4ERR_OFFLOAD_NO_REQS && !args.sync) {
- args.sync = true;
+ } else if (err == -NFS4ERR_OFFLOAD_NO_REQS &&
+ args.sync != res.synchronous) {
+ args.sync = res.synchronous;
dst_exception.retry = 1;
continue;
} else if ((err == -ESTALE ||
@@ -1377,7 +1379,6 @@ ssize_t nfs42_proc_getxattr(struct inode *inode, const char *name,
for (i = 0; i < np; i++) {
pages[i] = alloc_page(GFP_KERNEL);
if (!pages[i]) {
- np = i + 1;
err = -ENOMEM;
goto out;
}
@@ -1401,8 +1402,8 @@ ssize_t nfs42_proc_getxattr(struct inode *inode, const char *name,
} while (exception.retry);
out:
- while (--np >= 0)
- __free_page(pages[np]);
+ while (--i >= 0)
+ __free_page(pages[i]);
kfree(pages);
return err;
diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c
index 95234208dc9e..9e3ae53e2205 100644
--- a/fs/nfs/nfs42xdr.c
+++ b/fs/nfs/nfs42xdr.c
@@ -54,10 +54,16 @@
(1 /* data_content4 */ + \
2 /* data_info4.di_offset */ + \
1 /* data_info4.di_length */)
+#define NFS42_READ_PLUS_HOLE_SEGMENT_SIZE \
+ (1 /* data_content4 */ + \
+ 2 /* data_info4.di_offset */ + \
+ 2 /* data_info4.di_length */)
+#define READ_PLUS_SEGMENT_SIZE_DIFF (NFS42_READ_PLUS_HOLE_SEGMENT_SIZE - \
+ NFS42_READ_PLUS_DATA_SEGMENT_SIZE)
#define decode_read_plus_maxsz (op_decode_hdr_maxsz + \
1 /* rpr_eof */ + \
1 /* rpr_contents count */ + \
- NFS42_READ_PLUS_DATA_SEGMENT_SIZE)
+ NFS42_READ_PLUS_HOLE_SEGMENT_SIZE)
#define encode_seek_maxsz (op_encode_hdr_maxsz + \
encode_stateid_maxsz + \
2 /* offset */ + \
@@ -617,8 +623,8 @@ static void nfs4_xdr_enc_read_plus(struct rpc_rqst *req,
encode_putfh(xdr, args->fh, &hdr);
encode_read_plus(xdr, args, &hdr);
- rpc_prepare_reply_pages(req, args->pages, args->pgbase,
- args->count, hdr.replen);
+ rpc_prepare_reply_pages(req, args->pages, args->pgbase, args->count,
+ hdr.replen - READ_PLUS_SEGMENT_SIZE_DIFF);
encode_nops(&hdr);
}
@@ -1056,13 +1062,12 @@ static int decode_read_plus(struct xdr_stream *xdr, struct nfs_pgio_res *res)
res->eof = be32_to_cpup(p++);
segments = be32_to_cpup(p++);
if (segments == 0)
- return status;
+ return 0;
segs = kmalloc_array(segments, sizeof(*segs), GFP_KERNEL);
if (!segs)
return -ENOMEM;
- status = -EIO;
for (i = 0; i < segments; i++) {
status = decode_read_plus_segment(xdr, &segs[i]);
if (status < 0)
@@ -1428,7 +1433,7 @@ static int nfs4_xdr_dec_read_plus(struct rpc_rqst *rqstp,
struct compound_hdr hdr;
int status;
- xdr_set_scratch_buffer(xdr, res->scratch, sizeof(res->scratch));
+ xdr_set_scratch_buffer(xdr, res->scratch, READ_PLUS_SCRATCH_SIZE);
status = decode_compound_hdr(xdr, &hdr);
if (status)
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 4c9f8bd866ab..47c5c1f86d66 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -328,7 +328,7 @@ extern int update_open_stateid(struct nfs4_state *state,
const nfs4_stateid *open_stateid,
const nfs4_stateid *deleg_stateid,
fmode_t fmode);
-extern int nfs4_proc_setlease(struct file *file, long arg,
+extern int nfs4_proc_setlease(struct file *file, int arg,
struct file_lock **lease, void **priv);
extern int nfs4_proc_get_lease_time(struct nfs_client *clp,
struct nfs_fsinfo *fsinfo);
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index d9114a754db7..11e3a285594c 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -232,6 +232,8 @@ struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *cl_init)
__set_bit(NFS_CS_DISCRTRY, &clp->cl_flags);
__set_bit(NFS_CS_NO_RETRANS_TIMEOUT, &clp->cl_flags);
+ if (test_bit(NFS_CS_DS, &cl_init->init_flags))
+ __set_bit(NFS_CS_DS, &clp->cl_flags);
/*
* Set up the connection to the server before we add add to the
* global list.
@@ -415,6 +417,8 @@ static void nfs4_add_trunk(struct nfs_client *clp, struct nfs_client *old)
.net = old->cl_net,
.servername = old->cl_hostname,
};
+ int max_connect = test_bit(NFS_CS_PNFS, &clp->cl_flags) ?
+ clp->cl_max_connect : old->cl_max_connect;
if (clp->cl_proto != old->cl_proto)
return;
@@ -428,7 +432,7 @@ static void nfs4_add_trunk(struct nfs_client *clp, struct nfs_client *old)
xprt_args.addrlen = clp_salen;
rpc_clnt_add_xprt(old->cl_rpcclient, &xprt_args,
- rpc_clnt_test_and_add_xprt, NULL);
+ rpc_clnt_test_and_add_xprt, &max_connect);
}
/**
@@ -1007,6 +1011,9 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv,
if (mds_srv->flags & NFS_MOUNT_NORESVPORT)
__set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);
+ __set_bit(NFS_CS_DS, &cl_init.init_flags);
+ __set_bit(NFS_CS_PNFS, &cl_init.init_flags);
+ cl_init.max_connect = NFS_MAX_TRANSPORTS;
/*
* Set an authflavor equual to the MDS value. Use the MDS nfs_client
* cl_ipaddr so as to use the same EXCHANGE_ID co_ownerid as the MDS
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index 4aeadd6e1a6d..02788c3c85e5 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -438,7 +438,7 @@ void nfs42_ssc_unregister_ops(void)
}
#endif /* CONFIG_NFS_V4_2 */
-static int nfs4_setlease(struct file *file, long arg, struct file_lock **lease,
+static int nfs4_setlease(struct file *file, int arg, struct file_lock **lease,
void **priv)
{
return nfs4_proc_setlease(file, arg, lease, priv);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index e1a886b58354..5ee283eb9660 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2703,8 +2703,12 @@ static int _nfs4_proc_open(struct nfs4_opendata *data,
return status;
}
if (!(o_res->f_attr->valid & NFS_ATTR_FATTR)) {
+ struct nfs_fh *fh = &o_res->fh;
+
nfs4_sequence_free_slot(&o_res->seq_res);
- nfs4_proc_getattr(server, &o_res->fh, o_res->f_attr, NULL);
+ if (o_arg->claim == NFS4_OPEN_CLAIM_FH)
+ fh = NFS_FH(d_inode(data->dentry));
+ nfs4_proc_getattr(server, fh, o_res->f_attr, NULL);
}
return 0;
}
@@ -5438,18 +5442,8 @@ static bool nfs4_read_plus_not_supported(struct rpc_task *task,
return false;
}
-static inline void nfs4_read_plus_scratch_free(struct nfs_pgio_header *hdr)
-{
- if (hdr->res.scratch) {
- kfree(hdr->res.scratch);
- hdr->res.scratch = NULL;
- }
-}
-
static int nfs4_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
{
- nfs4_read_plus_scratch_free(hdr);
-
if (!nfs4_sequence_done(task, &hdr->res.seq_res))
return -EAGAIN;
if (nfs4_read_stateid_changed(task, &hdr->args))
@@ -5469,8 +5463,7 @@ static bool nfs42_read_plus_support(struct nfs_pgio_header *hdr,
/* Note: We don't use READ_PLUS with pNFS yet */
if (nfs_server_capable(hdr->inode, NFS_CAP_READ_PLUS) && !hdr->ds_clp) {
msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ_PLUS];
- hdr->res.scratch = kmalloc(32, GFP_KERNEL);
- return hdr->res.scratch != NULL;
+ return nfs_read_alloc_scratch(hdr, READ_PLUS_SCRATCH_SIZE);
}
return false;
}
@@ -6004,9 +5997,8 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf,
out_ok:
ret = res.acl_len;
out_free:
- for (i = 0; i < npages; i++)
- if (pages[i])
- __free_page(pages[i]);
+ while (--i >= 0)
+ __free_page(pages[i]);
if (res.acl_scratch)
__free_page(res.acl_scratch);
kfree(pages);
@@ -7181,8 +7173,15 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata)
} else if (!nfs4_update_lock_stateid(lsp, &data->res.stateid))
goto out_restart;
break;
- case -NFS4ERR_BAD_STATEID:
case -NFS4ERR_OLD_STATEID:
+ if (data->arg.new_lock_owner != 0 &&
+ nfs4_refresh_open_old_stateid(&data->arg.open_stateid,
+ lsp->ls_state))
+ goto out_restart;
+ if (nfs4_refresh_lock_old_stateid(&data->arg.lock_stateid, lsp))
+ goto out_restart;
+ fallthrough;
+ case -NFS4ERR_BAD_STATEID:
case -NFS4ERR_STALE_STATEID:
case -NFS4ERR_EXPIRED:
if (data->arg.new_lock_owner != 0) {
@@ -7573,7 +7572,7 @@ static int nfs4_delete_lease(struct file *file, void **priv)
return generic_setlease(file, F_UNLCK, NULL, priv);
}
-static int nfs4_add_lease(struct file *file, long arg, struct file_lock **lease,
+static int nfs4_add_lease(struct file *file, int arg, struct file_lock **lease,
void **priv)
{
struct inode *inode = file_inode(file);
@@ -7591,7 +7590,7 @@ static int nfs4_add_lease(struct file *file, long arg, struct file_lock **lease,
return -EAGAIN;
}
-int nfs4_proc_setlease(struct file *file, long arg, struct file_lock **lease,
+int nfs4_proc_setlease(struct file *file, int arg, struct file_lock **lease,
void **priv)
{
switch (arg) {
@@ -8792,6 +8791,8 @@ nfs4_run_exchange_id(struct nfs_client *clp, const struct cred *cred,
#ifdef CONFIG_NFS_V4_1_MIGRATION
calldata->args.flags |= EXCHGID4_FLAG_SUPP_MOVED_MIGR;
#endif
+ if (test_bit(NFS_CS_DS, &clp->cl_flags))
+ calldata->args.flags |= EXCHGID4_FLAG_USE_PNFS_DS;
msg.rpc_argp = &calldata->args;
msg.rpc_resp = &calldata->res;
task_setup_data.callback_data = calldata;
@@ -10619,7 +10620,9 @@ static void nfs4_disable_swap(struct inode *inode)
*/
struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
- nfs4_schedule_state_manager(clp);
+ set_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state);
+ clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state);
+ wake_up_var(&clp->cl_state);
}
static const struct inode_operations nfs4_dir_inode_operations = {
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index e079987af4a3..9a5d911a7edc 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1209,16 +1209,26 @@ void nfs4_schedule_state_manager(struct nfs_client *clp)
{
struct task_struct *task;
char buf[INET6_ADDRSTRLEN + sizeof("-manager") + 1];
+ struct rpc_clnt *clnt = clp->cl_rpcclient;
+ bool swapon = false;
- if (clp->cl_rpcclient->cl_shutdown)
+ if (clnt->cl_shutdown)
return;
set_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state);
- if (test_and_set_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state) != 0) {
- wake_up_var(&clp->cl_state);
- return;
+
+ if (atomic_read(&clnt->cl_swapper)) {
+ swapon = !test_and_set_bit(NFS4CLNT_MANAGER_AVAILABLE,
+ &clp->cl_state);
+ if (!swapon) {
+ wake_up_var(&clp->cl_state);
+ return;
+ }
}
- set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state);
+
+ if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0)
+ return;
+
__module_get(THIS_MODULE);
refcount_inc(&clp->cl_count);
@@ -1235,8 +1245,9 @@ void nfs4_schedule_state_manager(struct nfs_client *clp)
__func__, PTR_ERR(task));
if (!nfs_client_init_is_complete(clp))
nfs_mark_client_ready(clp, PTR_ERR(task));
+ if (swapon)
+ clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state);
nfs4_clear_state_manager_bit(clp);
- clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state);
nfs_put_client(clp);
module_put(THIS_MODULE);
}
@@ -2703,6 +2714,13 @@ static void nfs4_state_manager(struct nfs_client *clp)
nfs4_end_drain_session(clp);
nfs4_clear_state_manager_bit(clp);
+ if (test_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state) &&
+ !test_and_set_bit(NFS4CLNT_MANAGER_RUNNING,
+ &clp->cl_state)) {
+ memflags = memalloc_nofs_save();
+ continue;
+ }
+
if (!test_and_set_bit(NFS4CLNT_RECALL_RUNNING, &clp->cl_state)) {
if (test_and_clear_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state)) {
nfs_client_return_marked_delegations(clp);
@@ -2741,22 +2759,25 @@ static int nfs4_run_state_manager(void *ptr)
allow_signal(SIGKILL);
again:
- set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state);
nfs4_state_manager(clp);
- if (atomic_read(&cl->cl_swapper)) {
+
+ if (test_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state) &&
+ !test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state)) {
wait_var_event_interruptible(&clp->cl_state,
test_bit(NFS4CLNT_RUN_MANAGER,
&clp->cl_state));
- if (atomic_read(&cl->cl_swapper) &&
- test_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state))
+ if (!atomic_read(&cl->cl_swapper))
+ clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state);
+ if (refcount_read(&clp->cl_count) > 1 && !signalled() &&
+ !test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state))
goto again;
/* Either no longer a swapper, or were signalled */
+ clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state);
}
- clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state);
if (refcount_read(&clp->cl_count) > 1 && !signalled() &&
test_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state) &&
- !test_and_set_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state))
+ !test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state))
goto again;
nfs_put_client(clp);
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 306cba0b9e69..84343aefbbd6 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -2634,31 +2634,44 @@ pnfs_should_return_unused_layout(struct pnfs_layout_hdr *lo,
return mode == 0;
}
-static int
-pnfs_layout_return_unused_byserver(struct nfs_server *server, void *data)
+static int pnfs_layout_return_unused_byserver(struct nfs_server *server,
+ void *data)
{
const struct pnfs_layout_range *range = data;
+ const struct cred *cred;
struct pnfs_layout_hdr *lo;
struct inode *inode;
+ nfs4_stateid stateid;
+ enum pnfs_iomode iomode;
+
restart:
rcu_read_lock();
list_for_each_entry_rcu(lo, &server->layouts, plh_layouts) {
- if (!pnfs_layout_can_be_returned(lo) ||
+ inode = lo->plh_inode;
+ if (!inode || !pnfs_layout_can_be_returned(lo) ||
test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags))
continue;
- inode = lo->plh_inode;
spin_lock(&inode->i_lock);
- if (!pnfs_should_return_unused_layout(lo, range)) {
+ if (!lo->plh_inode ||
+ !pnfs_should_return_unused_layout(lo, range)) {
spin_unlock(&inode->i_lock);
continue;
}
+ pnfs_get_layout_hdr(lo);
+ pnfs_set_plh_return_info(lo, range->iomode, 0);
+ if (pnfs_mark_matching_lsegs_return(lo, &lo->plh_return_segs,
+ range, 0) != 0 ||
+ !pnfs_prepare_layoutreturn(lo, &stateid, &cred, &iomode)) {
+ spin_unlock(&inode->i_lock);
+ rcu_read_unlock();
+ pnfs_put_layout_hdr(lo);
+ cond_resched();
+ goto restart;
+ }
spin_unlock(&inode->i_lock);
- inode = pnfs_grab_inode_layout_hdr(lo);
- if (!inode)
- continue;
rcu_read_unlock();
- pnfs_mark_layout_for_return(inode, range);
- iput(inode);
+ pnfs_send_layoutreturn(lo, &stateid, &cred, iomode, false);
+ pnfs_put_layout_hdr(lo);
cond_resched();
goto restart;
}
diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c
index ddbbf4fcda86..178001c90156 100644
--- a/fs/nfs/pnfs_dev.c
+++ b/fs/nfs/pnfs_dev.c
@@ -154,7 +154,7 @@ nfs4_get_device_info(struct nfs_server *server,
set_bit(NFS_DEVICEID_NOCACHE, &d->flags);
out_free_pages:
- for (i = 0; i < max_pages; i++)
+ while (--i >= 0)
__free_page(pages[i]);
kfree(pages);
out_free_pdev:
diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
index a0112ad4937a..afd23910f3bf 100644
--- a/fs/nfs/pnfs_nfs.c
+++ b/fs/nfs/pnfs_nfs.c
@@ -852,6 +852,7 @@ static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv,
{
struct nfs_client *clp = ERR_PTR(-EIO);
struct nfs4_pnfs_ds_addr *da;
+ unsigned long connect_timeout = timeo * (retrans + 1) * HZ / 10;
int status = 0;
dprintk("--> %s DS %s\n", __func__, ds->ds_remotestr);
@@ -870,6 +871,8 @@ static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv,
.dstaddr = (struct sockaddr *)&da->da_addr,
.addrlen = da->da_addrlen,
.servername = clp->cl_hostname,
+ .connect_timeout = connect_timeout,
+ .reconnect_timeout = connect_timeout,
};
if (da->da_transport != clp->cl_proto)
@@ -943,7 +946,7 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv,
* Test this address for session trunking and
* add as an alias
*/
- xprtdata.cred = nfs4_get_clid_cred(clp),
+ xprtdata.cred = nfs4_get_clid_cred(clp);
rpc_clnt_add_xprt(clp->cl_rpcclient, &xprt_args,
rpc_clnt_setup_test_and_add_xprt,
&rpcdata);
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index f71eeee67e20..7dc21a48e3e7 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -47,6 +47,8 @@ static struct nfs_pgio_header *nfs_readhdr_alloc(void)
static void nfs_readhdr_free(struct nfs_pgio_header *rhdr)
{
+ if (rhdr->res.scratch != NULL)
+ kfree(rhdr->res.scratch);
kmem_cache_free(nfs_rdata_cachep, rhdr);
}
@@ -108,6 +110,14 @@ void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio)
}
EXPORT_SYMBOL_GPL(nfs_pageio_reset_read_mds);
+bool nfs_read_alloc_scratch(struct nfs_pgio_header *hdr, size_t size)
+{
+ WARN_ON(hdr->res.scratch != NULL);
+ hdr->res.scratch = kmalloc(size, GFP_KERNEL);
+ return hdr->res.scratch != NULL;
+}
+EXPORT_SYMBOL_GPL(nfs_read_alloc_scratch);
+
static void nfs_readpage_release(struct nfs_page *req, int error)
{
struct folio *folio = nfs_page_to_folio(req);
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 2284f749d892..0d6473cb00cb 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1339,15 +1339,13 @@ error_splat_super:
void nfs_kill_super(struct super_block *s)
{
struct nfs_server *server = NFS_SB(s);
- dev_t dev = s->s_dev;
nfs_sysfs_move_sb_to_server(server);
- generic_shutdown_super(s);
+ kill_anon_super(s);
nfs_fscache_release_super_cookie(s);
nfs_free_server(server);
- free_anon_bdev(dev);
}
EXPORT_SYMBOL_GPL(nfs_kill_super);
diff --git a/fs/nfs/sysfs.c b/fs/nfs/sysfs.c
index acda8f033d30..bf378ecd5d9f 100644
--- a/fs/nfs/sysfs.c
+++ b/fs/nfs/sysfs.c
@@ -345,8 +345,10 @@ void nfs_sysfs_move_sb_to_server(struct nfs_server *server)
int ret = -ENOMEM;
s = kasprintf(GFP_KERNEL, "server-%d", server->s_sysfs_id);
- if (s)
+ if (s) {
ret = kobject_rename(&server->kobj, s);
+ kfree(s);
+ }
if (ret < 0)
pr_warn("NFS: rename sysfs %s failed (%d)\n",
server->kobj.name, ret);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index f4cca8f00c0c..9d82d50ce0b1 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -59,7 +59,8 @@ static const struct nfs_pgio_completion_ops nfs_async_write_completion_ops;
static const struct nfs_commit_completion_ops nfs_commit_completion_ops;
static const struct nfs_rw_ops nfs_rw_write_ops;
static void nfs_inode_remove_request(struct nfs_page *req);
-static void nfs_clear_request_commit(struct nfs_page *req);
+static void nfs_clear_request_commit(struct nfs_commit_info *cinfo,
+ struct nfs_page *req);
static void nfs_init_cinfo_from_inode(struct nfs_commit_info *cinfo,
struct inode *inode);
static struct nfs_page *
@@ -502,8 +503,8 @@ nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list,
* the (former) group. All subrequests are removed from any write or commit
* lists, unlinked from the group and destroyed.
*/
-void
-nfs_join_page_group(struct nfs_page *head, struct inode *inode)
+void nfs_join_page_group(struct nfs_page *head, struct nfs_commit_info *cinfo,
+ struct inode *inode)
{
struct nfs_page *subreq;
struct nfs_page *destroy_list = NULL;
@@ -533,7 +534,7 @@ nfs_join_page_group(struct nfs_page *head, struct inode *inode)
* Commit list removal accounting is done after locks are dropped */
subreq = head;
do {
- nfs_clear_request_commit(subreq);
+ nfs_clear_request_commit(cinfo, subreq);
subreq = subreq->wb_this_page;
} while (subreq != head);
@@ -566,8 +567,10 @@ static struct nfs_page *nfs_lock_and_join_requests(struct folio *folio)
{
struct inode *inode = folio_file_mapping(folio)->host;
struct nfs_page *head;
+ struct nfs_commit_info cinfo;
int ret;
+ nfs_init_cinfo_from_inode(&cinfo, inode);
/*
* A reference is taken only on the head request which acts as a
* reference to the whole page group - the group will not be destroyed
@@ -584,7 +587,7 @@ static struct nfs_page *nfs_lock_and_join_requests(struct folio *folio)
return ERR_PTR(ret);
}
- nfs_join_page_group(head, inode);
+ nfs_join_page_group(head, &cinfo, inode);
return head;
}
@@ -785,6 +788,8 @@ static void nfs_inode_add_request(struct nfs_page *req)
*/
static void nfs_inode_remove_request(struct nfs_page *req)
{
+ struct nfs_inode *nfsi = NFS_I(nfs_page_to_inode(req));
+
if (nfs_page_group_sync_on_bit(req, PG_REMOVE)) {
struct folio *folio = nfs_page_to_folio(req->wb_head);
struct address_space *mapping = folio_file_mapping(folio);
@@ -799,8 +804,8 @@ static void nfs_inode_remove_request(struct nfs_page *req)
}
if (test_and_clear_bit(PG_INODE_REF, &req->wb_flags)) {
+ atomic_long_dec(&nfsi->nrequests);
nfs_release_request(req);
- atomic_long_dec(&NFS_I(nfs_page_to_inode(req))->nrequests);
}
}
@@ -955,18 +960,16 @@ static void nfs_folio_clear_commit(struct folio *folio)
}
/* Called holding the request lock on @req */
-static void
-nfs_clear_request_commit(struct nfs_page *req)
+static void nfs_clear_request_commit(struct nfs_commit_info *cinfo,
+ struct nfs_page *req)
{
if (test_bit(PG_CLEAN, &req->wb_flags)) {
struct nfs_open_context *ctx = nfs_req_openctx(req);
struct inode *inode = d_inode(ctx->dentry);
- struct nfs_commit_info cinfo;
- nfs_init_cinfo_from_inode(&cinfo, inode);
mutex_lock(&NFS_I(inode)->commit_mutex);
- if (!pnfs_clear_request_commit(req, &cinfo)) {
- nfs_request_remove_commit_list(req, &cinfo);
+ if (!pnfs_clear_request_commit(req, cinfo)) {
+ nfs_request_remove_commit_list(req, cinfo);
}
mutex_unlock(&NFS_I(inode)->commit_mutex);
nfs_folio_clear_commit(nfs_page_to_folio(req));
diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c
index 8e9c1a0f8d38..1ed2f691ebb9 100644
--- a/fs/nfsd/blocklayoutxdr.c
+++ b/fs/nfsd/blocklayoutxdr.c
@@ -83,6 +83,15 @@ nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr,
int len = sizeof(__be32), ret, i;
__be32 *p;
+ /*
+ * See paragraph 5 of RFC 8881 S18.40.3.
+ */
+ if (!gdp->gd_maxcount) {
+ if (xdr_stream_encode_u32(xdr, 0) != XDR_UNIT)
+ return nfserr_resource;
+ return nfs_ok;
+ }
+
p = xdr_reserve_space(xdr, len + sizeof(__be32));
if (!p)
return nfserr_resource;
diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h
index 4c9b87850ab1..929248c6ca84 100644
--- a/fs/nfsd/cache.h
+++ b/fs/nfsd/cache.h
@@ -19,7 +19,7 @@
* typical sockaddr_storage. This is for space reasons, since sockaddr_storage
* is much larger than a sockaddr_in6.
*/
-struct svc_cacherep {
+struct nfsd_cacherep {
struct {
/* Keep often-read xid, csum in the same cache line: */
__be32 k_xid;
@@ -84,8 +84,10 @@ int nfsd_net_reply_cache_init(struct nfsd_net *nn);
void nfsd_net_reply_cache_destroy(struct nfsd_net *nn);
int nfsd_reply_cache_init(struct nfsd_net *);
void nfsd_reply_cache_shutdown(struct nfsd_net *);
-int nfsd_cache_lookup(struct svc_rqst *);
-void nfsd_cache_update(struct svc_rqst *, int, __be32 *);
+int nfsd_cache_lookup(struct svc_rqst *rqstp,
+ struct nfsd_cacherep **cacherep);
+void nfsd_cache_update(struct svc_rqst *rqstp, struct nfsd_cacherep *rp,
+ int cachetype, __be32 *statp);
int nfsd_reply_cache_stats_show(struct seq_file *m, void *v);
#endif /* NFSCACHE_H */
diff --git a/fs/nfsd/flexfilelayoutxdr.c b/fs/nfsd/flexfilelayoutxdr.c
index e81d2a5cf381..bb205328e043 100644
--- a/fs/nfsd/flexfilelayoutxdr.c
+++ b/fs/nfsd/flexfilelayoutxdr.c
@@ -85,6 +85,15 @@ nfsd4_ff_encode_getdeviceinfo(struct xdr_stream *xdr,
int addr_len;
__be32 *p;
+ /*
+ * See paragraph 5 of RFC 8881 S18.40.3.
+ */
+ if (!gdp->gd_maxcount) {
+ if (xdr_stream_encode_u32(xdr, 0) != XDR_UNIT)
+ return nfserr_resource;
+ return nfs_ok;
+ }
+
/* len + padding for two strings */
addr_len = 16 + da->netaddr.netid_len + da->netaddr.addr_len;
ver_len = 20;
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index fc8d5b7db9f8..268ef57751c4 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -307,7 +307,9 @@ nfsd3_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp,
if (!IS_POSIXACL(inode))
iap->ia_mode &= ~current_umask();
- fh_fill_pre_attrs(fhp);
+ status = fh_fill_pre_attrs(fhp);
+ if (status != nfs_ok)
+ goto out;
host_err = vfs_create(&nop_mnt_idmap, inode, child, iap->ia_mode, true);
if (host_err < 0) {
status = nfserrno(host_err);
diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c
index 518203821790..96e786b5e544 100644
--- a/fs/nfsd/nfs4acl.c
+++ b/fs/nfsd/nfs4acl.c
@@ -441,7 +441,7 @@ struct posix_ace_state_array {
* calculated so far: */
struct posix_acl_state {
- int empty;
+ unsigned char valid;
struct posix_ace_state owner;
struct posix_ace_state group;
struct posix_ace_state other;
@@ -457,7 +457,6 @@ init_state(struct posix_acl_state *state, int cnt)
int alloc;
memset(state, 0, sizeof(struct posix_acl_state));
- state->empty = 1;
/*
* In the worst case, each individual acl could be for a distinct
* named user or group, but we don't know which, so we allocate
@@ -500,7 +499,7 @@ posix_state_to_acl(struct posix_acl_state *state, unsigned int flags)
* and effective cases: when there are no inheritable ACEs,
* calls ->set_acl with a NULL ACL structure.
*/
- if (state->empty && (flags & NFS4_ACL_TYPE_DEFAULT))
+ if (!state->valid && (flags & NFS4_ACL_TYPE_DEFAULT))
return NULL;
/*
@@ -622,11 +621,12 @@ static void process_one_v4_ace(struct posix_acl_state *state,
struct nfs4_ace *ace)
{
u32 mask = ace->access_mask;
+ short type = ace2type(ace);
int i;
- state->empty = 0;
+ state->valid |= type;
- switch (ace2type(ace)) {
+ switch (type) {
case ACL_USER_OBJ:
if (ace->type == NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE) {
allow_bits(&state->owner, mask);
@@ -726,6 +726,30 @@ static int nfs4_acl_nfsv4_to_posix(struct nfs4_acl *acl,
if (!(ace->flag & NFS4_ACE_INHERIT_ONLY_ACE))
process_one_v4_ace(&effective_acl_state, ace);
}
+
+ /*
+ * At this point, the default ACL may have zeroed-out entries for owner,
+ * group and other. That usually results in a non-sensical resulting ACL
+ * that denies all access except to any ACE that was explicitly added.
+ *
+ * The setfacl command solves a similar problem with this logic:
+ *
+ * "If a Default ACL entry is created, and the Default ACL contains
+ * no owner, owning group, or others entry, a copy of the ACL
+ * owner, owning group, or others entry is added to the Default ACL."
+ *
+ * Copy any missing ACEs from the effective set, if any ACEs were
+ * explicitly set.
+ */
+ if (default_acl_state.valid) {
+ if (!(default_acl_state.valid & ACL_USER_OBJ))
+ default_acl_state.owner = effective_acl_state.owner;
+ if (!(default_acl_state.valid & ACL_GROUP_OBJ))
+ default_acl_state.group = effective_acl_state.group;
+ if (!(default_acl_state.valid & ACL_OTHER))
+ default_acl_state.other = effective_acl_state.other;
+ }
+
*pacl = posix_state_to_acl(&effective_acl_state, flags);
if (IS_ERR(*pacl)) {
ret = PTR_ERR(*pacl);
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 5ae670807449..4199ede0583c 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -297,12 +297,12 @@ nfsd4_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp,
}
if (d_really_is_positive(child)) {
- status = nfs_ok;
-
/* NFSv4 protocol requires change attributes even though
* no change happened.
*/
- fh_fill_both_attrs(fhp);
+ status = fh_fill_both_attrs(fhp);
+ if (status != nfs_ok)
+ goto out;
switch (open->op_createmode) {
case NFS4_CREATE_UNCHECKED:
@@ -345,7 +345,9 @@ nfsd4_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp,
if (!IS_POSIXACL(inode))
iap->ia_mode &= ~current_umask();
- fh_fill_pre_attrs(fhp);
+ status = fh_fill_pre_attrs(fhp);
+ if (status != nfs_ok)
+ goto out;
status = nfsd4_vfs_create(fhp, child, open);
if (status != nfs_ok)
goto out;
@@ -380,6 +382,38 @@ out:
return status;
}
+/**
+ * set_change_info - set up the change_info4 for a reply
+ * @cinfo: pointer to nfsd4_change_info to be populated
+ * @fhp: pointer to svc_fh to use as source
+ *
+ * Many operations in NFSv4 require change_info4 in the reply. This function
+ * populates that from the info that we (should!) have already collected. In
+ * the event that we didn't get any pre-attrs, just zero out both.
+ */
+static void
+set_change_info(struct nfsd4_change_info *cinfo, struct svc_fh *fhp)
+{
+ cinfo->atomic = (u32)(fhp->fh_pre_saved && fhp->fh_post_saved && !fhp->fh_no_atomic_attr);
+ cinfo->before_change = fhp->fh_pre_change;
+ cinfo->after_change = fhp->fh_post_change;
+
+ /*
+ * If fetching the pre-change attributes failed, then we should
+ * have already failed the whole operation. We could have still
+ * failed to fetch post-change attributes however.
+ *
+ * If we didn't get post-op attrs, just zero-out the after
+ * field since we don't know what it should be. If the pre_saved
+ * field isn't set for some reason, throw warning and just copy
+ * whatever is in the after field.
+ */
+ if (WARN_ON_ONCE(!fhp->fh_pre_saved))
+ cinfo->before_change = 0;
+ if (!fhp->fh_post_saved)
+ cinfo->after_change = cinfo->before_change + 1;
+}
+
static __be32
do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_open *open, struct svc_fh **resfh)
{
@@ -424,11 +458,11 @@ do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stru
} else {
status = nfsd_lookup(rqstp, current_fh,
open->op_fname, open->op_fnamelen, *resfh);
- if (!status)
+ if (status == nfs_ok)
/* NFSv4 protocol requires change attributes even though
* no change happened.
*/
- fh_fill_both_attrs(current_fh);
+ status = fh_fill_both_attrs(current_fh);
}
if (status)
goto out;
@@ -1024,8 +1058,8 @@ nfsd4_rename(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
rename->rn_tname, rename->rn_tnamelen);
if (status)
return status;
- set_change_info(&rename->rn_sinfo, &cstate->current_fh);
- set_change_info(&rename->rn_tinfo, &cstate->save_fh);
+ set_change_info(&rename->rn_sinfo, &cstate->save_fh);
+ set_change_info(&rename->rn_tinfo, &cstate->current_fh);
return nfs_ok;
}
@@ -1313,12 +1347,11 @@ try_again:
/* found a match */
if (ni->nsui_busy) {
/* wait - and try again */
- prepare_to_wait(&nn->nfsd_ssc_waitq, &wait,
- TASK_INTERRUPTIBLE);
+ prepare_to_wait(&nn->nfsd_ssc_waitq, &wait, TASK_IDLE);
spin_unlock(&nn->nfsd_ssc_lock);
/* allow 20secs for mount/unmount for now - revisit */
- if (signal_pending(current) ||
+ if (kthread_should_stop() ||
(schedule_timeout(20*HZ) == 0)) {
finish_wait(&nn->nfsd_ssc_waitq, &wait);
kfree(work);
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 3aefbad4cc09..8534693eb6a4 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -649,6 +649,18 @@ find_readable_file(struct nfs4_file *f)
return ret;
}
+static struct nfsd_file *
+find_rw_file(struct nfs4_file *f)
+{
+ struct nfsd_file *ret;
+
+ spin_lock(&f->fi_lock);
+ ret = nfsd_file_get(f->fi_fds[O_RDWR]);
+ spin_unlock(&f->fi_lock);
+
+ return ret;
+}
+
struct nfsd_file *
find_any_file(struct nfs4_file *f)
{
@@ -1144,7 +1156,7 @@ static void block_delegations(struct knfsd_fh *fh)
static struct nfs4_delegation *
alloc_init_deleg(struct nfs4_client *clp, struct nfs4_file *fp,
- struct nfs4_clnt_odstate *odstate)
+ struct nfs4_clnt_odstate *odstate, u32 dl_type)
{
struct nfs4_delegation *dp;
long n;
@@ -1170,7 +1182,7 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_file *fp,
INIT_LIST_HEAD(&dp->dl_recall_lru);
dp->dl_clnt_odstate = odstate;
get_clnt_odstate(odstate);
- dp->dl_type = NFS4_OPEN_DELEGATE_READ;
+ dp->dl_type = dl_type;
dp->dl_retries = 1;
dp->dl_recalled = false;
nfsd4_init_cb(&dp->dl_recall, dp->dl_stid.sc_client,
@@ -1354,9 +1366,9 @@ static void revoke_delegation(struct nfs4_delegation *dp)
trace_nfsd_stid_revoke(&dp->dl_stid);
if (clp->cl_minorversion) {
+ spin_lock(&clp->cl_lock);
dp->dl_stid.sc_type = NFS4_REVOKED_DELEG_STID;
refcount_inc(&dp->dl_stid.sc_count);
- spin_lock(&clp->cl_lock);
list_add(&dp->dl_recall_lru, &clp->cl_revoked);
spin_unlock(&clp->cl_lock);
}
@@ -5449,8 +5461,9 @@ nfs4_set_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp,
struct nfs4_file *fp = stp->st_stid.sc_file;
struct nfs4_clnt_odstate *odstate = stp->st_clnt_odstate;
struct nfs4_delegation *dp;
- struct nfsd_file *nf;
+ struct nfsd_file *nf = NULL;
struct file_lock *fl;
+ u32 dl_type;
/*
* The fi_had_conflict and nfs_get_existing_delegation checks
@@ -5460,15 +5473,35 @@ nfs4_set_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp,
if (fp->fi_had_conflict)
return ERR_PTR(-EAGAIN);
- nf = find_readable_file(fp);
- if (!nf) {
- /*
- * We probably could attempt another open and get a read
- * delegation, but for now, don't bother until the
- * client actually sends us one.
- */
- return ERR_PTR(-EAGAIN);
+ /*
+ * Try for a write delegation first. RFC8881 section 10.4 says:
+ *
+ * "An OPEN_DELEGATE_WRITE delegation allows the client to handle,
+ * on its own, all opens."
+ *
+ * Furthermore the client can use a write delegation for most READ
+ * operations as well, so we require a O_RDWR file here.
+ *
+ * Offer a write delegation in the case of a BOTH open, and ensure
+ * we get the O_RDWR descriptor.
+ */
+ if ((open->op_share_access & NFS4_SHARE_ACCESS_BOTH) == NFS4_SHARE_ACCESS_BOTH) {
+ nf = find_rw_file(fp);
+ dl_type = NFS4_OPEN_DELEGATE_WRITE;
}
+
+ /*
+ * If the file is being opened O_RDONLY or we couldn't get a O_RDWR
+ * file for some reason, then try for a read delegation instead.
+ */
+ if (!nf && (open->op_share_access & NFS4_SHARE_ACCESS_READ)) {
+ nf = find_readable_file(fp);
+ dl_type = NFS4_OPEN_DELEGATE_READ;
+ }
+
+ if (!nf)
+ return ERR_PTR(-EAGAIN);
+
spin_lock(&state_lock);
spin_lock(&fp->fi_lock);
if (nfs4_delegation_exists(clp, fp))
@@ -5491,11 +5524,11 @@ nfs4_set_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp,
return ERR_PTR(status);
status = -ENOMEM;
- dp = alloc_init_deleg(clp, fp, odstate);
+ dp = alloc_init_deleg(clp, fp, odstate, dl_type);
if (!dp)
goto out_delegees;
- fl = nfs4_alloc_init_lease(dp, NFS4_OPEN_DELEGATE_READ);
+ fl = nfs4_alloc_init_lease(dp, dl_type);
if (!fl)
goto out_clnt_odstate;
@@ -5568,10 +5601,28 @@ static void nfsd4_open_deleg_none_ext(struct nfsd4_open *open, int status)
}
/*
- * Attempt to hand out a delegation.
+ * The Linux NFS server does not offer write delegations to NFSv4.0
+ * clients in order to avoid conflicts between write delegations and
+ * GETATTRs requesting CHANGE or SIZE attributes.
+ *
+ * With NFSv4.1 and later minorversions, the SEQUENCE operation that
+ * begins each COMPOUND contains a client ID. Delegation recall can
+ * be avoided when the server recognizes the client sending a
+ * GETATTR also holds write delegation it conflicts with.
*
- * Note we don't support write delegations, and won't until the vfs has
- * proper support for them.
+ * However, the NFSv4.0 protocol does not enable a server to
+ * determine that a GETATTR originated from the client holding the
+ * conflicting delegation versus coming from some other client. Per
+ * RFC 7530 Section 16.7.5, the server must recall or send a
+ * CB_GETATTR even when the GETATTR originates from the client that
+ * holds the conflicting delegation.
+ *
+ * An NFSv4.0 client can trigger a pathological situation if it
+ * always sends a DELEGRETURN preceded by a conflicting GETATTR in
+ * the same COMPOUND. COMPOUND execution will always stop at the
+ * GETATTR and the DELEGRETURN will never get executed. The server
+ * eventually revokes the delegation, which can result in loss of
+ * open or lock state.
*/
static void
nfs4_open_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp,
@@ -5590,8 +5641,6 @@ nfs4_open_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp,
case NFS4_OPEN_CLAIM_PREVIOUS:
if (!cb_up)
open->op_recall = 1;
- if (open->op_delegate_type != NFS4_OPEN_DELEGATE_READ)
- goto out_no_deleg;
break;
case NFS4_OPEN_CLAIM_NULL:
parent = currentfh;
@@ -5606,6 +5655,9 @@ nfs4_open_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp,
goto out_no_deleg;
if (!cb_up || !(oo->oo_flags & NFS4_OO_CONFIRMED))
goto out_no_deleg;
+ if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE &&
+ !clp->cl_minorversion)
+ goto out_no_deleg;
break;
default:
goto out_no_deleg;
@@ -5616,8 +5668,13 @@ nfs4_open_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp,
memcpy(&open->op_delegate_stateid, &dp->dl_stid.sc_stateid, sizeof(dp->dl_stid.sc_stateid));
- trace_nfsd_deleg_read(&dp->dl_stid.sc_stateid);
- open->op_delegate_type = NFS4_OPEN_DELEGATE_READ;
+ if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE) {
+ open->op_delegate_type = NFS4_OPEN_DELEGATE_WRITE;
+ trace_nfsd_deleg_write(&dp->dl_stid.sc_stateid);
+ } else {
+ open->op_delegate_type = NFS4_OPEN_DELEGATE_READ;
+ trace_nfsd_deleg_read(&dp->dl_stid.sc_stateid);
+ }
nfs4_put_stid(&dp->dl_stid);
return;
out_no_deleg:
@@ -8341,3 +8398,68 @@ nfsd4_get_writestateid(struct nfsd4_compound_state *cstate,
{
get_stateid(cstate, &u->write.wr_stateid);
}
+
+/**
+ * nfsd4_deleg_getattr_conflict - Recall if GETATTR causes conflict
+ * @rqstp: RPC transaction context
+ * @inode: file to be checked for a conflict
+ *
+ * This function is called when there is a conflict between a write
+ * delegation and a change/size GETATTR from another client. The server
+ * must either use the CB_GETATTR to get the current values of the
+ * attributes from the client that holds the delegation or recall the
+ * delegation before replying to the GETATTR. See RFC 8881 section
+ * 18.7.4.
+ *
+ * The current implementation does not support CB_GETATTR yet. However
+ * this can avoid recalling the delegation could be added in follow up
+ * work.
+ *
+ * Returns 0 if there is no conflict; otherwise an nfs_stat
+ * code is returned.
+ */
+__be32
+nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct inode *inode)
+{
+ __be32 status;
+ struct file_lock_context *ctx;
+ struct file_lock *fl;
+ struct nfs4_delegation *dp;
+
+ ctx = locks_inode_context(inode);
+ if (!ctx)
+ return 0;
+ spin_lock(&ctx->flc_lock);
+ list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
+ if (fl->fl_flags == FL_LAYOUT)
+ continue;
+ if (fl->fl_lmops != &nfsd_lease_mng_ops) {
+ /*
+ * non-nfs lease, if it's a lease with F_RDLCK then
+ * we are done; there isn't any write delegation
+ * on this inode
+ */
+ if (fl->fl_type == F_RDLCK)
+ break;
+ goto break_lease;
+ }
+ if (fl->fl_type == F_WRLCK) {
+ dp = fl->fl_owner;
+ if (dp->dl_recall.cb_clp == *(rqstp->rq_lease_breaker)) {
+ spin_unlock(&ctx->flc_lock);
+ return 0;
+ }
+break_lease:
+ spin_unlock(&ctx->flc_lock);
+ nfsd_stats_wdeleg_getattr_inc();
+ status = nfserrno(nfsd_open_break_lease(inode, NFSD_MAY_READ));
+ if (status != nfserr_jukebox ||
+ !nfsd_wait_for_delegreturn(rqstp, inode))
+ return status;
+ return 0;
+ }
+ break;
+ }
+ spin_unlock(&ctx->flc_lock);
+ return 0;
+}
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index b30dca7de8cc..92c7dde148a4 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -2984,6 +2984,11 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp,
if (status)
goto out;
}
+ if (bmval0 & (FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE)) {
+ status = nfsd4_deleg_getattr_conflict(rqstp, d_inode(dentry));
+ if (status)
+ goto out;
+ }
err = vfs_getattr(&path, &stat,
STATX_BASIC_STATS | STATX_BTIME | STATX_CHANGE_COOKIE,
@@ -3973,17 +3978,20 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr,
nfserr = nfsd4_encode_stateid(xdr, &open->op_delegate_stateid);
if (nfserr)
return nfserr;
- p = xdr_reserve_space(xdr, 32);
+
+ p = xdr_reserve_space(xdr, XDR_UNIT * 8);
if (!p)
return nfserr_resource;
*p++ = cpu_to_be32(open->op_recall);
/*
+ * Always flush on close
+ *
* TODO: space_limit's in delegations
*/
*p++ = cpu_to_be32(NFS4_LIMIT_SIZE);
- *p++ = cpu_to_be32(~(u32)0);
- *p++ = cpu_to_be32(~(u32)0);
+ *p++ = xdr_zero;
+ *p++ = xdr_zero;
/*
* TODO: ACE's in delegations
@@ -4105,6 +4113,7 @@ static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp,
struct file *file, unsigned long maxcount)
{
struct xdr_stream *xdr = resp->xdr;
+ unsigned int base = xdr->buf->page_len & ~PAGE_MASK;
unsigned int starting_len = xdr->buf->len;
__be32 zero = xdr_zero;
__be32 nfserr;
@@ -4113,8 +4122,7 @@ static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp,
return nfserr_resource;
nfserr = nfsd_iter_read(resp->rqstp, read->rd_fhp, file,
- read->rd_offset, &maxcount,
- xdr->buf->page_len & ~PAGE_MASK,
+ read->rd_offset, &maxcount, base,
&read->rd_eof);
read->rd_length = maxcount;
if (nfserr)
@@ -4678,20 +4686,17 @@ nfsd4_encode_getdeviceinfo(struct nfsd4_compoundres *resp, __be32 nfserr,
*p++ = cpu_to_be32(gdev->gd_layout_type);
- /* If maxcount is 0 then just update notifications */
- if (gdev->gd_maxcount != 0) {
- ops = nfsd4_layout_ops[gdev->gd_layout_type];
- nfserr = ops->encode_getdeviceinfo(xdr, gdev);
- if (nfserr) {
- /*
- * We don't bother to burden the layout drivers with
- * enforcing gd_maxcount, just tell the client to
- * come back with a bigger buffer if it's not enough.
- */
- if (xdr->buf->len + 4 > gdev->gd_maxcount)
- goto toosmall;
- return nfserr;
- }
+ ops = nfsd4_layout_ops[gdev->gd_layout_type];
+ nfserr = ops->encode_getdeviceinfo(xdr, gdev);
+ if (nfserr) {
+ /*
+ * We don't bother to burden the layout drivers with
+ * enforcing gd_maxcount, just tell the client to
+ * come back with a bigger buffer if it's not enough.
+ */
+ if (xdr->buf->len + 4 > gdev->gd_maxcount)
+ goto toosmall;
+ return nfserr;
}
if (gdev->gd_notify_types) {
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index a8eda1c85829..80621a709510 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -84,11 +84,11 @@ nfsd_hashsize(unsigned int limit)
return roundup_pow_of_two(limit / TARGET_BUCKET_SIZE);
}
-static struct svc_cacherep *
-nfsd_reply_cache_alloc(struct svc_rqst *rqstp, __wsum csum,
- struct nfsd_net *nn)
+static struct nfsd_cacherep *
+nfsd_cacherep_alloc(struct svc_rqst *rqstp, __wsum csum,
+ struct nfsd_net *nn)
{
- struct svc_cacherep *rp;
+ struct nfsd_cacherep *rp;
rp = kmem_cache_alloc(drc_slab, GFP_KERNEL);
if (rp) {
@@ -110,36 +110,64 @@ nfsd_reply_cache_alloc(struct svc_rqst *rqstp, __wsum csum,
return rp;
}
-static void
-nfsd_reply_cache_free_locked(struct nfsd_drc_bucket *b, struct svc_cacherep *rp,
- struct nfsd_net *nn)
+static void nfsd_cacherep_free(struct nfsd_cacherep *rp)
{
- if (rp->c_type == RC_REPLBUFF && rp->c_replvec.iov_base) {
- nfsd_stats_drc_mem_usage_sub(nn, rp->c_replvec.iov_len);
+ if (rp->c_type == RC_REPLBUFF)
kfree(rp->c_replvec.iov_base);
+ kmem_cache_free(drc_slab, rp);
+}
+
+static unsigned long
+nfsd_cacherep_dispose(struct list_head *dispose)
+{
+ struct nfsd_cacherep *rp;
+ unsigned long freed = 0;
+
+ while (!list_empty(dispose)) {
+ rp = list_first_entry(dispose, struct nfsd_cacherep, c_lru);
+ list_del(&rp->c_lru);
+ nfsd_cacherep_free(rp);
+ freed++;
}
+ return freed;
+}
+
+static void
+nfsd_cacherep_unlink_locked(struct nfsd_net *nn, struct nfsd_drc_bucket *b,
+ struct nfsd_cacherep *rp)
+{
+ if (rp->c_type == RC_REPLBUFF && rp->c_replvec.iov_base)
+ nfsd_stats_drc_mem_usage_sub(nn, rp->c_replvec.iov_len);
if (rp->c_state != RC_UNUSED) {
rb_erase(&rp->c_node, &b->rb_head);
list_del(&rp->c_lru);
atomic_dec(&nn->num_drc_entries);
nfsd_stats_drc_mem_usage_sub(nn, sizeof(*rp));
}
- kmem_cache_free(drc_slab, rp);
}
static void
-nfsd_reply_cache_free(struct nfsd_drc_bucket *b, struct svc_cacherep *rp,
+nfsd_reply_cache_free_locked(struct nfsd_drc_bucket *b, struct nfsd_cacherep *rp,
+ struct nfsd_net *nn)
+{
+ nfsd_cacherep_unlink_locked(nn, b, rp);
+ nfsd_cacherep_free(rp);
+}
+
+static void
+nfsd_reply_cache_free(struct nfsd_drc_bucket *b, struct nfsd_cacherep *rp,
struct nfsd_net *nn)
{
spin_lock(&b->cache_lock);
- nfsd_reply_cache_free_locked(b, rp, nn);
+ nfsd_cacherep_unlink_locked(nn, b, rp);
spin_unlock(&b->cache_lock);
+ nfsd_cacherep_free(rp);
}
int nfsd_drc_slab_create(void)
{
drc_slab = kmem_cache_create("nfsd_drc",
- sizeof(struct svc_cacherep), 0, 0, NULL);
+ sizeof(struct nfsd_cacherep), 0, 0, NULL);
return drc_slab ? 0: -ENOMEM;
}
@@ -208,7 +236,7 @@ out_shrinker:
void nfsd_reply_cache_shutdown(struct nfsd_net *nn)
{
- struct svc_cacherep *rp;
+ struct nfsd_cacherep *rp;
unsigned int i;
unregister_shrinker(&nn->nfsd_reply_cache_shrinker);
@@ -216,7 +244,7 @@ void nfsd_reply_cache_shutdown(struct nfsd_net *nn)
for (i = 0; i < nn->drc_hashsize; i++) {
struct list_head *head = &nn->drc_hashtbl[i].lru_head;
while (!list_empty(head)) {
- rp = list_first_entry(head, struct svc_cacherep, c_lru);
+ rp = list_first_entry(head, struct nfsd_cacherep, c_lru);
nfsd_reply_cache_free_locked(&nn->drc_hashtbl[i],
rp, nn);
}
@@ -233,7 +261,7 @@ void nfsd_reply_cache_shutdown(struct nfsd_net *nn)
* not already scheduled.
*/
static void
-lru_put_end(struct nfsd_drc_bucket *b, struct svc_cacherep *rp)
+lru_put_end(struct nfsd_drc_bucket *b, struct nfsd_cacherep *rp)
{
rp->c_timestamp = jiffies;
list_move_tail(&rp->c_lru, &b->lru_head);
@@ -247,12 +275,21 @@ nfsd_cache_bucket_find(__be32 xid, struct nfsd_net *nn)
return &nn->drc_hashtbl[hash];
}
-static long prune_bucket(struct nfsd_drc_bucket *b, struct nfsd_net *nn,
- unsigned int max)
+/*
+ * Remove and return no more than @max expired entries in bucket @b.
+ * If @max is zero, do not limit the number of removed entries.
+ */
+static void
+nfsd_prune_bucket_locked(struct nfsd_net *nn, struct nfsd_drc_bucket *b,
+ unsigned int max, struct list_head *dispose)
{
- struct svc_cacherep *rp, *tmp;
- long freed = 0;
+ unsigned long expiry = jiffies - RC_EXPIRE;
+ struct nfsd_cacherep *rp, *tmp;
+ unsigned int freed = 0;
+
+ lockdep_assert_held(&b->cache_lock);
+ /* The bucket LRU is ordered oldest-first. */
list_for_each_entry_safe(rp, tmp, &b->lru_head, c_lru) {
/*
* Don't free entries attached to calls that are still
@@ -260,60 +297,77 @@ static long prune_bucket(struct nfsd_drc_bucket *b, struct nfsd_net *nn,
*/
if (rp->c_state == RC_INPROG)
continue;
+
if (atomic_read(&nn->num_drc_entries) <= nn->max_drc_entries &&
- time_before(jiffies, rp->c_timestamp + RC_EXPIRE))
+ time_before(expiry, rp->c_timestamp))
break;
- nfsd_reply_cache_free_locked(b, rp, nn);
- if (max && freed++ > max)
+
+ nfsd_cacherep_unlink_locked(nn, b, rp);
+ list_add(&rp->c_lru, dispose);
+
+ if (max && ++freed > max)
break;
}
- return freed;
}
-static long nfsd_prune_bucket(struct nfsd_drc_bucket *b, struct nfsd_net *nn)
+/**
+ * nfsd_reply_cache_count - count_objects method for the DRC shrinker
+ * @shrink: our registered shrinker context
+ * @sc: garbage collection parameters
+ *
+ * Returns the total number of entries in the duplicate reply cache. To
+ * keep things simple and quick, this is not the number of expired entries
+ * in the cache (ie, the number that would be removed by a call to
+ * nfsd_reply_cache_scan).
+ */
+static unsigned long
+nfsd_reply_cache_count(struct shrinker *shrink, struct shrink_control *sc)
{
- return prune_bucket(b, nn, 3);
+ struct nfsd_net *nn = container_of(shrink,
+ struct nfsd_net, nfsd_reply_cache_shrinker);
+
+ return atomic_read(&nn->num_drc_entries);
}
-/*
- * Walk the LRU list and prune off entries that are older than RC_EXPIRE.
- * Also prune the oldest ones when the total exceeds the max number of entries.
+/**
+ * nfsd_reply_cache_scan - scan_objects method for the DRC shrinker
+ * @shrink: our registered shrinker context
+ * @sc: garbage collection parameters
+ *
+ * Free expired entries on each bucket's LRU list until we've released
+ * nr_to_scan freed objects. Nothing will be released if the cache
+ * has not exceeded it's max_drc_entries limit.
+ *
+ * Returns the number of entries released by this call.
*/
-static long
-prune_cache_entries(struct nfsd_net *nn)
+static unsigned long
+nfsd_reply_cache_scan(struct shrinker *shrink, struct shrink_control *sc)
{
+ struct nfsd_net *nn = container_of(shrink,
+ struct nfsd_net, nfsd_reply_cache_shrinker);
+ unsigned long freed = 0;
+ LIST_HEAD(dispose);
unsigned int i;
- long freed = 0;
for (i = 0; i < nn->drc_hashsize; i++) {
struct nfsd_drc_bucket *b = &nn->drc_hashtbl[i];
if (list_empty(&b->lru_head))
continue;
+
spin_lock(&b->cache_lock);
- freed += prune_bucket(b, nn, 0);
+ nfsd_prune_bucket_locked(nn, b, 0, &dispose);
spin_unlock(&b->cache_lock);
- }
- return freed;
-}
-static unsigned long
-nfsd_reply_cache_count(struct shrinker *shrink, struct shrink_control *sc)
-{
- struct nfsd_net *nn = container_of(shrink,
- struct nfsd_net, nfsd_reply_cache_shrinker);
+ freed += nfsd_cacherep_dispose(&dispose);
+ if (freed > sc->nr_to_scan)
+ break;
+ }
- return atomic_read(&nn->num_drc_entries);
+ trace_nfsd_drc_gc(nn, freed);
+ return freed;
}
-static unsigned long
-nfsd_reply_cache_scan(struct shrinker *shrink, struct shrink_control *sc)
-{
- struct nfsd_net *nn = container_of(shrink,
- struct nfsd_net, nfsd_reply_cache_shrinker);
-
- return prune_cache_entries(nn);
-}
/*
* Walk an xdr_buf and get a CRC for at most the first RC_CSUMLEN bytes
*/
@@ -348,8 +402,8 @@ nfsd_cache_csum(struct svc_rqst *rqstp)
}
static int
-nfsd_cache_key_cmp(const struct svc_cacherep *key,
- const struct svc_cacherep *rp, struct nfsd_net *nn)
+nfsd_cache_key_cmp(const struct nfsd_cacherep *key,
+ const struct nfsd_cacherep *rp, struct nfsd_net *nn)
{
if (key->c_key.k_xid == rp->c_key.k_xid &&
key->c_key.k_csum != rp->c_key.k_csum) {
@@ -365,11 +419,11 @@ nfsd_cache_key_cmp(const struct svc_cacherep *key,
* Must be called with cache_lock held. Returns the found entry or
* inserts an empty key on failure.
*/
-static struct svc_cacherep *
-nfsd_cache_insert(struct nfsd_drc_bucket *b, struct svc_cacherep *key,
+static struct nfsd_cacherep *
+nfsd_cache_insert(struct nfsd_drc_bucket *b, struct nfsd_cacherep *key,
struct nfsd_net *nn)
{
- struct svc_cacherep *rp, *ret = key;
+ struct nfsd_cacherep *rp, *ret = key;
struct rb_node **p = &b->rb_head.rb_node,
*parent = NULL;
unsigned int entries = 0;
@@ -378,7 +432,7 @@ nfsd_cache_insert(struct nfsd_drc_bucket *b, struct svc_cacherep *key,
while (*p != NULL) {
++entries;
parent = *p;
- rp = rb_entry(parent, struct svc_cacherep, c_node);
+ rp = rb_entry(parent, struct nfsd_cacherep, c_node);
cmp = nfsd_cache_key_cmp(key, rp, nn);
if (cmp < 0)
@@ -411,6 +465,7 @@ out:
/**
* nfsd_cache_lookup - Find an entry in the duplicate reply cache
* @rqstp: Incoming Call to find
+ * @cacherep: OUT: DRC entry for this request
*
* Try to find an entry matching the current call in the cache. When none
* is found, we try to grab the oldest expired entry off the LRU list. If
@@ -423,16 +478,17 @@ out:
* %RC_REPLY: Reply from cache
* %RC_DROPIT: Do not process the request further
*/
-int nfsd_cache_lookup(struct svc_rqst *rqstp)
+int nfsd_cache_lookup(struct svc_rqst *rqstp, struct nfsd_cacherep **cacherep)
{
struct nfsd_net *nn;
- struct svc_cacherep *rp, *found;
+ struct nfsd_cacherep *rp, *found;
__wsum csum;
struct nfsd_drc_bucket *b;
int type = rqstp->rq_cachetype;
+ unsigned long freed;
+ LIST_HEAD(dispose);
int rtn = RC_DOIT;
- rqstp->rq_cacherep = NULL;
if (type == RC_NOCACHE) {
nfsd_stats_rc_nocache_inc();
goto out;
@@ -445,7 +501,7 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp)
* preallocate an entry.
*/
nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
- rp = nfsd_reply_cache_alloc(rqstp, csum, nn);
+ rp = nfsd_cacherep_alloc(rqstp, csum, nn);
if (!rp)
goto out;
@@ -454,20 +510,18 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp)
found = nfsd_cache_insert(b, rp, nn);
if (found != rp)
goto found_entry;
-
- nfsd_stats_rc_misses_inc();
- rqstp->rq_cacherep = rp;
+ *cacherep = rp;
rp->c_state = RC_INPROG;
+ nfsd_prune_bucket_locked(nn, b, 3, &dispose);
+ spin_unlock(&b->cache_lock);
+ freed = nfsd_cacherep_dispose(&dispose);
+ trace_nfsd_drc_gc(nn, freed);
+
+ nfsd_stats_rc_misses_inc();
atomic_inc(&nn->num_drc_entries);
nfsd_stats_drc_mem_usage_add(nn, sizeof(*rp));
-
- nfsd_prune_bucket(b, nn);
-
-out_unlock:
- spin_unlock(&b->cache_lock);
-out:
- return rtn;
+ goto out;
found_entry:
/* We found a matching entry which is either in progress or done. */
@@ -505,12 +559,16 @@ found_entry:
out_trace:
trace_nfsd_drc_found(nn, rqstp, rtn);
- goto out_unlock;
+out_unlock:
+ spin_unlock(&b->cache_lock);
+out:
+ return rtn;
}
/**
* nfsd_cache_update - Update an entry in the duplicate reply cache.
* @rqstp: svc_rqst with a finished Reply
+ * @rp: IN: DRC entry for this request
* @cachetype: which cache to update
* @statp: pointer to Reply's NFS status code, or NULL
*
@@ -528,10 +586,10 @@ out_trace:
* nfsd failed to encode a reply that otherwise would have been cached.
* In this case, nfsd_cache_update is called with statp == NULL.
*/
-void nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp)
+void nfsd_cache_update(struct svc_rqst *rqstp, struct nfsd_cacherep *rp,
+ int cachetype, __be32 *statp)
{
struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
- struct svc_cacherep *rp = rqstp->rq_cacherep;
struct kvec *resv = &rqstp->rq_res.head[0], *cachv;
struct nfsd_drc_bucket *b;
int len;
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 1b8b1aab9a15..7ed02fb88a36 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -1105,6 +1105,7 @@ static ssize_t write_v4_end_grace(struct file *file, char *buf, size_t size)
if (!nn->nfsd_serv)
return -EBUSY;
trace_nfsd_end_grace(netns(file));
+ nfsd4_end_grace(nn);
break;
default:
return -EINVAL;
@@ -1131,7 +1132,7 @@ static struct inode *nfsd_get_inode(struct super_block *sb, umode_t mode)
/* Following advice from simple_fill_super documentation: */
inode->i_ino = iunique(sb, NFSD_MaxReserved);
inode->i_mode = mode;
- inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
switch (mode & S_IFMT) {
case S_IFDIR:
inode->i_fop = &simple_dir_operations;
@@ -1626,6 +1627,7 @@ static void __exit exit_nfsd(void)
}
MODULE_AUTHOR("Olaf Kirch <okir@monad.swb.de>");
+MODULE_DESCRIPTION("In-kernel NFS server");
MODULE_LICENSE("GPL");
module_init(init_nfsd)
module_exit(exit_nfsd)
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index d88498f8b275..11c14faa6c67 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -96,7 +96,12 @@ int nfsd_pool_stats_open(struct inode *, struct file *);
int nfsd_pool_stats_release(struct inode *, struct file *);
void nfsd_shutdown_threads(struct net *net);
-void nfsd_put(struct net *net);
+static inline void nfsd_put(struct net *net)
+{
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+ svc_put(nn->nfsd_serv);
+}
bool i_am_nfsd(void);
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index c291389a1d71..355bf0db3235 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -614,7 +614,7 @@ out_negative:
* @fhp: file handle to be updated
*
*/
-void fh_fill_pre_attrs(struct svc_fh *fhp)
+__be32 __must_check fh_fill_pre_attrs(struct svc_fh *fhp)
{
bool v4 = (fhp->fh_maxsize == NFS4_FHSIZE);
struct inode *inode;
@@ -622,12 +622,12 @@ void fh_fill_pre_attrs(struct svc_fh *fhp)
__be32 err;
if (fhp->fh_no_wcc || fhp->fh_pre_saved)
- return;
+ return nfs_ok;
inode = d_inode(fhp->fh_dentry);
err = fh_getattr(fhp, &stat);
if (err)
- return;
+ return err;
if (v4)
fhp->fh_pre_change = nfsd4_change_attribute(&stat, inode);
@@ -636,6 +636,7 @@ void fh_fill_pre_attrs(struct svc_fh *fhp)
fhp->fh_pre_ctime = stat.ctime;
fhp->fh_pre_size = stat.size;
fhp->fh_pre_saved = true;
+ return nfs_ok;
}
/**
@@ -643,26 +644,27 @@ void fh_fill_pre_attrs(struct svc_fh *fhp)
* @fhp: file handle to be updated
*
*/
-void fh_fill_post_attrs(struct svc_fh *fhp)
+__be32 fh_fill_post_attrs(struct svc_fh *fhp)
{
bool v4 = (fhp->fh_maxsize == NFS4_FHSIZE);
struct inode *inode = d_inode(fhp->fh_dentry);
__be32 err;
if (fhp->fh_no_wcc)
- return;
+ return nfs_ok;
if (fhp->fh_post_saved)
printk("nfsd: inode locked twice during operation.\n");
err = fh_getattr(fhp, &fhp->fh_post_attr);
if (err)
- return;
+ return err;
fhp->fh_post_saved = true;
if (v4)
fhp->fh_post_change =
nfsd4_change_attribute(&fhp->fh_post_attr, inode);
+ return nfs_ok;
}
/**
@@ -672,16 +674,20 @@ void fh_fill_post_attrs(struct svc_fh *fhp)
* This is used when the directory wasn't changed, but wcc attributes
* are needed anyway.
*/
-void fh_fill_both_attrs(struct svc_fh *fhp)
+__be32 __must_check fh_fill_both_attrs(struct svc_fh *fhp)
{
- fh_fill_post_attrs(fhp);
- if (!fhp->fh_post_saved)
- return;
+ __be32 err;
+
+ err = fh_fill_post_attrs(fhp);
+ if (err)
+ return err;
+
fhp->fh_pre_change = fhp->fh_post_change;
fhp->fh_pre_mtime = fhp->fh_post_attr.mtime;
fhp->fh_pre_ctime = fhp->fh_post_attr.ctime;
fhp->fh_pre_size = fhp->fh_post_attr.size;
fhp->fh_pre_saved = true;
+ return nfs_ok;
}
/*
diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
index 4e0ecf0ae2cf..40426f899e76 100644
--- a/fs/nfsd/nfsfh.h
+++ b/fs/nfsd/nfsfh.h
@@ -294,7 +294,7 @@ static inline void fh_clear_pre_post_attrs(struct svc_fh *fhp)
}
u64 nfsd4_change_attribute(struct kstat *stat, struct inode *inode);
-extern void fh_fill_pre_attrs(struct svc_fh *fhp);
-extern void fh_fill_post_attrs(struct svc_fh *fhp);
-extern void fh_fill_both_attrs(struct svc_fh *fhp);
+__be32 __must_check fh_fill_pre_attrs(struct svc_fh *fhp);
+__be32 fh_fill_post_attrs(struct svc_fh *fhp);
+__be32 __must_check fh_fill_both_attrs(struct svc_fh *fhp);
#endif /* _LINUX_NFSD_NFSFH_H */
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 2154fa63c5f2..c7af1095f6b5 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -542,9 +542,14 @@ static struct notifier_block nfsd_inet6addr_notifier = {
/* Only used under nfsd_mutex, so this atomic may be overkill: */
static atomic_t nfsd_notifier_refcount = ATOMIC_INIT(0);
-static void nfsd_last_thread(struct svc_serv *serv, struct net *net)
+static void nfsd_last_thread(struct net *net)
{
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ struct svc_serv *serv = nn->nfsd_serv;
+
+ spin_lock(&nfsd_notifier_lock);
+ nn->nfsd_serv = NULL;
+ spin_unlock(&nfsd_notifier_lock);
/* check if the notifier still has clients */
if (atomic_dec_return(&nfsd_notifier_refcount) == 0) {
@@ -554,6 +559,8 @@ static void nfsd_last_thread(struct svc_serv *serv, struct net *net)
#endif
}
+ svc_xprt_destroy_all(serv, net);
+
/*
* write_ports can create the server without actually starting
* any threads--if we get shut down before any threads are
@@ -644,7 +651,8 @@ void nfsd_shutdown_threads(struct net *net)
svc_get(serv);
/* Kill outstanding nfsd threads */
svc_set_num_threads(serv, NULL, 0);
- nfsd_put(net);
+ nfsd_last_thread(net);
+ svc_put(serv);
mutex_unlock(&nfsd_mutex);
}
@@ -674,9 +682,6 @@ int nfsd_create_serv(struct net *net)
serv->sv_maxconn = nn->max_connections;
error = svc_bind(serv, net);
if (error < 0) {
- /* NOT nfsd_put() as notifiers (see below) haven't
- * been set up yet.
- */
svc_put(serv);
return error;
}
@@ -719,29 +724,6 @@ int nfsd_get_nrthreads(int n, int *nthreads, struct net *net)
return 0;
}
-/* This is the callback for kref_put() below.
- * There is no code here as the first thing to be done is
- * call svc_shutdown_net(), but we cannot get the 'net' from
- * the kref. So do all the work when kref_put returns true.
- */
-static void nfsd_noop(struct kref *ref)
-{
-}
-
-void nfsd_put(struct net *net)
-{
- struct nfsd_net *nn = net_generic(net, nfsd_net_id);
-
- if (kref_put(&nn->nfsd_serv->sv_refcnt, nfsd_noop)) {
- svc_xprt_destroy_all(nn->nfsd_serv, net);
- nfsd_last_thread(nn->nfsd_serv, net);
- svc_destroy(&nn->nfsd_serv->sv_refcnt);
- spin_lock(&nfsd_notifier_lock);
- nn->nfsd_serv = NULL;
- spin_unlock(&nfsd_notifier_lock);
- }
-}
-
int nfsd_set_nrthreads(int n, int *nthreads, struct net *net)
{
int i = 0;
@@ -792,7 +774,7 @@ int nfsd_set_nrthreads(int n, int *nthreads, struct net *net)
if (err)
break;
}
- nfsd_put(net);
+ svc_put(nn->nfsd_serv);
return err;
}
@@ -807,6 +789,7 @@ nfsd_svc(int nrservs, struct net *net, const struct cred *cred)
int error;
bool nfsd_up_before;
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ struct svc_serv *serv;
mutex_lock(&nfsd_mutex);
dprintk("nfsd: creating service\n");
@@ -826,22 +809,25 @@ nfsd_svc(int nrservs, struct net *net, const struct cred *cred)
goto out;
nfsd_up_before = nn->nfsd_net_up;
+ serv = nn->nfsd_serv;
error = nfsd_startup_net(net, cred);
if (error)
goto out_put;
- error = svc_set_num_threads(nn->nfsd_serv, NULL, nrservs);
+ error = svc_set_num_threads(serv, NULL, nrservs);
if (error)
goto out_shutdown;
- error = nn->nfsd_serv->sv_nrthreads;
+ error = serv->sv_nrthreads;
+ if (error == 0)
+ nfsd_last_thread(net);
out_shutdown:
if (error < 0 && !nfsd_up_before)
nfsd_shutdown_net(net);
out_put:
/* Threads now hold service active */
if (xchg(&nn->keep_active, 0))
- nfsd_put(net);
- nfsd_put(net);
+ svc_put(serv);
+ svc_put(serv);
out:
mutex_unlock(&nfsd_mutex);
return error;
@@ -953,7 +939,6 @@ nfsd(void *vrqstp)
struct svc_xprt *perm_sock = list_entry(rqstp->rq_server->sv_permsocks.next, typeof(struct svc_xprt), xpt_list);
struct net *net = perm_sock->xpt_net;
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
- int err;
/* At this point, the thread shares current->fs
* with the init process. We need to create files with the
@@ -965,15 +950,6 @@ nfsd(void *vrqstp)
current->fs->umask = 0;
- /*
- * thread is spawned with all signals set to SIG_IGN, re-enable
- * the ones that will bring down the thread
- */
- allow_signal(SIGKILL);
- allow_signal(SIGHUP);
- allow_signal(SIGINT);
- allow_signal(SIGQUIT);
-
atomic_inc(&nfsdstats.th_cnt);
set_freezable();
@@ -981,54 +957,19 @@ nfsd(void *vrqstp)
/*
* The main request loop
*/
- for (;;) {
+ while (!kthread_should_stop()) {
/* Update sv_maxconn if it has changed */
rqstp->rq_server->sv_maxconn = nn->max_connections;
- /*
- * Find a socket with data available and call its
- * recvfrom routine.
- */
- while ((err = svc_recv(rqstp, 60*60*HZ)) == -EAGAIN)
- ;
- if (err == -EINTR)
- break;
- validate_process_creds();
- svc_process(rqstp);
+ svc_recv(rqstp);
validate_process_creds();
}
- /* Clear signals before calling svc_exit_thread() */
- flush_signals(current);
-
atomic_dec(&nfsdstats.th_cnt);
out:
- /* Take an extra ref so that the svc_put in svc_exit_thread()
- * doesn't call svc_destroy()
- */
- svc_get(nn->nfsd_serv);
-
/* Release the thread */
svc_exit_thread(rqstp);
-
- /* We need to drop a ref, but may not drop the last reference
- * without holding nfsd_mutex, and we cannot wait for nfsd_mutex as that
- * could deadlock with nfsd_shutdown_threads() waiting for us.
- * So three options are:
- * - drop a non-final reference,
- * - get the mutex without waiting
- * - sleep briefly andd try the above again
- */
- while (!svc_put_not_last(nn->nfsd_serv)) {
- if (mutex_trylock(&nfsd_mutex)) {
- nfsd_put(net);
- mutex_unlock(&nfsd_mutex);
- break;
- }
- msleep(20);
- }
-
return 0;
}
@@ -1046,6 +987,7 @@ int nfsd_dispatch(struct svc_rqst *rqstp)
{
const struct svc_procedure *proc = rqstp->rq_procinfo;
__be32 *statp = rqstp->rq_accept_statp;
+ struct nfsd_cacherep *rp;
/*
* Give the xdr decoder a chance to change this if it wants
@@ -1056,7 +998,8 @@ int nfsd_dispatch(struct svc_rqst *rqstp)
if (!proc->pc_decode(rqstp, &rqstp->rq_arg_stream))
goto out_decode_err;
- switch (nfsd_cache_lookup(rqstp)) {
+ rp = NULL;
+ switch (nfsd_cache_lookup(rqstp, &rp)) {
case RC_DOIT:
break;
case RC_REPLY:
@@ -1072,7 +1015,7 @@ int nfsd_dispatch(struct svc_rqst *rqstp)
if (!proc->pc_encode(rqstp, &rqstp->rq_res_stream))
goto out_encode_err;
- nfsd_cache_update(rqstp, rqstp->rq_cachetype, statp + 1);
+ nfsd_cache_update(rqstp, rp, rqstp->rq_cachetype, statp + 1);
out_cached_reply:
return 1;
@@ -1082,13 +1025,13 @@ out_decode_err:
return 1;
out_update_drop:
- nfsd_cache_update(rqstp, RC_NOCACHE, NULL);
+ nfsd_cache_update(rqstp, rp, RC_NOCACHE, NULL);
out_dropit:
return 0;
out_encode_err:
trace_nfsd_cant_encode_err(rqstp);
- nfsd_cache_update(rqstp, RC_NOCACHE, NULL);
+ nfsd_cache_update(rqstp, rp, RC_NOCACHE, NULL);
*statp = rpc_system_err;
return 1;
}
@@ -1139,11 +1082,12 @@ int nfsd_pool_stats_open(struct inode *inode, struct file *file)
int nfsd_pool_stats_release(struct inode *inode, struct file *file)
{
+ struct seq_file *seq = file->private_data;
+ struct svc_serv *serv = seq->private;
int ret = seq_release(inode, file);
- struct net *net = inode->i_sb->s_fs_info;
mutex_lock(&nfsd_mutex);
- nfsd_put(net);
+ svc_put(serv);
mutex_unlock(&nfsd_mutex);
return ret;
}
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index d49d3060ed4f..cbddcf484dba 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -732,4 +732,7 @@ static inline bool try_to_expire_client(struct nfs4_client *clp)
cmpxchg(&clp->cl_state, NFSD4_COURTESY, NFSD4_EXPIRABLE);
return clp->cl_state == NFSD4_EXPIRABLE;
}
+
+extern __be32 nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp,
+ struct inode *inode);
#endif /* NFSD4_STATE_H */
diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c
index 777e24e5da33..63797635e1c3 100644
--- a/fs/nfsd/stats.c
+++ b/fs/nfsd/stats.c
@@ -65,6 +65,8 @@ static int nfsd_show(struct seq_file *seq, void *v)
seq_printf(seq, " %lld",
percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_NFS4_OP(i)]));
}
+ seq_printf(seq, "\nwdeleg_getattr %lld",
+ percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_WDELEG_GETATTR]));
seq_putc(seq, '\n');
#endif
diff --git a/fs/nfsd/stats.h b/fs/nfsd/stats.h
index 9b43dc3d9991..cf5524e7ca06 100644
--- a/fs/nfsd/stats.h
+++ b/fs/nfsd/stats.h
@@ -22,6 +22,7 @@ enum {
NFSD_STATS_FIRST_NFS4_OP, /* count of individual nfsv4 operations */
NFSD_STATS_LAST_NFS4_OP = NFSD_STATS_FIRST_NFS4_OP + LAST_NFS4_OP,
#define NFSD_STATS_NFS4_OP(op) (NFSD_STATS_FIRST_NFS4_OP + (op))
+ NFSD_STATS_WDELEG_GETATTR, /* count of getattr conflict with wdeleg */
#endif
NFSD_STATS_COUNTERS_NUM
};
@@ -93,4 +94,10 @@ static inline void nfsd_stats_drc_mem_usage_sub(struct nfsd_net *nn, s64 amount)
percpu_counter_sub(&nn->counter[NFSD_NET_DRC_MEM_USAGE], amount);
}
+#ifdef CONFIG_NFSD_V4
+static inline void nfsd_stats_wdeleg_getattr_inc(void)
+{
+ percpu_counter_inc(&nfsdstats.counter[NFSD_STATS_WDELEG_GETATTR]);
+}
+#endif
#endif /* _NFSD_STATS_H */
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index 2af74983f146..803904348871 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -607,6 +607,7 @@ DEFINE_STATEID_EVENT(layout_recall_release);
DEFINE_STATEID_EVENT(open);
DEFINE_STATEID_EVENT(deleg_read);
+DEFINE_STATEID_EVENT(deleg_write);
DEFINE_STATEID_EVENT(deleg_return);
DEFINE_STATEID_EVENT(deleg_recall);
@@ -1240,8 +1241,8 @@ TRACE_EVENT(nfsd_drc_found,
TRACE_EVENT(nfsd_drc_mismatch,
TP_PROTO(
const struct nfsd_net *nn,
- const struct svc_cacherep *key,
- const struct svc_cacherep *rp
+ const struct nfsd_cacherep *key,
+ const struct nfsd_cacherep *rp
),
TP_ARGS(nn, key, rp),
TP_STRUCT__entry(
@@ -1261,6 +1262,28 @@ TRACE_EVENT(nfsd_drc_mismatch,
__entry->ingress)
);
+TRACE_EVENT_CONDITION(nfsd_drc_gc,
+ TP_PROTO(
+ const struct nfsd_net *nn,
+ unsigned long freed
+ ),
+ TP_ARGS(nn, freed),
+ TP_CONDITION(freed > 0),
+ TP_STRUCT__entry(
+ __field(unsigned long long, boot_time)
+ __field(unsigned long, freed)
+ __field(int, total)
+ ),
+ TP_fast_assign(
+ __entry->boot_time = nn->boot_time;
+ __entry->freed = freed;
+ __entry->total = atomic_read(&nn->num_drc_entries);
+ ),
+ TP_printk("boot_time=%16llx total=%d freed=%lu",
+ __entry->boot_time, __entry->total, __entry->freed
+ )
+);
+
TRACE_EVENT(nfsd_cb_args,
TP_PROTO(
const struct nfs4_client *clp,
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 2c9074ab2315..48260cf68fde 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -520,7 +520,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp,
nfsd_sanitize_attrs(inode, iap);
- if (check_guard && guardtime != inode->i_ctime.tv_sec)
+ if (check_guard && guardtime != inode_get_ctime(inode).tv_sec)
return nfserr_notsync;
/*
@@ -1540,7 +1540,9 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
dput(dchild);
if (err)
goto out_unlock;
- fh_fill_pre_attrs(fhp);
+ err = fh_fill_pre_attrs(fhp);
+ if (err != nfs_ok)
+ goto out_unlock;
err = nfsd_create_locked(rqstp, fhp, attrs, type, rdev, resfhp);
fh_fill_post_attrs(fhp);
out_unlock:
@@ -1635,13 +1637,16 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,
inode_unlock(dentry->d_inode);
goto out_drop_write;
}
- fh_fill_pre_attrs(fhp);
+ err = fh_fill_pre_attrs(fhp);
+ if (err != nfs_ok)
+ goto out_unlock;
host_err = vfs_symlink(&nop_mnt_idmap, d_inode(dentry), dnew, path);
err = nfserrno(host_err);
cerr = fh_compose(resfhp, fhp->fh_export, dnew, fhp);
if (!err)
nfsd_create_setattr(rqstp, fhp, resfhp, attrs);
fh_fill_post_attrs(fhp);
+out_unlock:
inode_unlock(dentry->d_inode);
if (!err)
err = nfserrno(commit_metadata(fhp));
@@ -1703,7 +1708,9 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
err = nfserr_noent;
if (d_really_is_negative(dold))
goto out_dput;
- fh_fill_pre_attrs(ffhp);
+ err = fh_fill_pre_attrs(ffhp);
+ if (err != nfs_ok)
+ goto out_dput;
host_err = vfs_link(dold, &nop_mnt_idmap, dirp, dnew, NULL);
fh_fill_post_attrs(ffhp);
inode_unlock(dirp);
@@ -1789,8 +1796,12 @@ retry:
}
trap = lock_rename(tdentry, fdentry);
- fh_fill_pre_attrs(ffhp);
- fh_fill_pre_attrs(tfhp);
+ err = fh_fill_pre_attrs(ffhp);
+ if (err != nfs_ok)
+ goto out_unlock;
+ err = fh_fill_pre_attrs(tfhp);
+ if (err != nfs_ok)
+ goto out_unlock;
odentry = lookup_one_len(fname, fdentry, flen);
host_err = PTR_ERR(odentry);
@@ -1857,6 +1868,7 @@ retry:
fh_fill_post_attrs(ffhp);
fh_fill_post_attrs(tfhp);
}
+out_unlock:
unlock_rename(tdentry, fdentry);
fh_drop_write(ffhp);
@@ -1916,12 +1928,14 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
goto out_unlock;
}
rinode = d_inode(rdentry);
- ihold(rinode);
+ err = fh_fill_pre_attrs(fhp);
+ if (err != nfs_ok)
+ goto out_unlock;
+ ihold(rinode);
if (!type)
type = d_inode(rdentry)->i_mode & S_IFMT;
- fh_fill_pre_attrs(fhp);
if (type != S_IFDIR) {
int retries;
@@ -2341,16 +2355,18 @@ nfsd_removexattr(struct svc_rqst *rqstp, struct svc_fh *fhp, char *name)
return nfserrno(ret);
inode_lock(fhp->fh_dentry->d_inode);
- fh_fill_pre_attrs(fhp);
-
+ err = fh_fill_pre_attrs(fhp);
+ if (err != nfs_ok)
+ goto out_unlock;
ret = __vfs_removexattr_locked(&nop_mnt_idmap, fhp->fh_dentry,
name, NULL);
-
+ err = nfsd_xattr_errno(ret);
fh_fill_post_attrs(fhp);
+out_unlock:
inode_unlock(fhp->fh_dentry->d_inode);
fh_drop_write(fhp);
- return nfsd_xattr_errno(ret);
+ return err;
}
__be32
@@ -2368,15 +2384,17 @@ nfsd_setxattr(struct svc_rqst *rqstp, struct svc_fh *fhp, char *name,
if (ret)
return nfserrno(ret);
inode_lock(fhp->fh_dentry->d_inode);
- fh_fill_pre_attrs(fhp);
-
- ret = __vfs_setxattr_locked(&nop_mnt_idmap, fhp->fh_dentry, name, buf,
- len, flags, NULL);
+ err = fh_fill_pre_attrs(fhp);
+ if (err != nfs_ok)
+ goto out_unlock;
+ ret = __vfs_setxattr_locked(&nop_mnt_idmap, fhp->fh_dentry,
+ name, buf, len, flags, NULL);
fh_fill_post_attrs(fhp);
+ err = nfsd_xattr_errno(ret);
+out_unlock:
inode_unlock(fhp->fh_dentry->d_inode);
fh_drop_write(fhp);
-
- return nfsd_xattr_errno(ret);
+ return err;
}
#endif
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 510978e602da..9d918a79dc16 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -774,17 +774,6 @@ void warn_on_nonidempotent_op(struct nfsd4_op *op);
#define NFS4_SVC_XDRSIZE sizeof(struct nfsd4_compoundargs)
-static inline void
-set_change_info(struct nfsd4_change_info *cinfo, struct svc_fh *fhp)
-{
- BUG_ON(!fhp->fh_pre_saved);
- cinfo->atomic = (u32)(fhp->fh_post_saved && !fhp->fh_no_atomic_attr);
-
- cinfo->before_change = fhp->fh_pre_change;
- cinfo->after_change = fhp->fh_post_change;
-}
-
-
bool nfsd4_mach_creds_match(struct nfs4_client *cl, struct svc_rqst *rqstp);
bool nfs4svc_decode_compoundargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
bool nfs4svc_encode_compoundres(struct svc_rqst *rqstp, struct xdr_stream *xdr);
diff --git a/fs/nilfs2/Kconfig b/fs/nilfs2/Kconfig
index 7d59567465e1..7dae168e346e 100644
--- a/fs/nilfs2/Kconfig
+++ b/fs/nilfs2/Kconfig
@@ -1,6 +1,7 @@
# SPDX-License-Identifier: GPL-2.0-only
config NILFS2_FS
tristate "NILFS2 file system support"
+ select BUFFER_HEAD
select CRC32
select LEGACY_DIRECT_IO
help
diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
index 6ce8617b562d..7342de296ec3 100644
--- a/fs/nilfs2/alloc.c
+++ b/fs/nilfs2/alloc.c
@@ -205,7 +205,8 @@ static int nilfs_palloc_get_block(struct inode *inode, unsigned long blkoff,
int ret;
spin_lock(lock);
- if (prev->bh && blkoff == prev->blkoff) {
+ if (prev->bh && blkoff == prev->blkoff &&
+ likely(buffer_uptodate(prev->bh))) {
get_bh(prev->bh);
*bhp = prev->bh;
spin_unlock(lock);
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index decd6471300b..bce734b68f08 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -429,7 +429,7 @@ void nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de,
nilfs_set_de_type(de, inode);
nilfs_commit_chunk(page, mapping, from, to);
nilfs_put_page(page);
- dir->i_mtime = dir->i_ctime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
}
/*
@@ -519,7 +519,7 @@ got_it:
de->inode = cpu_to_le64(inode->i_ino);
nilfs_set_de_type(de, inode);
nilfs_commit_chunk(page, page->mapping, from, to);
- dir->i_mtime = dir->i_ctime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
nilfs_mark_inode_dirty(dir);
/* OFFSET_CACHE */
out_put:
@@ -567,7 +567,7 @@ int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct page *page)
pde->rec_len = nilfs_rec_len_to_disk(to - from);
dir->inode = 0;
nilfs_commit_chunk(page, mapping, from, to);
- inode->i_ctime = inode->i_mtime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
out:
nilfs_put_page(page);
return err;
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index a9eb3487efb2..740ce26d1e76 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -108,7 +108,7 @@ static vm_fault_t nilfs_page_mkwrite(struct vm_fault *vmf)
wait_for_stable_page(page);
out:
sb_end_pagefault(inode->i_sb);
- return block_page_mkwrite_return(ret);
+ return vmf_fs_error(ret);
}
static const struct vm_operations_struct nilfs_file_vm_ops = {
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index 48fe71d309cb..8beb2730929d 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -73,10 +73,8 @@ int nilfs_gccache_submit_read_data(struct inode *inode, sector_t blkoff,
struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
err = nilfs_dat_translate(nilfs->ns_dat, vbn, &pbn);
- if (unlikely(err)) { /* -EIO, -ENOMEM, -ENOENT */
- brelse(bh);
+ if (unlikely(err)) /* -EIO, -ENOMEM, -ENOENT */
goto failed;
- }
}
lock_buffer(bh);
@@ -102,6 +100,8 @@ int nilfs_gccache_submit_read_data(struct inode *inode, sector_t blkoff,
failed:
unlock_page(bh->b_page);
put_page(bh->b_page);
+ if (unlikely(err))
+ brelse(bh);
return err;
}
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 35bc79305318..1a8bd5993476 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -366,7 +366,7 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t mode)
atomic64_inc(&root->inodes_count);
inode_init_owner(&nop_mnt_idmap, inode, dir, mode);
inode->i_ino = ino;
- inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) {
err = nilfs_bmap_read(ii->i_bmap, NULL);
@@ -450,10 +450,10 @@ int nilfs_read_inode_common(struct inode *inode,
set_nlink(inode, le16_to_cpu(raw_inode->i_links_count));
inode->i_size = le64_to_cpu(raw_inode->i_size);
inode->i_atime.tv_sec = le64_to_cpu(raw_inode->i_mtime);
- inode->i_ctime.tv_sec = le64_to_cpu(raw_inode->i_ctime);
+ inode_set_ctime(inode, le64_to_cpu(raw_inode->i_ctime),
+ le32_to_cpu(raw_inode->i_ctime_nsec));
inode->i_mtime.tv_sec = le64_to_cpu(raw_inode->i_mtime);
inode->i_atime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec);
- inode->i_ctime.tv_nsec = le32_to_cpu(raw_inode->i_ctime_nsec);
inode->i_mtime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec);
if (nilfs_is_metadata_file_inode(inode) && !S_ISREG(inode->i_mode))
return -EIO; /* this inode is for metadata and corrupted */
@@ -768,9 +768,9 @@ void nilfs_write_inode_common(struct inode *inode,
raw_inode->i_gid = cpu_to_le32(i_gid_read(inode));
raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
raw_inode->i_size = cpu_to_le64(inode->i_size);
- raw_inode->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
+ raw_inode->i_ctime = cpu_to_le64(inode_get_ctime(inode).tv_sec);
raw_inode->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec);
- raw_inode->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+ raw_inode->i_ctime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec);
raw_inode->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
raw_inode->i_blocks = cpu_to_le64(inode->i_blocks);
@@ -875,7 +875,7 @@ void nilfs_truncate(struct inode *inode)
nilfs_truncate_bmap(ii, blkoff);
- inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
if (IS_SYNC(inode))
nilfs_set_transaction_flag(NILFS_TI_SYNC);
@@ -1025,7 +1025,7 @@ int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh)
int err;
spin_lock(&nilfs->ns_inode_lock);
- if (ii->i_bh == NULL) {
+ if (ii->i_bh == NULL || unlikely(!buffer_uptodate(ii->i_bh))) {
spin_unlock(&nilfs->ns_inode_lock);
err = nilfs_ifile_get_inode_block(ii->i_root->ifile,
inode->i_ino, pbh);
@@ -1034,7 +1034,10 @@ int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh)
spin_lock(&nilfs->ns_inode_lock);
if (ii->i_bh == NULL)
ii->i_bh = *pbh;
- else {
+ else if (unlikely(!buffer_uptodate(ii->i_bh))) {
+ __brelse(ii->i_bh);
+ ii->i_bh = *pbh;
+ } else {
brelse(*pbh);
*pbh = ii->i_bh;
}
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index 1dfbc0c34513..40ffade49f38 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -149,7 +149,7 @@ int nilfs_fileattr_set(struct mnt_idmap *idmap,
NILFS_I(inode)->i_flags = oldflags | (flags & FS_FL_USER_MODIFIABLE);
nilfs_set_inode_flags(inode);
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
if (IS_SYNC(inode))
nilfs_set_transaction_flag(NILFS_TI_SYNC);
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index c7024da8f1e2..2a4e7f4a8102 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -185,7 +185,7 @@ static int nilfs_link(struct dentry *old_dentry, struct inode *dir,
if (err)
return err;
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
inode_inc_link_count(inode);
ihold(inode);
@@ -283,7 +283,7 @@ static int nilfs_do_unlink(struct inode *dir, struct dentry *dentry)
if (err)
goto out;
- inode->i_ctime = dir->i_ctime;
+ inode_set_ctime_to_ts(inode, inode_get_ctime(dir));
drop_nlink(inode);
err = 0;
out:
@@ -387,7 +387,7 @@ static int nilfs_rename(struct mnt_idmap *idmap,
goto out_dir;
nilfs_set_link(new_dir, new_de, new_page, old_inode);
nilfs_mark_inode_dirty(new_dir);
- new_inode->i_ctime = current_time(new_inode);
+ inode_set_ctime_current(new_inode);
if (dir_de)
drop_nlink(new_inode);
drop_nlink(new_inode);
@@ -406,7 +406,7 @@ static int nilfs_rename(struct mnt_idmap *idmap,
* Like most other Unix systems, set the ctime for inodes on a
* rename.
*/
- old_inode->i_ctime = current_time(old_inode);
+ inode_set_ctime_current(old_inode);
nilfs_delete_entry(old_de, old_page);
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 581691e4be49..7ec16879756e 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -725,6 +725,11 @@ static size_t nilfs_lookup_dirty_data_buffers(struct inode *inode,
struct folio *folio = fbatch.folios[i];
folio_lock(folio);
+ if (unlikely(folio->mapping != mapping)) {
+ /* Exclude folios removed from the address space */
+ folio_unlock(folio);
+ continue;
+ }
head = folio_buffers(folio);
if (!head) {
create_empty_buffers(&folio->page, i_blocksize(inode), 0);
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 0ef8c71bde8e..a5d1fa4e7552 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -35,6 +35,7 @@
#include <linux/writeback.h>
#include <linux/seq_file.h>
#include <linux/mount.h>
+#include <linux/fs_context.h>
#include "nilfs.h"
#include "export.h"
#include "mdt.h"
@@ -1216,7 +1217,6 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
}
struct nilfs_super_data {
- struct block_device *bdev;
__u64 cno;
int flags;
};
@@ -1283,64 +1283,49 @@ static int nilfs_identify(char *data, struct nilfs_super_data *sd)
static int nilfs_set_bdev_super(struct super_block *s, void *data)
{
- s->s_bdev = data;
- s->s_dev = s->s_bdev->bd_dev;
+ s->s_dev = *(dev_t *)data;
return 0;
}
static int nilfs_test_bdev_super(struct super_block *s, void *data)
{
- return (void *)s->s_bdev == data;
+ return !(s->s_iflags & SB_I_RETIRED) && s->s_dev == *(dev_t *)data;
}
static struct dentry *
nilfs_mount(struct file_system_type *fs_type, int flags,
const char *dev_name, void *data)
{
- struct nilfs_super_data sd;
+ struct nilfs_super_data sd = { .flags = flags };
struct super_block *s;
- struct dentry *root_dentry;
- int err, s_new = false;
+ dev_t dev;
+ int err;
- sd.bdev = blkdev_get_by_path(dev_name, sb_open_mode(flags), fs_type,
- NULL);
- if (IS_ERR(sd.bdev))
- return ERR_CAST(sd.bdev);
+ if (nilfs_identify(data, &sd))
+ return ERR_PTR(-EINVAL);
- sd.cno = 0;
- sd.flags = flags;
- if (nilfs_identify((char *)data, &sd)) {
- err = -EINVAL;
- goto failed;
- }
+ err = lookup_bdev(dev_name, &dev);
+ if (err)
+ return ERR_PTR(err);
- /*
- * once the super is inserted into the list by sget, s_umount
- * will protect the lockfs code from trying to start a snapshot
- * while we are mounting
- */
- mutex_lock(&sd.bdev->bd_fsfreeze_mutex);
- if (sd.bdev->bd_fsfreeze_count > 0) {
- mutex_unlock(&sd.bdev->bd_fsfreeze_mutex);
- err = -EBUSY;
- goto failed;
- }
s = sget(fs_type, nilfs_test_bdev_super, nilfs_set_bdev_super, flags,
- sd.bdev);
- mutex_unlock(&sd.bdev->bd_fsfreeze_mutex);
- if (IS_ERR(s)) {
- err = PTR_ERR(s);
- goto failed;
- }
+ &dev);
+ if (IS_ERR(s))
+ return ERR_CAST(s);
if (!s->s_root) {
- s_new = true;
-
- /* New superblock instance created */
- snprintf(s->s_id, sizeof(s->s_id), "%pg", sd.bdev);
- sb_set_blocksize(s, block_size(sd.bdev));
-
- err = nilfs_fill_super(s, data, flags & SB_SILENT ? 1 : 0);
+ /*
+ * We drop s_umount here because we need to open the bdev and
+ * bdev->open_mutex ranks above s_umount (blkdev_put() ->
+ * __invalidate_device()). It is safe because we have active sb
+ * reference and SB_BORN is not set yet.
+ */
+ up_write(&s->s_umount);
+ err = setup_bdev_super(s, flags, NULL);
+ down_write(&s->s_umount);
+ if (!err)
+ err = nilfs_fill_super(s, data,
+ flags & SB_SILENT ? 1 : 0);
if (err)
goto failed_super;
@@ -1366,24 +1351,18 @@ nilfs_mount(struct file_system_type *fs_type, int flags,
}
if (sd.cno) {
+ struct dentry *root_dentry;
+
err = nilfs_attach_snapshot(s, sd.cno, &root_dentry);
if (err)
goto failed_super;
- } else {
- root_dentry = dget(s->s_root);
+ return root_dentry;
}
- if (!s_new)
- blkdev_put(sd.bdev, fs_type);
-
- return root_dentry;
+ return dget(s->s_root);
failed_super:
deactivate_locked_super(s);
-
- failed:
- if (!s_new)
- blkdev_put(sd.bdev, fs_type);
return ERR_PTR(err);
}
diff --git a/fs/nls/Kconfig b/fs/nls/Kconfig
index c7857e36adbb..2a601af6f3bd 100644
--- a/fs/nls/Kconfig
+++ b/fs/nls/Kconfig
@@ -617,4 +617,7 @@ config NLS_UTF8
input/output character sets. Say Y here for the UTF-8 encoding of
the Unicode/ISO9646 universal character set.
+config NLS_UCS2_UTILS
+ tristate
+
endif # NLS
diff --git a/fs/nls/Makefile b/fs/nls/Makefile
index ac54db297128..5062c699d041 100644
--- a/fs/nls/Makefile
+++ b/fs/nls/Makefile
@@ -54,3 +54,4 @@ obj-$(CONFIG_NLS_MAC_INUIT) += mac-inuit.o
obj-$(CONFIG_NLS_MAC_ROMANIAN) += mac-romanian.o
obj-$(CONFIG_NLS_MAC_ROMAN) += mac-roman.o
obj-$(CONFIG_NLS_MAC_TURKISH) += mac-turkish.o
+obj-$(CONFIG_NLS_UCS2_UTILS) += nls_ucs2_utils.o
diff --git a/fs/nls/nls_ucs2_data.h b/fs/nls/nls_ucs2_data.h
new file mode 100644
index 000000000000..1f454dc0f4e0
--- /dev/null
+++ b/fs/nls/nls_ucs2_data.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+#ifndef _NLS_UCS2_DATA_H
+#define _NLS_UCS2_DATA_H
+
+struct UniCaseRange {
+ wchar_t start;
+ wchar_t end;
+ signed char *table;
+};
+
+extern signed char NlsUniUpperTable[512];
+extern const struct UniCaseRange NlsUniUpperRange[];
+
+#endif /* _NLS_UCS2_DATA_H */
diff --git a/fs/smb/server/uniupr.h b/fs/nls/nls_ucs2_utils.c
index 26583b776897..a69781c54dd8 100644
--- a/fs/smb/server/uniupr.h
+++ b/fs/nls/nls_ucs2_utils.c
@@ -1,19 +1,27 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Some of the source code in this file came from fs/cifs/uniupr.h
* Copyright (c) International Business Machines Corp., 2000,2002
*
- * uniupr.h - Unicode compressed case ranges
+ * Some of the source code in this file came from fs/cifs/cifs_unicode.c
+ *
+ * Copyright (c) International Business Machines Corp., 2000,2009
+ * Modified by Steve French (sfrench@us.ibm.com)
+ * Modified by Namjae Jeon (linkinjeon@kernel.org)
*
*/
-#ifndef __KSMBD_UNIUPR_H
-#define __KSMBD_UNIUPR_H
+#include <linux/fs.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <asm/unaligned.h>
+#include "nls_ucs2_utils.h"
+
+MODULE_LICENSE("GPL");
-#ifndef UNIUPR_NOUPPER
/*
* Latin upper case
*/
-signed char SmbUniUpperTable[512] = {
+signed char NlsUniUpperTable[512] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 000-00f */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 010-01f */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 020-02f */
@@ -51,6 +59,7 @@ signed char SmbUniUpperTable[512] = {
0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e0-1ef */
0, 0, -1, -2, 0, -1, 0, 0, 0, -1, 0, -1, 0, -1, 0, -1, /* 1f0-1ff */
};
+EXPORT_SYMBOL_GPL(NlsUniUpperTable);
/* Upper case range - Greek */
static signed char UniCaseRangeU03a0[47] = {
@@ -126,7 +135,7 @@ static signed char UniCaseRangeUff40[27] = {
/*
* Upper Case Range
*/
-const struct UniCaseRange SmbUniUpperRange[] = {
+const struct UniCaseRange NlsUniUpperRange[] = {
{0x03a0, 0x03ce, UniCaseRangeU03a0},
{0x0430, 0x045f, UniCaseRangeU0430},
{0x0490, 0x04cc, UniCaseRangeU0490},
@@ -134,135 +143,4 @@ const struct UniCaseRange SmbUniUpperRange[] = {
{0xff40, 0xff5a, UniCaseRangeUff40},
{0}
};
-#endif
-
-#ifndef UNIUPR_NOLOWER
-/*
- * Latin lower case
- */
-signed char CifsUniLowerTable[512] = {
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 000-00f */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 010-01f */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 020-02f */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 030-03f */
- 0, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, /* 040-04f */
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 0, 0,
- 0, 0, 0, /* 050-05f */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 060-06f */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 070-07f */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 080-08f */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 090-09f */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0a0-0af */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0b0-0bf */
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, 32, /* 0c0-0cf */
- 32, 32, 32, 32, 32, 32, 32, 0, 32, 32, 32, 32,
- 32, 32, 32, 0, /* 0d0-0df */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0e0-0ef */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0f0-0ff */
- 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 100-10f */
- 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 110-11f */
- 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 120-12f */
- 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, /* 130-13f */
- 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, /* 140-14f */
- 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 150-15f */
- 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 160-16f */
- 1, 0, 1, 0, 1, 0, 1, 0, -121, 1, 0, 1, 0, 1, 0,
- 0, /* 170-17f */
- 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 79,
- 0, /* 180-18f */
- 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, /* 190-19f */
- 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, /* 1a0-1af */
- 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, /* 1b0-1bf */
- 0, 0, 0, 0, 2, 1, 0, 2, 1, 0, 2, 1, 0, 1, 0, 1, /* 1c0-1cf */
- 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, /* 1d0-1df */
- 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e0-1ef */
- 0, 2, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1f0-1ff */
-};
-
-/* Lower case range - Greek */
-static signed char UniCaseRangeL0380[44] = {
- 0, 0, 0, 0, 0, 0, 38, 0, 37, 37, 37, 0, 64, 0, 63, 63, /* 380-38f */
- 0, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, /* 390-39f */
- 32, 32, 0, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-};
-
-/* Lower case range - Cyrillic */
-static signed char UniCaseRangeL0400[48] = {
- 0, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80,
- 0, 80, 80, /* 400-40f */
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, /* 410-41f */
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, 32, 32, /* 420-42f */
-};
-
-/* Lower case range - Extended cyrillic */
-static signed char UniCaseRangeL0490[60] = {
- 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 490-49f */
- 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 4a0-4af */
- 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 4b0-4bf */
- 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
-};
-
-/* Lower case range - Extended latin and greek */
-static signed char UniCaseRangeL1e00[504] = {
- 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e00-1e0f */
- 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e10-1e1f */
- 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e20-1e2f */
- 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e30-1e3f */
- 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e40-1e4f */
- 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e50-1e5f */
- 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e60-1e6f */
- 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e70-1e7f */
- 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e80-1e8f */
- 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, /* 1e90-1e9f */
- 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1ea0-1eaf */
- 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1eb0-1ebf */
- 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1ec0-1ecf */
- 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1ed0-1edf */
- 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1ee0-1eef */
- 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, /* 1ef0-1eff */
- 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, -8, -8, /* 1f00-1f0f */
- 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, 0, 0, /* 1f10-1f1f */
- 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, -8, -8, /* 1f20-1f2f */
- 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, -8, -8, /* 1f30-1f3f */
- 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, 0, 0, /* 1f40-1f4f */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, -8, 0, -8, 0, -8, 0, -8, /* 1f50-1f5f */
- 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, -8, -8, /* 1f60-1f6f */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f70-1f7f */
- 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, -8, -8, /* 1f80-1f8f */
- 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, -8, -8, /* 1f90-1f9f */
- 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, -8, -8, /* 1fa0-1faf */
- 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -74, -74, -9, 0, 0, 0, /* 1fb0-1fbf */
- 0, 0, 0, 0, 0, 0, 0, 0, -86, -86, -86, -86, -9, 0,
- 0, 0, /* 1fc0-1fcf */
- 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -100, -100, 0, 0, 0, 0, /* 1fd0-1fdf */
- 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -112, -112, -7, 0,
- 0, 0, /* 1fe0-1fef */
- 0, 0, 0, 0, 0, 0, 0, 0,
-};
-
-/* Lower case range - Wide latin */
-static signed char UniCaseRangeLff20[27] = {
- 0, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
- 32, /* ff20-ff2f */
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-};
-
-/*
- * Lower Case Range
- */
-const struct UniCaseRange CifsUniLowerRange[] = {
- {0x0380, 0x03ab, UniCaseRangeL0380},
- {0x0400, 0x042f, UniCaseRangeL0400},
- {0x0490, 0x04cb, UniCaseRangeL0490},
- {0x1e00, 0x1ff7, UniCaseRangeL1e00},
- {0xff20, 0xff3a, UniCaseRangeLff20},
- {0}
-};
-#endif
-
-#endif /* __KSMBD_UNIUPR_H */
+EXPORT_SYMBOL_GPL(NlsUniUpperRange);
diff --git a/fs/nls/nls_ucs2_utils.h b/fs/nls/nls_ucs2_utils.h
new file mode 100644
index 000000000000..ef18d30db1d0
--- /dev/null
+++ b/fs/nls/nls_ucs2_utils.h
@@ -0,0 +1,285 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Some of the source code in this file came from fs/cifs/cifs_unicode.c
+ * and then via server/unicode.c
+ * cifs_unicode: Unicode kernel case support
+ *
+ * Function:
+ * Convert a unicode character to upper or lower case using
+ * compressed tables.
+ *
+ * Copyright (c) International Business Machines Corp., 2000,2009
+ *
+ *
+ * Notes:
+ * These APIs are based on the C library functions. The semantics
+ * should match the C functions but with expanded size operands.
+ *
+ * The upper/lower functions are based on a table created by mkupr.
+ * This is a compressed table of upper and lower case conversion.
+ *
+ */
+#ifndef _NLS_UCS2_UTILS_H
+#define _NLS_UCS2_UTILS_H
+
+#include <asm/byteorder.h>
+#include <linux/types.h>
+#include <linux/nls.h>
+#include <linux/unicode.h>
+#include "nls_ucs2_data.h"
+
+/*
+ * Windows maps these to the user defined 16 bit Unicode range since they are
+ * reserved symbols (along with \ and /), otherwise illegal to store
+ * in filenames in NTFS
+ */
+#define UNI_ASTERISK ((__u16)('*' + 0xF000))
+#define UNI_QUESTION ((__u16)('?' + 0xF000))
+#define UNI_COLON ((__u16)(':' + 0xF000))
+#define UNI_GRTRTHAN ((__u16)('>' + 0xF000))
+#define UNI_LESSTHAN ((__u16)('<' + 0xF000))
+#define UNI_PIPE ((__u16)('|' + 0xF000))
+#define UNI_SLASH ((__u16)('\\' + 0xF000))
+
+/*
+ * UniStrcat: Concatenate the second string to the first
+ *
+ * Returns:
+ * Address of the first string
+ */
+static inline wchar_t *UniStrcat(wchar_t *ucs1, const wchar_t *ucs2)
+{
+ wchar_t *anchor = ucs1; /* save a pointer to start of ucs1 */
+
+ while (*ucs1++)
+ /*NULL*/; /* To end of first string */
+ ucs1--; /* Return to the null */
+ while ((*ucs1++ = *ucs2++))
+ /*NULL*/; /* copy string 2 over */
+ return anchor;
+}
+
+/*
+ * UniStrchr: Find a character in a string
+ *
+ * Returns:
+ * Address of first occurrence of character in string
+ * or NULL if the character is not in the string
+ */
+static inline wchar_t *UniStrchr(const wchar_t *ucs, wchar_t uc)
+{
+ while ((*ucs != uc) && *ucs)
+ ucs++;
+
+ if (*ucs == uc)
+ return (wchar_t *)ucs;
+ return NULL;
+}
+
+/*
+ * UniStrcmp: Compare two strings
+ *
+ * Returns:
+ * < 0: First string is less than second
+ * = 0: Strings are equal
+ * > 0: First string is greater than second
+ */
+static inline int UniStrcmp(const wchar_t *ucs1, const wchar_t *ucs2)
+{
+ while ((*ucs1 == *ucs2) && *ucs1) {
+ ucs1++;
+ ucs2++;
+ }
+ return (int)*ucs1 - (int)*ucs2;
+}
+
+/*
+ * UniStrcpy: Copy a string
+ */
+static inline wchar_t *UniStrcpy(wchar_t *ucs1, const wchar_t *ucs2)
+{
+ wchar_t *anchor = ucs1; /* save the start of result string */
+
+ while ((*ucs1++ = *ucs2++))
+ /*NULL*/;
+ return anchor;
+}
+
+/*
+ * UniStrlen: Return the length of a string (in 16 bit Unicode chars not bytes)
+ */
+static inline size_t UniStrlen(const wchar_t *ucs1)
+{
+ int i = 0;
+
+ while (*ucs1++)
+ i++;
+ return i;
+}
+
+/*
+ * UniStrnlen: Return the length (in 16 bit Unicode chars not bytes) of a
+ * string (length limited)
+ */
+static inline size_t UniStrnlen(const wchar_t *ucs1, int maxlen)
+{
+ int i = 0;
+
+ while (*ucs1++) {
+ i++;
+ if (i >= maxlen)
+ break;
+ }
+ return i;
+}
+
+/*
+ * UniStrncat: Concatenate length limited string
+ */
+static inline wchar_t *UniStrncat(wchar_t *ucs1, const wchar_t *ucs2, size_t n)
+{
+ wchar_t *anchor = ucs1; /* save pointer to string 1 */
+
+ while (*ucs1++)
+ /*NULL*/;
+ ucs1--; /* point to null terminator of s1 */
+ while (n-- && (*ucs1 = *ucs2)) { /* copy s2 after s1 */
+ ucs1++;
+ ucs2++;
+ }
+ *ucs1 = 0; /* Null terminate the result */
+ return anchor;
+}
+
+/*
+ * UniStrncmp: Compare length limited string
+ */
+static inline int UniStrncmp(const wchar_t *ucs1, const wchar_t *ucs2, size_t n)
+{
+ if (!n)
+ return 0; /* Null strings are equal */
+ while ((*ucs1 == *ucs2) && *ucs1 && --n) {
+ ucs1++;
+ ucs2++;
+ }
+ return (int)*ucs1 - (int)*ucs2;
+}
+
+/*
+ * UniStrncmp_le: Compare length limited string - native to little-endian
+ */
+static inline int
+UniStrncmp_le(const wchar_t *ucs1, const wchar_t *ucs2, size_t n)
+{
+ if (!n)
+ return 0; /* Null strings are equal */
+ while ((*ucs1 == __le16_to_cpu(*ucs2)) && *ucs1 && --n) {
+ ucs1++;
+ ucs2++;
+ }
+ return (int)*ucs1 - (int)__le16_to_cpu(*ucs2);
+}
+
+/*
+ * UniStrncpy: Copy length limited string with pad
+ */
+static inline wchar_t *UniStrncpy(wchar_t *ucs1, const wchar_t *ucs2, size_t n)
+{
+ wchar_t *anchor = ucs1;
+
+ while (n-- && *ucs2) /* Copy the strings */
+ *ucs1++ = *ucs2++;
+
+ n++;
+ while (n--) /* Pad with nulls */
+ *ucs1++ = 0;
+ return anchor;
+}
+
+/*
+ * UniStrncpy_le: Copy length limited string with pad to little-endian
+ */
+static inline wchar_t *UniStrncpy_le(wchar_t *ucs1, const wchar_t *ucs2, size_t n)
+{
+ wchar_t *anchor = ucs1;
+
+ while (n-- && *ucs2) /* Copy the strings */
+ *ucs1++ = __le16_to_cpu(*ucs2++);
+
+ n++;
+ while (n--) /* Pad with nulls */
+ *ucs1++ = 0;
+ return anchor;
+}
+
+/*
+ * UniStrstr: Find a string in a string
+ *
+ * Returns:
+ * Address of first match found
+ * NULL if no matching string is found
+ */
+static inline wchar_t *UniStrstr(const wchar_t *ucs1, const wchar_t *ucs2)
+{
+ const wchar_t *anchor1 = ucs1;
+ const wchar_t *anchor2 = ucs2;
+
+ while (*ucs1) {
+ if (*ucs1 == *ucs2) {
+ /* Partial match found */
+ ucs1++;
+ ucs2++;
+ } else {
+ if (!*ucs2) /* Match found */
+ return (wchar_t *)anchor1;
+ ucs1 = ++anchor1; /* No match */
+ ucs2 = anchor2;
+ }
+ }
+
+ if (!*ucs2) /* Both end together */
+ return (wchar_t *)anchor1; /* Match found */
+ return NULL; /* No match */
+}
+
+#ifndef UNIUPR_NOUPPER
+/*
+ * UniToupper: Convert a unicode character to upper case
+ */
+static inline wchar_t UniToupper(register wchar_t uc)
+{
+ register const struct UniCaseRange *rp;
+
+ if (uc < sizeof(NlsUniUpperTable)) {
+ /* Latin characters */
+ return uc + NlsUniUpperTable[uc]; /* Use base tables */
+ }
+
+ rp = NlsUniUpperRange; /* Use range tables */
+ while (rp->start) {
+ if (uc < rp->start) /* Before start of range */
+ return uc; /* Uppercase = input */
+ if (uc <= rp->end) /* In range */
+ return uc + rp->table[uc - rp->start];
+ rp++; /* Try next range */
+ }
+ return uc; /* Past last range */
+}
+
+/*
+ * UniStrupr: Upper case a unicode string
+ */
+static inline __le16 *UniStrupr(register __le16 *upin)
+{
+ register __le16 *up;
+
+ up = upin;
+ while (*up) { /* For all characters */
+ *up = cpu_to_le16(UniToupper(le16_to_cpu(*up)));
+ up++;
+ }
+ return upin; /* Return input pointer */
+}
+#endif /* UNIUPR_NOUPPER */
+
+#endif /* _NLS_UCS2_UTILS_H */
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index 190aa717fa32..ebdcc25df0f7 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -199,7 +199,7 @@ void dnotify_flush(struct file *filp, fl_owner_t id)
}
/* this conversion is done only at watch creation */
-static __u32 convert_arg(unsigned long arg)
+static __u32 convert_arg(unsigned int arg)
{
__u32 new_mask = FS_EVENT_ON_CHILD;
@@ -258,7 +258,7 @@ static int attach_dn(struct dnotify_struct *dn, struct dnotify_mark *dn_mark,
* up here. Allocate both a mark for fsnotify to add and a dnotify_struct to be
* attached to the fsnotify_mark.
*/
-int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
+int fcntl_dirnotify(int fd, struct file *filp, unsigned int arg)
{
struct dnotify_mark *new_dn_mark, *dn_mark;
struct fsnotify_mark *new_fsn_mark, *fsn_mark;
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index f69c451018e3..62fe0b679e58 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -1585,16 +1585,25 @@ static int fanotify_test_fsid(struct dentry *dentry, __kernel_fsid_t *fsid)
}
/* Check if filesystem can encode a unique fid */
-static int fanotify_test_fid(struct dentry *dentry)
+static int fanotify_test_fid(struct dentry *dentry, unsigned int flags)
{
+ unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS;
+ const struct export_operations *nop = dentry->d_sb->s_export_op;
+
+ /*
+ * We need to make sure that the filesystem supports encoding of
+ * file handles so user can use name_to_handle_at() to compare fids
+ * reported with events to the file handle of watched objects.
+ */
+ if (!nop)
+ return -EOPNOTSUPP;
+
/*
- * We need to make sure that the file system supports at least
- * encoding a file handle so user can use name_to_handle_at() to
- * compare fid returned with event to the file handle of watched
- * objects. However, even the relaxed AT_HANDLE_FID flag requires
- * at least empty export_operations for ecoding unique file ids.
+ * For sb/mount mark, we also need to make sure that the filesystem
+ * supports decoding file handles, so user has a way to map back the
+ * reported fids to filesystem objects.
*/
- if (!dentry->d_sb->s_export_op)
+ if (mark_type != FAN_MARK_INODE && !nop->fh_to_dentry)
return -EOPNOTSUPP;
return 0;
@@ -1812,7 +1821,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
if (ret)
goto path_put_and_out;
- ret = fanotify_test_fid(path.dentry);
+ ret = fanotify_test_fid(path.dentry, flags);
if (ret)
goto path_put_and_out;
diff --git a/fs/nsfs.c b/fs/nsfs.c
index f602a96a1afe..647a22433bd8 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -84,7 +84,7 @@ slow:
return -ENOMEM;
}
inode->i_ino = ns->inum;
- inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
inode->i_flags |= S_IMMUTABLE;
inode->i_mode = S_IFREG | S_IRUGO;
inode->i_fop = &ns_file_operations;
diff --git a/fs/ntfs/Kconfig b/fs/ntfs/Kconfig
index f93e69a61283..7b2509741735 100644
--- a/fs/ntfs/Kconfig
+++ b/fs/ntfs/Kconfig
@@ -1,6 +1,7 @@
# SPDX-License-Identifier: GPL-2.0-only
config NTFS_FS
tristate "NTFS file system support"
+ select BUFFER_HEAD
select NLS
help
NTFS is the file system of Microsoft Windows NT, 2000, XP and 2003.
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index 6c3f38d66579..99ac6ea277c4 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -654,7 +654,7 @@ static int ntfs_read_locked_inode(struct inode *vi)
* always changes, when mtime is changed. ctime can be changed on its
* own, mtime is then not changed, e.g. when a file is renamed.
*/
- vi->i_ctime = ntfs2utc(si->last_mft_change_time);
+ inode_set_ctime_to_ts(vi, ntfs2utc(si->last_mft_change_time));
/*
* Last access to the data within the file. Not changed during a rename
* for example but changed whenever the file is written to.
@@ -1218,7 +1218,7 @@ static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi)
vi->i_gid = base_vi->i_gid;
set_nlink(vi, base_vi->i_nlink);
vi->i_mtime = base_vi->i_mtime;
- vi->i_ctime = base_vi->i_ctime;
+ inode_set_ctime_to_ts(vi, inode_get_ctime(base_vi));
vi->i_atime = base_vi->i_atime;
vi->i_generation = ni->seq_no = base_ni->seq_no;
@@ -1484,7 +1484,7 @@ static int ntfs_read_locked_index_inode(struct inode *base_vi, struct inode *vi)
vi->i_gid = base_vi->i_gid;
set_nlink(vi, base_vi->i_nlink);
vi->i_mtime = base_vi->i_mtime;
- vi->i_ctime = base_vi->i_ctime;
+ inode_set_ctime_to_ts(vi, inode_get_ctime(base_vi));
vi->i_atime = base_vi->i_atime;
vi->i_generation = ni->seq_no = base_ni->seq_no;
/* Set inode type to zero but preserve permissions. */
@@ -2804,13 +2804,14 @@ done:
*/
if (!IS_NOCMTIME(VFS_I(base_ni)) && !IS_RDONLY(VFS_I(base_ni))) {
struct timespec64 now = current_time(VFS_I(base_ni));
+ struct timespec64 ctime = inode_get_ctime(VFS_I(base_ni));
int sync_it = 0;
if (!timespec64_equal(&VFS_I(base_ni)->i_mtime, &now) ||
- !timespec64_equal(&VFS_I(base_ni)->i_ctime, &now))
+ !timespec64_equal(&ctime, &now))
sync_it = 1;
+ inode_set_ctime_to_ts(VFS_I(base_ni), now);
VFS_I(base_ni)->i_mtime = now;
- VFS_I(base_ni)->i_ctime = now;
if (sync_it)
mark_inode_dirty_sync(VFS_I(base_ni));
@@ -2928,7 +2929,7 @@ int ntfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
if (ia_valid & ATTR_MTIME)
vi->i_mtime = attr->ia_mtime;
if (ia_valid & ATTR_CTIME)
- vi->i_ctime = attr->ia_ctime;
+ inode_set_ctime_to_ts(vi, attr->ia_ctime);
mark_inode_dirty(vi);
out:
return err;
@@ -3004,7 +3005,7 @@ int __ntfs_write_inode(struct inode *vi, int sync)
si->last_data_change_time = nt;
modified = true;
}
- nt = utc2ntfs(vi->i_ctime);
+ nt = utc2ntfs(inode_get_ctime(vi));
if (si->last_mft_change_time != nt) {
ntfs_debug("Updating ctime for inode 0x%lx: old = 0x%llx, "
"new = 0x%llx", vi->i_ino, (long long)
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index 0155f106ec34..ad1a8f72da22 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -2682,8 +2682,7 @@ mft_rec_already_initialized:
vi->i_mode &= ~S_IWUGO;
/* Set the inode times to the current time. */
- vi->i_atime = vi->i_mtime = vi->i_ctime =
- current_time(vi);
+ vi->i_atime = vi->i_mtime = inode_set_ctime_current(vi);
/*
* Set the file size to 0, the ntfs inode sizes are set to 0 by
* the call to ntfs_init_big_inode() below.
diff --git a/fs/ntfs3/Kconfig b/fs/ntfs3/Kconfig
index 96cc236f7f7b..cdfdf51e55d7 100644
--- a/fs/ntfs3/Kconfig
+++ b/fs/ntfs3/Kconfig
@@ -1,6 +1,7 @@
# SPDX-License-Identifier: GPL-2.0-only
config NTFS3_FS
tristate "NTFS Read-Write file system support"
+ select BUFFER_HEAD
select NLS
select LEGACY_DIRECT_IO
help
diff --git a/fs/ntfs3/attrib.c b/fs/ntfs3/attrib.c
index a9d82bbb4729..63f70259edc0 100644
--- a/fs/ntfs3/attrib.c
+++ b/fs/ntfs3/attrib.c
@@ -1106,10 +1106,10 @@ repack:
}
}
- /*
+ /*
* The code below may require additional cluster (to extend attribute list)
- * and / or one MFT record
- * It is too complex to undo operations if -ENOSPC occurs deep inside
+ * and / or one MFT record
+ * It is too complex to undo operations if -ENOSPC occurs deep inside
* in 'ni_insert_nonresident'.
* Return in advance -ENOSPC here if there are no free cluster and no free MFT.
*/
@@ -1736,10 +1736,8 @@ repack:
le_b = NULL;
attr_b = ni_find_attr(ni, NULL, &le_b, ATTR_DATA, NULL,
0, NULL, &mi_b);
- if (!attr_b) {
- err = -ENOENT;
- goto out;
- }
+ if (!attr_b)
+ return -ENOENT;
attr = attr_b;
le = le_b;
diff --git a/fs/ntfs3/attrlist.c b/fs/ntfs3/attrlist.c
index 42631b31adf1..7c01735d1219 100644
--- a/fs/ntfs3/attrlist.c
+++ b/fs/ntfs3/attrlist.c
@@ -52,7 +52,8 @@ int ntfs_load_attr_list(struct ntfs_inode *ni, struct ATTRIB *attr)
if (!attr->non_res) {
lsize = le32_to_cpu(attr->res.data_size);
- le = kmalloc(al_aligned(lsize), GFP_NOFS | __GFP_NOWARN);
+ /* attr is resident: lsize < record_size (1K or 4K) */
+ le = kvmalloc(al_aligned(lsize), GFP_KERNEL);
if (!le) {
err = -ENOMEM;
goto out;
@@ -80,7 +81,17 @@ int ntfs_load_attr_list(struct ntfs_inode *ni, struct ATTRIB *attr)
if (err < 0)
goto out;
- le = kmalloc(al_aligned(lsize), GFP_NOFS | __GFP_NOWARN);
+ /* attr is nonresident.
+ * The worst case:
+ * 1T (2^40) extremely fragmented file.
+ * cluster = 4K (2^12) => 2^28 fragments
+ * 2^9 fragments per one record => 2^19 records
+ * 2^5 bytes of ATTR_LIST_ENTRY per one record => 2^24 bytes.
+ *
+ * the result is 16M bytes per attribute list.
+ * Use kvmalloc to allocate in range [several Kbytes - dozen Mbytes]
+ */
+ le = kvmalloc(al_aligned(lsize), GFP_KERNEL);
if (!le) {
err = -ENOMEM;
goto out;
diff --git a/fs/ntfs3/bitmap.c b/fs/ntfs3/bitmap.c
index 107e808e06ea..63f14a0232f6 100644
--- a/fs/ntfs3/bitmap.c
+++ b/fs/ntfs3/bitmap.c
@@ -125,6 +125,7 @@ void wnd_close(struct wnd_bitmap *wnd)
struct rb_node *node, *next;
kfree(wnd->free_bits);
+ wnd->free_bits = NULL;
run_close(&wnd->run);
node = rb_first(&wnd->start_tree);
@@ -659,7 +660,8 @@ int wnd_init(struct wnd_bitmap *wnd, struct super_block *sb, size_t nbits)
wnd->bits_last = wbits;
wnd->free_bits =
- kcalloc(wnd->nwnd, sizeof(u16), GFP_NOFS | __GFP_NOWARN);
+ kvmalloc_array(wnd->nwnd, sizeof(u16), GFP_KERNEL | __GFP_ZERO);
+
if (!wnd->free_bits)
return -ENOMEM;
diff --git a/fs/ntfs3/dir.c b/fs/ntfs3/dir.c
index 063a6654199b..ec0566b322d5 100644
--- a/fs/ntfs3/dir.c
+++ b/fs/ntfs3/dir.c
@@ -309,7 +309,11 @@ static inline int ntfs_filldir(struct ntfs_sb_info *sbi, struct ntfs_inode *ni,
return 0;
}
- dt_type = (fname->dup.fa & FILE_ATTRIBUTE_DIRECTORY) ? DT_DIR : DT_REG;
+ /* NTFS: symlinks are "dir + reparse" or "file + reparse" */
+ if (fname->dup.fa & FILE_ATTRIBUTE_REPARSE_POINT)
+ dt_type = DT_LNK;
+ else
+ dt_type = (fname->dup.fa & FILE_ATTRIBUTE_DIRECTORY) ? DT_DIR : DT_REG;
return !dir_emit(ctx, (s8 *)name, name_len, ino, dt_type);
}
diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c
index 1d6c824246c4..1f7a194983c5 100644
--- a/fs/ntfs3/file.c
+++ b/fs/ntfs3/file.c
@@ -85,7 +85,7 @@ int ntfs_getattr(struct mnt_idmap *idmap, const struct path *path,
stat->attributes_mask |= STATX_ATTR_COMPRESSED | STATX_ATTR_ENCRYPTED;
- generic_fillattr(idmap, inode, stat);
+ generic_fillattr(idmap, request_mask, inode, stat);
stat->result_mask |= STATX_BTIME;
stat->btime = ni->i_crtime;
@@ -342,7 +342,7 @@ static int ntfs_extend(struct inode *inode, loff_t pos, size_t count,
err = 0;
}
- inode->i_ctime = inode->i_mtime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
mark_inode_dirty(inode);
if (IS_SYNC(inode)) {
@@ -400,7 +400,7 @@ static int ntfs_truncate(struct inode *inode, loff_t new_size)
ni_unlock(ni);
ni->std_fa |= FILE_ATTRIBUTE_ARCHIVE;
- inode->i_ctime = inode->i_mtime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
if (!IS_DIRSYNC(inode)) {
dirty = 1;
} else {
@@ -642,7 +642,7 @@ out:
filemap_invalidate_unlock(mapping);
if (!err) {
- inode->i_ctime = inode->i_mtime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
mark_inode_dirty(inode);
}
@@ -745,8 +745,8 @@ static ssize_t ntfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
}
static ssize_t ntfs_file_splice_read(struct file *in, loff_t *ppos,
- struct pipe_inode_info *pipe,
- size_t len, unsigned int flags)
+ struct pipe_inode_info *pipe, size_t len,
+ unsigned int flags)
{
struct inode *inode = in->f_mapping->host;
struct ntfs_inode *ni = ntfs_i(inode);
diff --git a/fs/ntfs3/frecord.c b/fs/ntfs3/frecord.c
index 16bd9faa2d28..dad976a68985 100644
--- a/fs/ntfs3/frecord.c
+++ b/fs/ntfs3/frecord.c
@@ -2148,7 +2148,7 @@ out1:
for (i = 0; i < pages_per_frame; i++) {
pg = pages[i];
- if (i == idx)
+ if (i == idx || !pg)
continue;
unlock_page(pg);
put_page(pg);
@@ -3208,6 +3208,12 @@ static bool ni_update_parent(struct ntfs_inode *ni, struct NTFS_DUP_INFO *dup,
if (!fname || !memcmp(&fname->dup, dup, sizeof(fname->dup)))
continue;
+ /* Check simple case when parent inode equals current inode. */
+ if (ino_get(&fname->home) == ni->vfs_inode.i_ino) {
+ ntfs_set_state(sbi, NTFS_DIRTY_ERROR);
+ continue;
+ }
+
/* ntfs_iget5 may sleep. */
dir = ntfs_iget5(sb, &fname->home, NULL);
if (IS_ERR(dir)) {
@@ -3265,6 +3271,7 @@ int ni_write_inode(struct inode *inode, int sync, const char *hint)
if (is_rec_inuse(ni->mi.mrec) &&
!(sbi->flags & NTFS_FLAGS_LOG_REPLAYING) && inode->i_nlink) {
bool modified = false;
+ struct timespec64 ctime = inode_get_ctime(inode);
/* Update times in standard attribute. */
std = ni_std(ni);
@@ -3280,7 +3287,7 @@ int ni_write_inode(struct inode *inode, int sync, const char *hint)
modified = true;
}
- dup.c_time = kernel2nt(&inode->i_ctime);
+ dup.c_time = kernel2nt(&ctime);
if (std->c_time != dup.c_time) {
std->c_time = dup.c_time;
modified = true;
diff --git a/fs/ntfs3/fslog.c b/fs/ntfs3/fslog.c
index 12f28cdf5c83..98ccb6650858 100644
--- a/fs/ntfs3/fslog.c
+++ b/fs/ntfs3/fslog.c
@@ -2168,8 +2168,10 @@ file_is_valid:
if (!page) {
page = kmalloc(log->page_size, GFP_NOFS);
- if (!page)
- return -ENOMEM;
+ if (!page) {
+ err = -ENOMEM;
+ goto out;
+ }
}
/*
diff --git a/fs/ntfs3/fsntfs.c b/fs/ntfs3/fsntfs.c
index 33afee0f5559..fbfe21dbb425 100644
--- a/fs/ntfs3/fsntfs.c
+++ b/fs/ntfs3/fsntfs.c
@@ -983,18 +983,11 @@ out:
if (err)
return err;
- mark_inode_dirty(&ni->vfs_inode);
+ mark_inode_dirty_sync(&ni->vfs_inode);
/* verify(!ntfs_update_mftmirr()); */
- /*
- * If we used wait=1, sync_inode_metadata waits for the io for the
- * inode to finish. It hangs when media is removed.
- * So wait=0 is sent down to sync_inode_metadata
- * and filemap_fdatawrite is used for the data blocks.
- */
- err = sync_inode_metadata(&ni->vfs_inode, 0);
- if (!err)
- err = filemap_fdatawrite(ni->vfs_inode.i_mapping);
+ /* write mft record on disk. */
+ err = _ni_write_inode(&ni->vfs_inode, 1);
return err;
}
@@ -2461,10 +2454,12 @@ void mark_as_free_ex(struct ntfs_sb_info *sbi, CLST lcn, CLST len, bool trim)
{
CLST end, i, zone_len, zlen;
struct wnd_bitmap *wnd = &sbi->used.bitmap;
+ bool dirty = false;
down_write_nested(&wnd->rw_lock, BITMAP_MUTEX_CLUSTERS);
if (!wnd_is_used(wnd, lcn, len)) {
- ntfs_set_state(sbi, NTFS_DIRTY_ERROR);
+ /* mark volume as dirty out of wnd->rw_lock */
+ dirty = true;
end = lcn + len;
len = 0;
@@ -2518,6 +2513,8 @@ void mark_as_free_ex(struct ntfs_sb_info *sbi, CLST lcn, CLST len, bool trim)
out:
up_write(&wnd->rw_lock);
+ if (dirty)
+ ntfs_set_state(sbi, NTFS_DIRTY_ERROR);
}
/*
diff --git a/fs/ntfs3/index.c b/fs/ntfs3/index.c
index 124c6e822623..cf92b2433f7a 100644
--- a/fs/ntfs3/index.c
+++ b/fs/ntfs3/index.c
@@ -729,6 +729,9 @@ static struct NTFS_DE *hdr_find_e(const struct ntfs_index *indx,
u32 total = le32_to_cpu(hdr->total);
u16 offs[128];
+ if (unlikely(!cmp))
+ return NULL;
+
fill_table:
if (end > total)
return NULL;
diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c
index dc7e7ab701c6..d6d021e19aaa 100644
--- a/fs/ntfs3/inode.c
+++ b/fs/ntfs3/inode.c
@@ -44,6 +44,7 @@ static struct inode *ntfs_read_mft(struct inode *inode,
u64 t64;
struct MFT_REC *rec;
struct runs_tree *run;
+ struct timespec64 ctime;
inode->i_op = NULL;
/* Setup 'uid' and 'gid' */
@@ -169,7 +170,8 @@ next_attr:
nt2kernel(std5->cr_time, &ni->i_crtime);
#endif
nt2kernel(std5->a_time, &inode->i_atime);
- nt2kernel(std5->c_time, &inode->i_ctime);
+ nt2kernel(std5->c_time, &ctime);
+ inode_set_ctime_to_ts(inode, ctime);
nt2kernel(std5->m_time, &inode->i_mtime);
ni->std_fa = std5->fa;
@@ -554,7 +556,7 @@ static noinline int ntfs_get_block_vbo(struct inode *inode, u64 vbo,
struct super_block *sb = inode->i_sb;
struct ntfs_sb_info *sbi = sb->s_fs_info;
struct ntfs_inode *ni = ntfs_i(inode);
- struct page *page = bh->b_page;
+ struct folio *folio = bh->b_folio;
u8 cluster_bits = sbi->cluster_bits;
u32 block_size = sb->s_blocksize;
u64 bytes, lbo, valid;
@@ -569,7 +571,7 @@ static noinline int ntfs_get_block_vbo(struct inode *inode, u64 vbo,
if (is_resident(ni)) {
ni_lock(ni);
- err = attr_data_read_resident(ni, page);
+ err = attr_data_read_resident(ni, &folio->page);
ni_unlock(ni);
if (!err)
@@ -642,17 +644,17 @@ static noinline int ntfs_get_block_vbo(struct inode *inode, u64 vbo,
*/
bytes = block_size;
- if (page) {
+ if (folio) {
u32 voff = valid - vbo;
bh->b_size = block_size;
off = vbo & (PAGE_SIZE - 1);
- set_bh_page(bh, page, off);
+ folio_set_bh(bh, folio, off);
err = bh_read(bh, 0);
if (err < 0)
goto out;
- zero_user_segment(page, off + voff, off + block_size);
+ folio_zero_segment(folio, off + voff, off + block_size);
}
}
@@ -958,7 +960,7 @@ int ntfs_write_end(struct file *file, struct address_space *mapping, loff_t pos,
if (err >= 0) {
if (!(ni->std_fa & FILE_ATTRIBUTE_ARCHIVE)) {
- inode->i_ctime = inode->i_mtime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
ni->std_fa |= FILE_ATTRIBUTE_ARCHIVE;
dirty = true;
}
@@ -1658,8 +1660,9 @@ struct inode *ntfs_create_inode(struct mnt_idmap *idmap, struct inode *dir,
d_instantiate(dentry, inode);
/* Set original time. inode times (i_ctime) may be changed in ntfs_init_acl. */
- inode->i_atime = inode->i_mtime = inode->i_ctime = dir->i_mtime =
- dir->i_ctime = ni->i_crtime;
+ inode->i_atime = inode->i_mtime =
+ inode_set_ctime_to_ts(inode, ni->i_crtime);
+ dir->i_mtime = inode_set_ctime_to_ts(dir, ni->i_crtime);
mark_inode_dirty(dir);
mark_inode_dirty(inode);
@@ -1765,9 +1768,9 @@ int ntfs_unlink_inode(struct inode *dir, const struct dentry *dentry)
if (!err) {
drop_nlink(inode);
- dir->i_mtime = dir->i_ctime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
mark_inode_dirty(dir);
- inode->i_ctime = dir->i_ctime;
+ inode_set_ctime_to_ts(inode, inode_get_ctime(dir));
if (inode->i_nlink)
mark_inode_dirty(inode);
} else if (!ni_remove_name_undo(dir_ni, ni, de, de2, undo_remove)) {
diff --git a/fs/ntfs3/namei.c b/fs/ntfs3/namei.c
index 70f8c859e0ad..eedacf94edd8 100644
--- a/fs/ntfs3/namei.c
+++ b/fs/ntfs3/namei.c
@@ -156,8 +156,8 @@ static int ntfs_link(struct dentry *ode, struct inode *dir, struct dentry *de)
err = ntfs_link_inode(inode, de);
if (!err) {
- dir->i_ctime = dir->i_mtime = inode->i_ctime =
- current_time(dir);
+ dir->i_mtime = inode_set_ctime_to_ts(
+ inode, inode_set_ctime_current(dir));
mark_inode_dirty(inode);
mark_inode_dirty(dir);
d_instantiate(de, inode);
@@ -324,14 +324,11 @@ static int ntfs_rename(struct mnt_idmap *idmap, struct inode *dir,
/* Restore after failed rename failed too. */
_ntfs_bad_inode(inode);
} else if (!err) {
- inode->i_ctime = dir->i_ctime = dir->i_mtime =
- current_time(dir);
+ simple_rename_timestamp(dir, dentry, new_dir, new_dentry);
mark_inode_dirty(inode);
mark_inode_dirty(dir);
- if (dir != new_dir) {
- new_dir->i_mtime = new_dir->i_ctime = dir->i_ctime;
+ if (dir != new_dir)
mark_inode_dirty(new_dir);
- }
if (IS_DIRSYNC(dir))
ntfs_sync_inode(dir);
@@ -376,7 +373,7 @@ static int ntfs_atomic_open(struct inode *dir, struct dentry *dentry,
#ifdef CONFIG_NTFS3_FS_POSIX_ACL
if (IS_POSIXACL(dir)) {
- /*
+ /*
* Load in cache current acl to avoid ni_lock(dir):
* ntfs_create_inode -> ntfs_init_acl -> posix_acl_create ->
* ntfs_get_acl -> ntfs_get_acl_ex -> ni_lock
diff --git a/fs/ntfs3/ntfs.h b/fs/ntfs3/ntfs.h
index 98b76d1b09e7..86aecbb01a92 100644
--- a/fs/ntfs3/ntfs.h
+++ b/fs/ntfs3/ntfs.h
@@ -847,7 +847,7 @@ struct OBJECT_ID {
// Birth Volume Id is the Object Id of the Volume on.
// which the Object Id was allocated. It never changes.
struct GUID BirthVolumeId; //0x10:
-
+
// Birth Object Id is the first Object Id that was
// ever assigned to this MFT Record. I.e. If the Object Id
// is changed for some reason, this field will reflect the
diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h
index 629403ede6e5..0e6a2777870c 100644
--- a/fs/ntfs3/ntfs_fs.h
+++ b/fs/ntfs3/ntfs_fs.h
@@ -42,9 +42,11 @@ enum utf16_endian;
#define MINUS_ONE_T ((size_t)(-1))
/* Biggest MFT / smallest cluster */
#define MAXIMUM_BYTES_PER_MFT 4096
+#define MAXIMUM_SHIFT_BYTES_PER_MFT 12
#define NTFS_BLOCKS_PER_MFT_RECORD (MAXIMUM_BYTES_PER_MFT / 512)
#define MAXIMUM_BYTES_PER_INDEX 4096
+#define MAXIMUM_SHIFT_BYTES_PER_INDEX 12
#define NTFS_BLOCKS_PER_INODE (MAXIMUM_BYTES_PER_INDEX / 512)
/* NTFS specific error code when fixup failed. */
@@ -495,8 +497,6 @@ int ntfs_getattr(struct mnt_idmap *idmap, const struct path *path,
struct kstat *stat, u32 request_mask, u32 flags);
int ntfs3_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
struct iattr *attr);
-void ntfs_sparse_cluster(struct inode *inode, struct page *page0, CLST vcn,
- CLST len);
int ntfs_file_open(struct inode *inode, struct file *file);
int ntfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
__u64 start, __u64 len);
diff --git a/fs/ntfs3/record.c b/fs/ntfs3/record.c
index c12ebffc94da..53629b1f65e9 100644
--- a/fs/ntfs3/record.c
+++ b/fs/ntfs3/record.c
@@ -189,12 +189,19 @@ out:
return err;
}
+/*
+ * mi_enum_attr - start/continue attributes enumeration in record.
+ *
+ * NOTE: mi->mrec - memory of size sbi->record_size
+ * here we sure that mi->mrec->total == sbi->record_size (see mi_read)
+ */
struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr)
{
const struct MFT_REC *rec = mi->mrec;
u32 used = le32_to_cpu(rec->used);
- u32 t32, off, asize;
+ u32 t32, off, asize, prev_type;
u16 t16;
+ u64 data_size, alloc_size, tot_size;
if (!attr) {
u32 total = le32_to_cpu(rec->total);
@@ -213,6 +220,7 @@ struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr)
if (!is_rec_inuse(rec))
return NULL;
+ prev_type = 0;
attr = Add2Ptr(rec, off);
} else {
/* Check if input attr inside record. */
@@ -226,11 +234,11 @@ struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr)
return NULL;
}
- if (off + asize < off) {
- /* Overflow check. */
+ /* Overflow check. */
+ if (off + asize < off)
return NULL;
- }
+ prev_type = le32_to_cpu(attr->type);
attr = Add2Ptr(attr, asize);
off += asize;
}
@@ -250,7 +258,11 @@ struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr)
/* 0x100 is last known attribute for now. */
t32 = le32_to_cpu(attr->type);
- if ((t32 & 0xf) || (t32 > 0x100))
+ if (!t32 || (t32 & 0xf) || (t32 > 0x100))
+ return NULL;
+
+ /* attributes in record must be ordered by type */
+ if (t32 < prev_type)
return NULL;
/* Check overflow and boundary. */
@@ -259,16 +271,15 @@ struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr)
/* Check size of attribute. */
if (!attr->non_res) {
+ /* Check resident fields. */
if (asize < SIZEOF_RESIDENT)
return NULL;
t16 = le16_to_cpu(attr->res.data_off);
-
if (t16 > asize)
return NULL;
- t32 = le32_to_cpu(attr->res.data_size);
- if (t16 + t32 > asize)
+ if (t16 + le32_to_cpu(attr->res.data_size) > asize)
return NULL;
t32 = sizeof(short) * attr->name_len;
@@ -278,21 +289,52 @@ struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr)
return attr;
}
- /* Check some nonresident fields. */
- if (attr->name_len &&
- le16_to_cpu(attr->name_off) + sizeof(short) * attr->name_len >
- le16_to_cpu(attr->nres.run_off)) {
+ /* Check nonresident fields. */
+ if (attr->non_res != 1)
+ return NULL;
+
+ t16 = le16_to_cpu(attr->nres.run_off);
+ if (t16 > asize)
+ return NULL;
+
+ t32 = sizeof(short) * attr->name_len;
+ if (t32 && le16_to_cpu(attr->name_off) + t32 > t16)
+ return NULL;
+
+ /* Check start/end vcn. */
+ if (le64_to_cpu(attr->nres.svcn) > le64_to_cpu(attr->nres.evcn) + 1)
return NULL;
- }
- if (attr->nres.svcn || !is_attr_ext(attr)) {
+ data_size = le64_to_cpu(attr->nres.data_size);
+ if (le64_to_cpu(attr->nres.valid_size) > data_size)
+ return NULL;
+
+ alloc_size = le64_to_cpu(attr->nres.alloc_size);
+ if (data_size > alloc_size)
+ return NULL;
+
+ t32 = mi->sbi->cluster_mask;
+ if (alloc_size & t32)
+ return NULL;
+
+ if (!attr->nres.svcn && is_attr_ext(attr)) {
+ /* First segment of sparse/compressed attribute */
+ if (asize + 8 < SIZEOF_NONRESIDENT_EX)
+ return NULL;
+
+ tot_size = le64_to_cpu(attr->nres.total_size);
+ if (tot_size & t32)
+ return NULL;
+
+ if (tot_size > alloc_size)
+ return NULL;
+ } else {
if (asize + 8 < SIZEOF_NONRESIDENT)
return NULL;
if (attr->nres.c_unit)
return NULL;
- } else if (asize + 8 < SIZEOF_NONRESIDENT_EX)
- return NULL;
+ }
return attr;
}
diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c
index 1a02072b6b0e..f763e3256ccc 100644
--- a/fs/ntfs3/super.c
+++ b/fs/ntfs3/super.c
@@ -453,15 +453,23 @@ static struct proc_dir_entry *proc_info_root;
* ntfs3.1
* cluster size
* number of clusters
+ * total number of mft records
+ * number of used mft records ~= number of files + folders
+ * real state of ntfs "dirty"/"clean"
+ * current state of ntfs "dirty"/"clean"
*/
static int ntfs3_volinfo(struct seq_file *m, void *o)
{
struct super_block *sb = m->private;
struct ntfs_sb_info *sbi = sb->s_fs_info;
- seq_printf(m, "ntfs%d.%d\n%u\n%zu\n", sbi->volume.major_ver,
- sbi->volume.minor_ver, sbi->cluster_size,
- sbi->used.bitmap.nbits);
+ seq_printf(m, "ntfs%d.%d\n%u\n%zu\n\%zu\n%zu\n%s\n%s\n",
+ sbi->volume.major_ver, sbi->volume.minor_ver,
+ sbi->cluster_size, sbi->used.bitmap.nbits,
+ sbi->mft.bitmap.nbits,
+ sbi->mft.bitmap.nbits - wnd_zeroes(&sbi->mft.bitmap),
+ sbi->volume.real_dirty ? "dirty" : "clean",
+ (sbi->volume.flags & VOLUME_FLAG_DIRTY) ? "dirty" : "clean");
return 0;
}
@@ -488,9 +496,13 @@ static ssize_t ntfs3_label_write(struct file *file, const char __user *buffer,
{
int err;
struct super_block *sb = pde_data(file_inode(file));
- struct ntfs_sb_info *sbi = sb->s_fs_info;
ssize_t ret = count;
- u8 *label = kmalloc(count, GFP_NOFS);
+ u8 *label;
+
+ if (sb_rdonly(sb))
+ return -EROFS;
+
+ label = kmalloc(count, GFP_NOFS);
if (!label)
return -ENOMEM;
@@ -502,7 +514,7 @@ static ssize_t ntfs3_label_write(struct file *file, const char __user *buffer,
while (ret > 0 && label[ret - 1] == '\n')
ret -= 1;
- err = ntfs_set_label(sbi, label, ret);
+ err = ntfs_set_label(sb->s_fs_info, label, ret);
if (err < 0) {
ntfs_err(sb, "failed (%d) to write label", err);
@@ -569,31 +581,37 @@ static void init_once(void *foo)
}
/*
- * put_ntfs - Noinline to reduce binary size.
+ * Noinline to reduce binary size.
*/
-static noinline void put_ntfs(struct ntfs_sb_info *sbi)
+static noinline void ntfs3_put_sbi(struct ntfs_sb_info *sbi)
{
- kfree(sbi->new_rec);
- kvfree(ntfs_put_shared(sbi->upcase));
- kfree(sbi->def_table);
-
wnd_close(&sbi->mft.bitmap);
wnd_close(&sbi->used.bitmap);
- if (sbi->mft.ni)
+ if (sbi->mft.ni) {
iput(&sbi->mft.ni->vfs_inode);
+ sbi->mft.ni = NULL;
+ }
- if (sbi->security.ni)
+ if (sbi->security.ni) {
iput(&sbi->security.ni->vfs_inode);
+ sbi->security.ni = NULL;
+ }
- if (sbi->reparse.ni)
+ if (sbi->reparse.ni) {
iput(&sbi->reparse.ni->vfs_inode);
+ sbi->reparse.ni = NULL;
+ }
- if (sbi->objid.ni)
+ if (sbi->objid.ni) {
iput(&sbi->objid.ni->vfs_inode);
+ sbi->objid.ni = NULL;
+ }
- if (sbi->volume.ni)
+ if (sbi->volume.ni) {
iput(&sbi->volume.ni->vfs_inode);
+ sbi->volume.ni = NULL;
+ }
ntfs_update_mftmirr(sbi, 0);
@@ -601,6 +619,13 @@ static noinline void put_ntfs(struct ntfs_sb_info *sbi)
indx_clear(&sbi->security.index_sdh);
indx_clear(&sbi->reparse.index_r);
indx_clear(&sbi->objid.index_o);
+}
+
+static void ntfs3_free_sbi(struct ntfs_sb_info *sbi)
+{
+ kfree(sbi->new_rec);
+ kvfree(ntfs_put_shared(sbi->upcase));
+ kfree(sbi->def_table);
kfree(sbi->compress.lznt);
#ifdef CONFIG_NTFS3_LZX_XPRESS
xpress_free_decompressor(sbi->compress.xpress);
@@ -625,12 +650,7 @@ static void ntfs_put_super(struct super_block *sb)
/* Mark rw ntfs as clear, if possible. */
ntfs_set_state(sbi, NTFS_DIRTY_CLEAR);
-
- put_mount_options(sbi->options);
- put_ntfs(sbi);
- sb->s_fs_info = NULL;
-
- sync_blockdev(sb->s_bdev);
+ ntfs3_put_sbi(sbi);
}
static int ntfs_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -838,7 +858,7 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size,
struct ntfs_sb_info *sbi = sb->s_fs_info;
int err;
u32 mb, gb, boot_sector_size, sct_per_clst, record_size;
- u64 sectors, clusters, mlcn, mlcn2;
+ u64 sectors, clusters, mlcn, mlcn2, dev_size0;
struct NTFS_BOOT *boot;
struct buffer_head *bh;
struct MFT_REC *rec;
@@ -847,6 +867,9 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size,
u32 boot_off = 0;
const char *hint = "Primary boot";
+ /* Save original dev_size. Used with alternative boot. */
+ dev_size0 = dev_size;
+
sbi->volume.blocks = dev_size >> PAGE_SHIFT;
bh = ntfs_bread(sb, 0);
@@ -855,6 +878,11 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size,
check_boot:
err = -EINVAL;
+
+ /* Corrupted image; do not read OOB */
+ if (bh->b_size - sizeof(*boot) < boot_off)
+ goto out;
+
boot = (struct NTFS_BOOT *)Add2Ptr(bh->b_data, boot_off);
if (memcmp(boot->system_id, "NTFS ", sizeof("NTFS ") - 1)) {
@@ -901,9 +929,17 @@ check_boot:
goto out;
}
- sbi->record_size = record_size =
- boot->record_size < 0 ? 1 << (-boot->record_size) :
- (u32)boot->record_size << cluster_bits;
+ if (boot->record_size >= 0) {
+ record_size = (u32)boot->record_size << cluster_bits;
+ } else if (-boot->record_size <= MAXIMUM_SHIFT_BYTES_PER_MFT) {
+ record_size = 1u << (-boot->record_size);
+ } else {
+ ntfs_err(sb, "%s: invalid record size %d.", hint,
+ boot->record_size);
+ goto out;
+ }
+
+ sbi->record_size = record_size;
sbi->record_bits = blksize_bits(record_size);
sbi->attr_size_tr = (5 * record_size >> 4); // ~320 bytes
@@ -920,9 +956,15 @@ check_boot:
goto out;
}
- sbi->index_size = boot->index_size < 0 ?
- 1u << (-boot->index_size) :
- (u32)boot->index_size << cluster_bits;
+ if (boot->index_size >= 0) {
+ sbi->index_size = (u32)boot->index_size << cluster_bits;
+ } else if (-boot->index_size <= MAXIMUM_SHIFT_BYTES_PER_INDEX) {
+ sbi->index_size = 1u << (-boot->index_size);
+ } else {
+ ntfs_err(sb, "%s: invalid index size %d.", hint,
+ boot->index_size);
+ goto out;
+ }
/* Check index record size. */
if (sbi->index_size < SECTOR_SIZE || !is_power_of_2(sbi->index_size)) {
@@ -1057,17 +1099,17 @@ check_boot:
if (bh->b_blocknr && !sb_rdonly(sb)) {
/*
- * Alternative boot is ok but primary is not ok.
- * Do not update primary boot here 'cause it may be faked boot.
- * Let ntfs to be mounted and update boot later.
- */
+ * Alternative boot is ok but primary is not ok.
+ * Do not update primary boot here 'cause it may be faked boot.
+ * Let ntfs to be mounted and update boot later.
+ */
*boot2 = kmemdup(boot, sizeof(*boot), GFP_NOFS | __GFP_NOWARN);
}
out:
- if (err == -EINVAL && !bh->b_blocknr && dev_size > PAGE_SHIFT) {
+ if (err == -EINVAL && !bh->b_blocknr && dev_size0 > PAGE_SHIFT) {
u32 block_size = min_t(u32, sector_size, PAGE_SIZE);
- u64 lbo = dev_size - sizeof(*boot);
+ u64 lbo = dev_size0 - sizeof(*boot);
/*
* Try alternative boot (last sector)
@@ -1081,6 +1123,7 @@ out:
boot_off = lbo & (block_size - 1);
hint = "Alternative boot";
+ dev_size = dev_size0; /* restore original size. */
goto check_boot;
}
brelse(bh);
@@ -1369,7 +1412,7 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc)
}
bytes = inode->i_size;
- sbi->def_table = t = kmalloc(bytes, GFP_NOFS | __GFP_NOWARN);
+ sbi->def_table = t = kvmalloc(bytes, GFP_KERNEL);
if (!t) {
err = -ENOMEM;
goto put_inode_out;
@@ -1523,9 +1566,9 @@ load_root:
if (boot2) {
/*
- * Alternative boot is ok but primary is not ok.
- * Volume is recognized as NTFS. Update primary boot.
- */
+ * Alternative boot is ok but primary is not ok.
+ * Volume is recognized as NTFS. Update primary boot.
+ */
struct buffer_head *bh0 = sb_getblk(sb, 0);
if (bh0) {
if (buffer_locked(bh0))
@@ -1564,15 +1607,9 @@ load_root:
put_inode_out:
iput(inode);
out:
- /*
- * Free resources here.
- * ntfs_fs_free will be called with fc->s_fs_info = NULL
- */
- put_mount_options(sbi->options);
- put_ntfs(sbi);
- sb->s_fs_info = NULL;
+ ntfs3_put_sbi(sbi);
kfree(boot2);
-
+ ntfs3_put_sbi(sbi);
return err;
}
@@ -1658,8 +1695,10 @@ static void ntfs_fs_free(struct fs_context *fc)
struct ntfs_mount_options *opts = fc->fs_private;
struct ntfs_sb_info *sbi = fc->s_fs_info;
- if (sbi)
- put_ntfs(sbi);
+ if (sbi) {
+ ntfs3_put_sbi(sbi);
+ ntfs3_free_sbi(sbi);
+ }
if (opts)
put_mount_options(opts);
@@ -1728,13 +1767,24 @@ free_opts:
return -ENOMEM;
}
+static void ntfs3_kill_sb(struct super_block *sb)
+{
+ struct ntfs_sb_info *sbi = sb->s_fs_info;
+
+ kill_block_super(sb);
+
+ if (sbi->options)
+ put_mount_options(sbi->options);
+ ntfs3_free_sbi(sbi);
+}
+
// clang-format off
static struct file_system_type ntfs_fs_type = {
.owner = THIS_MODULE,
.name = "ntfs3",
.init_fs_context = ntfs_init_fs_context,
.parameters = ntfs_fs_parameters,
- .kill_sb = kill_block_super,
+ .kill_sb = ntfs3_kill_sb,
.fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
};
// clang-format on
@@ -1753,7 +1803,6 @@ static int __init init_ntfs_fs(void)
if (IS_ENABLED(CONFIG_NTFS3_LZX_XPRESS))
pr_info("ntfs3: Read-only LZX/Xpress compression included\n");
-
#ifdef CONFIG_PROC_FS
/* Create "/proc/fs/ntfs3" */
proc_info_root = proc_mkdir("fs/ntfs3", NULL);
@@ -1795,7 +1844,6 @@ static void __exit exit_ntfs_fs(void)
if (proc_info_root)
remove_proc_entry("fs/ntfs3", NULL);
#endif
-
}
MODULE_LICENSE("GPL");
diff --git a/fs/ntfs3/xattr.c b/fs/ntfs3/xattr.c
index 023f314e8950..4920548192a0 100644
--- a/fs/ntfs3/xattr.c
+++ b/fs/ntfs3/xattr.c
@@ -211,7 +211,8 @@ static ssize_t ntfs_list_ea(struct ntfs_inode *ni, char *buffer,
size = le32_to_cpu(info->size);
/* Enumerate all xattrs. */
- for (ret = 0, off = 0; off < size; off += ea_size) {
+ ret = 0;
+ for (off = 0; off + sizeof(struct EA_FULL) < size; off += ea_size) {
ea = Add2Ptr(ea_all, off);
ea_size = unpacked_ea_size(ea);
@@ -219,6 +220,10 @@ static ssize_t ntfs_list_ea(struct ntfs_inode *ni, char *buffer,
break;
if (buffer) {
+ /* Check if we can use field ea->name */
+ if (off + ea_size > size)
+ break;
+
if (ret + ea->name_len + 1 > bytes_per_buffer) {
err = -ERANGE;
goto out;
@@ -637,7 +642,7 @@ static noinline int ntfs_set_acl_ex(struct mnt_idmap *idmap,
if (!err) {
set_cached_acl(inode, type, acl);
inode->i_mode = mode;
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
mark_inode_dirty(inode);
}
@@ -924,7 +929,7 @@ set_new_fa:
NULL);
out:
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
mark_inode_dirty(inode);
return err;
diff --git a/fs/ocfs2/Kconfig b/fs/ocfs2/Kconfig
index 3123da7cfb30..2514d36cbe01 100644
--- a/fs/ocfs2/Kconfig
+++ b/fs/ocfs2/Kconfig
@@ -2,6 +2,7 @@
config OCFS2_FS
tristate "OCFS2 file system support"
depends on INET && SYSFS && CONFIGFS_FS
+ select BUFFER_HEAD
select JBD2
select CRC32
select QUOTA
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index 9fd03eaf15f8..e75137a8e7cb 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -191,10 +191,10 @@ static int ocfs2_acl_set_mode(struct inode *inode, struct buffer_head *di_bh,
}
inode->i_mode = new_mode;
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
di->i_mode = cpu_to_le16(inode->i_mode);
- di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
- di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+ di->i_ctime = cpu_to_le64(inode_get_ctime(inode).tv_sec);
+ di->i_ctime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec);
ocfs2_update_inode_fsync_trans(handle, inode, 0);
ocfs2_journal_dirty(handle, di_bh);
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 51c93929a146..aef58f1395c8 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -7436,10 +7436,10 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
}
inode->i_blocks = ocfs2_inode_sector_count(inode);
- inode->i_ctime = inode->i_mtime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
- di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
- di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+ di->i_ctime = di->i_mtime = cpu_to_le64(inode_get_ctime(inode).tv_sec);
+ di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec);
ocfs2_update_inode_fsync_trans(handle, inode, 1);
ocfs2_journal_dirty(handle, di_bh);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 8dfc284e85f0..0fdba30740ab 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -2048,7 +2048,7 @@ out_write_size:
}
inode->i_blocks = ocfs2_inode_sector_count(inode);
di->i_size = cpu_to_le64((u64)i_size_read(inode));
- inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
if (handle)
diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c
index 35c05c18de59..bc27301eab6d 100644
--- a/fs/ocfs2/cluster/netdebug.c
+++ b/fs/ocfs2/cluster/netdebug.c
@@ -44,17 +44,17 @@ static LIST_HEAD(send_tracking);
void o2net_debug_add_nst(struct o2net_send_tracking *nst)
{
- spin_lock(&o2net_debug_lock);
+ spin_lock_bh(&o2net_debug_lock);
list_add(&nst->st_net_debug_item, &send_tracking);
- spin_unlock(&o2net_debug_lock);
+ spin_unlock_bh(&o2net_debug_lock);
}
void o2net_debug_del_nst(struct o2net_send_tracking *nst)
{
- spin_lock(&o2net_debug_lock);
+ spin_lock_bh(&o2net_debug_lock);
if (!list_empty(&nst->st_net_debug_item))
list_del_init(&nst->st_net_debug_item);
- spin_unlock(&o2net_debug_lock);
+ spin_unlock_bh(&o2net_debug_lock);
}
static struct o2net_send_tracking
@@ -84,9 +84,9 @@ static void *nst_seq_start(struct seq_file *seq, loff_t *pos)
{
struct o2net_send_tracking *nst, *dummy_nst = seq->private;
- spin_lock(&o2net_debug_lock);
+ spin_lock_bh(&o2net_debug_lock);
nst = next_nst(dummy_nst);
- spin_unlock(&o2net_debug_lock);
+ spin_unlock_bh(&o2net_debug_lock);
return nst;
}
@@ -95,13 +95,13 @@ static void *nst_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
struct o2net_send_tracking *nst, *dummy_nst = seq->private;
- spin_lock(&o2net_debug_lock);
+ spin_lock_bh(&o2net_debug_lock);
nst = next_nst(dummy_nst);
list_del_init(&dummy_nst->st_net_debug_item);
if (nst)
list_add(&dummy_nst->st_net_debug_item,
&nst->st_net_debug_item);
- spin_unlock(&o2net_debug_lock);
+ spin_unlock_bh(&o2net_debug_lock);
return nst; /* unused, just needs to be null when done */
}
@@ -112,7 +112,7 @@ static int nst_seq_show(struct seq_file *seq, void *v)
ktime_t now;
s64 sock, send, status;
- spin_lock(&o2net_debug_lock);
+ spin_lock_bh(&o2net_debug_lock);
nst = next_nst(dummy_nst);
if (!nst)
goto out;
@@ -145,7 +145,7 @@ static int nst_seq_show(struct seq_file *seq, void *v)
(long long)status);
out:
- spin_unlock(&o2net_debug_lock);
+ spin_unlock_bh(&o2net_debug_lock);
return 0;
}
@@ -191,16 +191,16 @@ static const struct file_operations nst_seq_fops = {
void o2net_debug_add_sc(struct o2net_sock_container *sc)
{
- spin_lock(&o2net_debug_lock);
+ spin_lock_bh(&o2net_debug_lock);
list_add(&sc->sc_net_debug_item, &sock_containers);
- spin_unlock(&o2net_debug_lock);
+ spin_unlock_bh(&o2net_debug_lock);
}
void o2net_debug_del_sc(struct o2net_sock_container *sc)
{
- spin_lock(&o2net_debug_lock);
+ spin_lock_bh(&o2net_debug_lock);
list_del_init(&sc->sc_net_debug_item);
- spin_unlock(&o2net_debug_lock);
+ spin_unlock_bh(&o2net_debug_lock);
}
struct o2net_sock_debug {
@@ -236,9 +236,9 @@ static void *sc_seq_start(struct seq_file *seq, loff_t *pos)
struct o2net_sock_debug *sd = seq->private;
struct o2net_sock_container *sc, *dummy_sc = sd->dbg_sock;
- spin_lock(&o2net_debug_lock);
+ spin_lock_bh(&o2net_debug_lock);
sc = next_sc(dummy_sc);
- spin_unlock(&o2net_debug_lock);
+ spin_unlock_bh(&o2net_debug_lock);
return sc;
}
@@ -248,12 +248,12 @@ static void *sc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
struct o2net_sock_debug *sd = seq->private;
struct o2net_sock_container *sc, *dummy_sc = sd->dbg_sock;
- spin_lock(&o2net_debug_lock);
+ spin_lock_bh(&o2net_debug_lock);
sc = next_sc(dummy_sc);
list_del_init(&dummy_sc->sc_net_debug_item);
if (sc)
list_add(&dummy_sc->sc_net_debug_item, &sc->sc_net_debug_item);
- spin_unlock(&o2net_debug_lock);
+ spin_unlock_bh(&o2net_debug_lock);
return sc; /* unused, just needs to be null when done */
}
@@ -349,7 +349,7 @@ static int sc_seq_show(struct seq_file *seq, void *v)
struct o2net_sock_debug *sd = seq->private;
struct o2net_sock_container *sc, *dummy_sc = sd->dbg_sock;
- spin_lock(&o2net_debug_lock);
+ spin_lock_bh(&o2net_debug_lock);
sc = next_sc(dummy_sc);
if (sc) {
@@ -359,7 +359,7 @@ static int sc_seq_show(struct seq_file *seq, void *v)
sc_show_sock_stats(seq, sc);
}
- spin_unlock(&o2net_debug_lock);
+ spin_unlock_bh(&o2net_debug_lock);
return 0;
}
diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c
index 189c111bc371..15d0ed9c13e5 100644
--- a/fs/ocfs2/cluster/quorum.c
+++ b/fs/ocfs2/cluster/quorum.c
@@ -93,7 +93,7 @@ static void o2quo_make_decision(struct work_struct *work)
int lowest_hb, lowest_reachable = 0, fence = 0;
struct o2quo_state *qs = &o2quo_state;
- spin_lock(&qs->qs_lock);
+ spin_lock_bh(&qs->qs_lock);
lowest_hb = find_first_bit(qs->qs_hb_bm, O2NM_MAX_NODES);
if (lowest_hb != O2NM_MAX_NODES)
@@ -146,14 +146,14 @@ static void o2quo_make_decision(struct work_struct *work)
out:
if (fence) {
- spin_unlock(&qs->qs_lock);
+ spin_unlock_bh(&qs->qs_lock);
o2quo_fence_self();
} else {
mlog(ML_NOTICE, "not fencing this node, heartbeating: %d, "
"connected: %d, lowest: %d (%sreachable)\n",
qs->qs_heartbeating, qs->qs_connected, lowest_hb,
lowest_reachable ? "" : "un");
- spin_unlock(&qs->qs_lock);
+ spin_unlock_bh(&qs->qs_lock);
}
@@ -196,7 +196,7 @@ void o2quo_hb_up(u8 node)
{
struct o2quo_state *qs = &o2quo_state;
- spin_lock(&qs->qs_lock);
+ spin_lock_bh(&qs->qs_lock);
qs->qs_heartbeating++;
mlog_bug_on_msg(qs->qs_heartbeating == O2NM_MAX_NODES,
@@ -211,7 +211,7 @@ void o2quo_hb_up(u8 node)
else
o2quo_clear_hold(qs, node);
- spin_unlock(&qs->qs_lock);
+ spin_unlock_bh(&qs->qs_lock);
}
/* hb going down releases any holds we might have had due to this node from
@@ -220,7 +220,7 @@ void o2quo_hb_down(u8 node)
{
struct o2quo_state *qs = &o2quo_state;
- spin_lock(&qs->qs_lock);
+ spin_lock_bh(&qs->qs_lock);
qs->qs_heartbeating--;
mlog_bug_on_msg(qs->qs_heartbeating < 0,
@@ -233,7 +233,7 @@ void o2quo_hb_down(u8 node)
o2quo_clear_hold(qs, node);
- spin_unlock(&qs->qs_lock);
+ spin_unlock_bh(&qs->qs_lock);
}
/* this tells us that we've decided that the node is still heartbeating
@@ -245,14 +245,14 @@ void o2quo_hb_still_up(u8 node)
{
struct o2quo_state *qs = &o2quo_state;
- spin_lock(&qs->qs_lock);
+ spin_lock_bh(&qs->qs_lock);
mlog(0, "node %u\n", node);
qs->qs_pending = 1;
o2quo_clear_hold(qs, node);
- spin_unlock(&qs->qs_lock);
+ spin_unlock_bh(&qs->qs_lock);
}
/* This is analogous to hb_up. as a node's connection comes up we delay the
@@ -264,7 +264,7 @@ void o2quo_conn_up(u8 node)
{
struct o2quo_state *qs = &o2quo_state;
- spin_lock(&qs->qs_lock);
+ spin_lock_bh(&qs->qs_lock);
qs->qs_connected++;
mlog_bug_on_msg(qs->qs_connected == O2NM_MAX_NODES,
@@ -279,7 +279,7 @@ void o2quo_conn_up(u8 node)
else
o2quo_clear_hold(qs, node);
- spin_unlock(&qs->qs_lock);
+ spin_unlock_bh(&qs->qs_lock);
}
/* we've decided that we won't ever be connecting to the node again. if it's
@@ -290,7 +290,7 @@ void o2quo_conn_err(u8 node)
{
struct o2quo_state *qs = &o2quo_state;
- spin_lock(&qs->qs_lock);
+ spin_lock_bh(&qs->qs_lock);
if (test_bit(node, qs->qs_conn_bm)) {
qs->qs_connected--;
@@ -307,7 +307,7 @@ void o2quo_conn_err(u8 node)
mlog(0, "node %u, %d total\n", node, qs->qs_connected);
- spin_unlock(&qs->qs_lock);
+ spin_unlock_bh(&qs->qs_lock);
}
void o2quo_init(void)
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 694471fc46b8..8b123d543e6e 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -1658,7 +1658,7 @@ int __ocfs2_add_entry(handle_t *handle,
offset, ocfs2_dir_trailer_blk_off(dir->i_sb));
if (ocfs2_dirent_would_fit(de, rec_len)) {
- dir->i_mtime = dir->i_ctime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
retval = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh);
if (retval < 0) {
mlog_errno(retval);
@@ -2962,11 +2962,11 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
ocfs2_dinode_new_extent_list(dir, di);
i_size_write(dir, sb->s_blocksize);
- dir->i_mtime = dir->i_ctime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
di->i_size = cpu_to_le64(sb->s_blocksize);
- di->i_ctime = di->i_mtime = cpu_to_le64(dir->i_ctime.tv_sec);
- di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(dir->i_ctime.tv_nsec);
+ di->i_ctime = di->i_mtime = cpu_to_le64(inode_get_ctime(dir).tv_sec);
+ di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode_get_ctime(dir).tv_nsec);
ocfs2_update_inode_fsync_trans(handle, dir, 1);
/*
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index ba26c5567cff..81265123ce6c 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -337,7 +337,7 @@ static struct inode *dlmfs_get_root_inode(struct super_block *sb)
if (inode) {
inode->i_ino = get_next_ino();
inode_init_owner(&nop_mnt_idmap, inode, NULL, mode);
- inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
inc_nlink(inode);
inode->i_fop = &simple_dir_operations;
@@ -360,7 +360,7 @@ static struct inode *dlmfs_get_inode(struct inode *parent,
inode->i_ino = get_next_ino();
inode_init_owner(&nop_mnt_idmap, inode, parent, mode);
- inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
ip = DLMFS_I(inode);
ip->ip_conn = DLMFS_I(parent)->ip_conn;
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index c28bc983a7b1..c3e2961ee5db 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -2162,6 +2162,7 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode)
struct ocfs2_inode_info *oi = OCFS2_I(inode);
struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres;
struct ocfs2_meta_lvb *lvb;
+ struct timespec64 ctime = inode_get_ctime(inode);
lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
@@ -2185,7 +2186,7 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode)
lvb->lvb_iatime_packed =
cpu_to_be64(ocfs2_pack_timespec(&inode->i_atime));
lvb->lvb_ictime_packed =
- cpu_to_be64(ocfs2_pack_timespec(&inode->i_ctime));
+ cpu_to_be64(ocfs2_pack_timespec(&ctime));
lvb->lvb_imtime_packed =
cpu_to_be64(ocfs2_pack_timespec(&inode->i_mtime));
lvb->lvb_iattr = cpu_to_be32(oi->ip_attr);
@@ -2208,6 +2209,7 @@ static int ocfs2_refresh_inode_from_lvb(struct inode *inode)
struct ocfs2_inode_info *oi = OCFS2_I(inode);
struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres;
struct ocfs2_meta_lvb *lvb;
+ struct timespec64 ctime;
mlog_meta_lvb(0, lockres);
@@ -2238,8 +2240,9 @@ static int ocfs2_refresh_inode_from_lvb(struct inode *inode)
be64_to_cpu(lvb->lvb_iatime_packed));
ocfs2_unpack_timespec(&inode->i_mtime,
be64_to_cpu(lvb->lvb_imtime_packed));
- ocfs2_unpack_timespec(&inode->i_ctime,
+ ocfs2_unpack_timespec(&ctime,
be64_to_cpu(lvb->lvb_ictime_packed));
+ inode_set_ctime_to_ts(inode, ctime);
spin_unlock(&oi->ip_lock);
return 0;
}
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index bf2c17ea96a0..c45596c25c66 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -232,8 +232,10 @@ int ocfs2_should_update_atime(struct inode *inode,
return 0;
if (vfsmnt->mnt_flags & MNT_RELATIME) {
+ struct timespec64 ctime = inode_get_ctime(inode);
+
if ((timespec64_compare(&inode->i_atime, &inode->i_mtime) <= 0) ||
- (timespec64_compare(&inode->i_atime, &inode->i_ctime) <= 0))
+ (timespec64_compare(&inode->i_atime, &ctime) <= 0))
return 1;
return 0;
@@ -294,7 +296,7 @@ int ocfs2_set_inode_size(handle_t *handle,
i_size_write(inode, new_i_size);
inode->i_blocks = ocfs2_inode_sector_count(inode);
- inode->i_ctime = inode->i_mtime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
if (status < 0) {
@@ -415,12 +417,12 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
}
i_size_write(inode, new_i_size);
- inode->i_ctime = inode->i_mtime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
di = (struct ocfs2_dinode *) fe_bh->b_data;
di->i_size = cpu_to_le64(new_i_size);
- di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
- di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+ di->i_ctime = di->i_mtime = cpu_to_le64(inode_get_ctime(inode).tv_sec);
+ di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec);
ocfs2_update_inode_fsync_trans(handle, inode, 0);
ocfs2_journal_dirty(handle, fe_bh);
@@ -808,12 +810,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
/* must not update i_size! */
- ret = block_commit_write(page, block_start + 1,
- block_start + 1);
- if (ret < 0)
- mlog_errno(ret);
- else
- ret = 0;
+ block_commit_write(page, block_start + 1, block_start + 1);
}
/*
@@ -824,7 +821,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
i_size_write(inode, abs_to);
inode->i_blocks = ocfs2_inode_sector_count(inode);
di->i_size = cpu_to_le64((u64)i_size_read(inode));
- inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
di->i_mtime_nsec = di->i_ctime_nsec;
@@ -1317,7 +1314,7 @@ int ocfs2_getattr(struct mnt_idmap *idmap, const struct path *path,
goto bail;
}
- generic_fillattr(&nop_mnt_idmap, inode, stat);
+ generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
/*
* If there is inline data in the inode, the inode will normally not
* have data blocks allocated (it may have an external xattr block).
@@ -2043,7 +2040,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
goto out_inode_unlock;
}
- inode->i_ctime = inode->i_mtime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
ret = ocfs2_mark_inode_dirty(handle, inode, di_bh);
if (ret < 0)
mlog_errno(ret);
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index bb116c39b581..e8771600b930 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -306,8 +306,8 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec);
inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime);
inode->i_mtime.tv_nsec = le32_to_cpu(fe->i_mtime_nsec);
- inode->i_ctime.tv_sec = le64_to_cpu(fe->i_ctime);
- inode->i_ctime.tv_nsec = le32_to_cpu(fe->i_ctime_nsec);
+ inode_set_ctime(inode, le64_to_cpu(fe->i_ctime),
+ le32_to_cpu(fe->i_ctime_nsec));
if (OCFS2_I(inode)->ip_blkno != le64_to_cpu(fe->i_blkno))
mlog(ML_ERROR,
@@ -1314,8 +1314,8 @@ int ocfs2_mark_inode_dirty(handle_t *handle,
fe->i_mode = cpu_to_le16(inode->i_mode);
fe->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
fe->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
- fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
- fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+ fe->i_ctime = cpu_to_le64(inode_get_ctime(inode).tv_sec);
+ fe->i_ctime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec);
fe->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec);
fe->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
@@ -1352,8 +1352,8 @@ void ocfs2_refresh_inode(struct inode *inode,
inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec);
inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime);
inode->i_mtime.tv_nsec = le32_to_cpu(fe->i_mtime_nsec);
- inode->i_ctime.tv_sec = le64_to_cpu(fe->i_ctime);
- inode->i_ctime.tv_nsec = le32_to_cpu(fe->i_ctime_nsec);
+ inode_set_ctime(inode, le64_to_cpu(fe->i_ctime),
+ le32_to_cpu(fe->i_ctime_nsec));
spin_unlock(&OCFS2_I(inode)->ip_lock);
}
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 25d8072ccfce..ce215565d061 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -114,9 +114,9 @@ int ocfs2_compute_replay_slots(struct ocfs2_super *osb)
if (osb->replay_map)
return 0;
- replay_map = kzalloc(sizeof(struct ocfs2_replay_map) +
- (osb->max_slots * sizeof(char)), GFP_KERNEL);
-
+ replay_map = kzalloc(struct_size(replay_map, rm_replay_slots,
+ osb->max_slots),
+ GFP_KERNEL);
if (!replay_map) {
mlog_errno(-ENOMEM);
return -ENOMEM;
@@ -178,16 +178,13 @@ int ocfs2_recovery_init(struct ocfs2_super *osb)
osb->recovery_thread_task = NULL;
init_waitqueue_head(&osb->recovery_event);
- rm = kzalloc(sizeof(struct ocfs2_recovery_map) +
- osb->max_slots * sizeof(unsigned int),
+ rm = kzalloc(struct_size(rm, rm_entries, osb->max_slots),
GFP_KERNEL);
if (!rm) {
mlog_errno(-ENOMEM);
return -ENOMEM;
}
- rm->rm_entries = (unsigned int *)((char *)rm +
- sizeof(struct ocfs2_recovery_map));
osb->recovery_map = rm;
return 0;
@@ -557,7 +554,7 @@ static void ocfs2_abort_trigger(struct jbd2_buffer_trigger_type *triggers,
(unsigned long)bh,
(unsigned long long)bh->b_blocknr);
- ocfs2_error(bh->b_bdev->bd_super,
+ ocfs2_error(bh->b_assoc_map->host->i_sb,
"JBD2 has aborted our journal, ocfs2 cannot continue\n");
}
@@ -780,14 +777,14 @@ void ocfs2_journal_dirty(handle_t *handle, struct buffer_head *bh)
mlog_errno(status);
if (!is_handle_aborted(handle)) {
journal_t *journal = handle->h_transaction->t_journal;
- struct super_block *sb = bh->b_bdev->bd_super;
mlog(ML_ERROR, "jbd2_journal_dirty_metadata failed. "
"Aborting transaction and journal.\n");
handle->h_err = status;
jbd2_journal_abort_handle(handle);
jbd2_journal_abort(journal, status);
- ocfs2_abort(sb, "Journal already aborted.\n");
+ ocfs2_abort(bh->b_assoc_map->host->i_sb,
+ "Journal already aborted.\n");
}
}
}
@@ -911,9 +908,9 @@ int ocfs2_journal_init(struct ocfs2_super *osb, int *dirty)
/* call the kernels journal init function now */
j_journal = jbd2_journal_init_inode(inode);
- if (j_journal == NULL) {
+ if (IS_ERR(j_journal)) {
mlog(ML_ERROR, "Linux journal layer error\n");
- status = -EINVAL;
+ status = PTR_ERR(j_journal);
goto done;
}
@@ -1687,9 +1684,9 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
}
journal = jbd2_journal_init_inode(inode);
- if (journal == NULL) {
+ if (IS_ERR(journal)) {
mlog(ML_ERROR, "Linux journal layer error\n");
- status = -EIO;
+ status = PTR_ERR(journal);
goto done;
}
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 41c382f68529..41c9fe7e62f9 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -29,7 +29,7 @@ struct ocfs2_dinode;
struct ocfs2_recovery_map {
unsigned int rm_used;
- unsigned int *rm_entries;
+ unsigned int rm_entries[];
};
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
index b1e32ec4a9d4..05d67968a3a9 100644
--- a/fs/ocfs2/move_extents.c
+++ b/fs/ocfs2/move_extents.c
@@ -950,9 +950,9 @@ static int ocfs2_move_extents(struct ocfs2_move_extents_context *context)
}
di = (struct ocfs2_dinode *)di_bh->b_data;
- inode->i_ctime = current_time(inode);
- di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
- di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+ inode_set_ctime_current(inode);
+ di->i_ctime = cpu_to_le64(inode_get_ctime(inode).tv_sec);
+ di->i_ctime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec);
ocfs2_update_inode_fsync_trans(handle, inode, 0);
ocfs2_journal_dirty(handle, di_bh);
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 17c52225b87d..5cd6d7771cea 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -793,10 +793,10 @@ static int ocfs2_link(struct dentry *old_dentry,
}
inc_nlink(inode);
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
ocfs2_set_links_count(fe, inode->i_nlink);
- fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
- fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+ fe->i_ctime = cpu_to_le64(inode_get_ctime(inode).tv_sec);
+ fe->i_ctime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec);
ocfs2_journal_dirty(handle, fe_bh);
err = ocfs2_add_entry(handle, dentry, inode,
@@ -995,7 +995,7 @@ static int ocfs2_unlink(struct inode *dir,
ocfs2_set_links_count(fe, inode->i_nlink);
ocfs2_journal_dirty(handle, fe_bh);
- dir->i_ctime = dir->i_mtime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
if (S_ISDIR(inode->i_mode))
drop_nlink(dir);
@@ -1535,9 +1535,13 @@ static int ocfs2_rename(struct mnt_idmap *idmap,
status = ocfs2_add_entry(handle, new_dentry, old_inode,
OCFS2_I(old_inode)->ip_blkno,
new_dir_bh, &target_insert);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
}
- old_inode->i_ctime = current_time(old_inode);
+ inode_set_ctime_current(old_inode);
mark_inode_dirty(old_inode);
status = ocfs2_journal_access_di(handle, INODE_CACHE(old_inode),
@@ -1546,8 +1550,8 @@ static int ocfs2_rename(struct mnt_idmap *idmap,
if (status >= 0) {
old_di = (struct ocfs2_dinode *) old_inode_bh->b_data;
- old_di->i_ctime = cpu_to_le64(old_inode->i_ctime.tv_sec);
- old_di->i_ctime_nsec = cpu_to_le32(old_inode->i_ctime.tv_nsec);
+ old_di->i_ctime = cpu_to_le64(inode_get_ctime(old_inode).tv_sec);
+ old_di->i_ctime_nsec = cpu_to_le32(inode_get_ctime(old_inode).tv_nsec);
ocfs2_journal_dirty(handle, old_inode_bh);
} else
mlog_errno(status);
@@ -1586,9 +1590,9 @@ static int ocfs2_rename(struct mnt_idmap *idmap,
if (new_inode) {
drop_nlink(new_inode);
- new_inode->i_ctime = current_time(new_inode);
+ inode_set_ctime_current(new_inode);
}
- old_dir->i_ctime = old_dir->i_mtime = current_time(old_dir);
+ old_dir->i_mtime = inode_set_ctime_current(old_dir);
if (update_dot_dot) {
status = ocfs2_update_entry(old_inode, handle,
@@ -1610,7 +1614,8 @@ static int ocfs2_rename(struct mnt_idmap *idmap,
if (old_dir != new_dir) {
/* Keep the same times on both directories.*/
- new_dir->i_ctime = new_dir->i_mtime = old_dir->i_ctime;
+ new_dir->i_mtime = inode_set_ctime_to_ts(new_dir,
+ inode_get_ctime(old_dir));
/*
* This will also pick up the i_nlink change from the
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 564ab48d03ef..25c8ec3c8c3a 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -3750,9 +3750,9 @@ static int ocfs2_change_ctime(struct inode *inode,
goto out_commit;
}
- inode->i_ctime = current_time(inode);
- di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
- di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+ inode_set_ctime_current(inode);
+ di->i_ctime = cpu_to_le64(inode_get_ctime(inode).tv_sec);
+ di->i_ctime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec);
ocfs2_journal_dirty(handle, di_bh);
@@ -4073,10 +4073,10 @@ static int ocfs2_complete_reflink(struct inode *s_inode,
* we want mtime to appear identical to the source and
* update ctime.
*/
- t_inode->i_ctime = current_time(t_inode);
+ inode_set_ctime_current(t_inode);
- di->i_ctime = cpu_to_le64(t_inode->i_ctime.tv_sec);
- di->i_ctime_nsec = cpu_to_le32(t_inode->i_ctime.tv_nsec);
+ di->i_ctime = cpu_to_le64(inode_get_ctime(t_inode).tv_sec);
+ di->i_ctime_nsec = cpu_to_le32(inode_get_ctime(t_inode).tv_nsec);
t_inode->i_mtime = s_inode->i_mtime;
di->i_mtime = s_di->i_mtime;
@@ -4456,7 +4456,7 @@ int ocfs2_reflink_update_dest(struct inode *dest,
if (newlen > i_size_read(dest))
i_size_write(dest, newlen);
spin_unlock(&OCFS2_I(dest)->ip_lock);
- dest->i_ctime = dest->i_mtime = current_time(dest);
+ dest->i_mtime = inode_set_ctime_current(dest);
ret = ocfs2_mark_inode_dirty(handle, dest, d_bh);
if (ret) {
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index 05d4414d0c33..9b76ee66aeb2 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -738,18 +738,11 @@ static int user_plock(struct ocfs2_cluster_connection *conn,
*
* Internally, fs/dlm will pass these to a misc device, which
* a userspace daemon will read and write to.
- *
- * For now, cancel requests (which happen internally only),
- * are turned into unlocks. Most of this function taken from
- * gfs2_lock.
*/
- if (cmd == F_CANCELLK) {
- cmd = F_SETLK;
- fl->fl_type = F_UNLCK;
- }
-
- if (IS_GETLK(cmd))
+ if (cmd == F_CANCELLK)
+ return dlm_posix_cancel(conn->cc_lockspace, ino, file, fl);
+ else if (IS_GETLK(cmd))
return dlm_posix_get(conn->cc_lockspace, ino, file, fl);
else if (fl->fl_type == F_UNLCK)
return dlm_posix_unlock(conn->cc_lockspace, ino, file, fl);
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 988d1c076861..6b906424902b 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1517,8 +1517,7 @@ static int ocfs2_show_options(struct seq_file *s, struct dentry *root)
seq_printf(s, ",localflocks,");
if (osb->osb_cluster_stack[0])
- seq_show_option_n(s, "cluster_stack", osb->osb_cluster_stack,
- OCFS2_STACK_LABEL_LEN);
+ seq_show_option(s, "cluster_stack", osb->osb_cluster_stack);
if (opts & OCFS2_MOUNT_USRQUOTA)
seq_printf(s, ",usrquota");
if (opts & OCFS2_MOUNT_GRPQUOTA)
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 4ac77ff6e676..6510ad783c91 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -3421,9 +3421,9 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
goto out;
}
- inode->i_ctime = current_time(inode);
- di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
- di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+ inode_set_ctime_current(inode);
+ di->i_ctime = cpu_to_le64(inode_get_ctime(inode).tv_sec);
+ di->i_ctime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec);
ocfs2_journal_dirty(ctxt->handle, xis->inode_bh);
}
out:
diff --git a/fs/omfs/Kconfig b/fs/omfs/Kconfig
index 42b2ec35a05b..8470f6c3e64e 100644
--- a/fs/omfs/Kconfig
+++ b/fs/omfs/Kconfig
@@ -2,6 +2,7 @@
config OMFS_FS
tristate "SonicBlue Optimized MPEG File System support"
depends on BLOCK
+ select BUFFER_HEAD
select CRC_ITU_T
help
This is the proprietary file system used by the Rio Karma music
diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c
index 82cf7e9a665f..6bda275826d6 100644
--- a/fs/omfs/dir.c
+++ b/fs/omfs/dir.c
@@ -143,7 +143,7 @@ static int omfs_add_link(struct dentry *dentry, struct inode *inode)
mark_buffer_dirty(bh);
brelse(bh);
- dir->i_ctime = current_time(dir);
+ inode_set_ctime_current(dir);
/* mark affected inodes dirty to rebuild checksums */
mark_inode_dirty(dir);
@@ -399,7 +399,7 @@ static int omfs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
if (err)
goto out;
- old_inode->i_ctime = current_time(old_inode);
+ inode_set_ctime_current(old_inode);
mark_inode_dirty(old_inode);
out:
return err;
diff --git a/fs/omfs/file.c b/fs/omfs/file.c
index de8f57ee39ec..6b580b9da8e3 100644
--- a/fs/omfs/file.c
+++ b/fs/omfs/file.c
@@ -14,7 +14,7 @@ static u32 omfs_max_extents(struct omfs_sb_info *sbi, int offset)
{
return (sbi->s_sys_blocksize - offset -
sizeof(struct omfs_extent)) /
- sizeof(struct omfs_extent_entry) + 1;
+ sizeof(struct omfs_extent_entry);
}
void omfs_make_empty_table(struct buffer_head *bh, int offset)
@@ -24,8 +24,8 @@ void omfs_make_empty_table(struct buffer_head *bh, int offset)
oe->e_next = ~cpu_to_be64(0ULL);
oe->e_extent_count = cpu_to_be32(1),
oe->e_fill = cpu_to_be32(0x22),
- oe->e_entry.e_cluster = ~cpu_to_be64(0ULL);
- oe->e_entry.e_blocks = ~cpu_to_be64(0ULL);
+ oe->e_entry[0].e_cluster = ~cpu_to_be64(0ULL);
+ oe->e_entry[0].e_blocks = ~cpu_to_be64(0ULL);
}
int omfs_shrink_inode(struct inode *inode)
@@ -68,7 +68,7 @@ int omfs_shrink_inode(struct inode *inode)
last = next;
next = be64_to_cpu(oe->e_next);
- entry = &oe->e_entry;
+ entry = oe->e_entry;
/* ignore last entry as it is the terminator */
for (; extent_count > 1; extent_count--) {
@@ -117,7 +117,7 @@ static int omfs_grow_extent(struct inode *inode, struct omfs_extent *oe,
u64 *ret_block)
{
struct omfs_extent_entry *terminator;
- struct omfs_extent_entry *entry = &oe->e_entry;
+ struct omfs_extent_entry *entry = oe->e_entry;
struct omfs_sb_info *sbi = OMFS_SB(inode->i_sb);
u32 extent_count = be32_to_cpu(oe->e_extent_count);
u64 new_block = 0;
@@ -245,7 +245,7 @@ static int omfs_get_block(struct inode *inode, sector_t block,
extent_count = be32_to_cpu(oe->e_extent_count);
next = be64_to_cpu(oe->e_next);
- entry = &oe->e_entry;
+ entry = oe->e_entry;
if (extent_count > max_extents)
goto out_brelse;
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index c4c79e07efc7..2f8c1882f45c 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -51,7 +51,7 @@ struct inode *omfs_new_inode(struct inode *dir, umode_t mode)
inode_init_owner(&nop_mnt_idmap, inode, NULL, mode);
inode->i_mapping->a_ops = &omfs_aops;
- inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
switch (mode & S_IFMT) {
case S_IFDIR:
inode->i_op = &omfs_dir_inops;
@@ -134,8 +134,8 @@ static int __omfs_write_inode(struct inode *inode, int wait)
oi->i_head.h_magic = OMFS_IMAGIC;
oi->i_size = cpu_to_be64(inode->i_size);
- ctime = inode->i_ctime.tv_sec * 1000LL +
- ((inode->i_ctime.tv_nsec + 999)/1000);
+ ctime = inode_get_ctime(inode).tv_sec * 1000LL +
+ ((inode_get_ctime(inode).tv_nsec + 999)/1000);
oi->i_ctime = cpu_to_be64(ctime);
omfs_update_checksums(oi);
@@ -232,10 +232,9 @@ struct inode *omfs_iget(struct super_block *sb, ino_t ino)
inode->i_atime.tv_sec = ctime;
inode->i_mtime.tv_sec = ctime;
- inode->i_ctime.tv_sec = ctime;
+ inode_set_ctime(inode, ctime, nsecs);
inode->i_atime.tv_nsec = nsecs;
inode->i_mtime.tv_nsec = nsecs;
- inode->i_ctime.tv_nsec = nsecs;
inode->i_mapping->a_ops = &omfs_aops;
diff --git a/fs/omfs/omfs_fs.h b/fs/omfs/omfs_fs.h
index caecb3d5a344..1ff6b9e41297 100644
--- a/fs/omfs/omfs_fs.h
+++ b/fs/omfs/omfs_fs.h
@@ -77,7 +77,7 @@ struct omfs_extent {
__be64 e_next; /* next extent table location */
__be32 e_extent_count; /* total # extents in this table */
__be32 e_fill;
- struct omfs_extent_entry e_entry; /* start of extent entries */
+ struct omfs_extent_entry e_entry[]; /* start of extent entries */
};
#endif
diff --git a/fs/open.c b/fs/open.c
index e6ead0f19964..98f6601fbac6 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -671,11 +671,20 @@ SYSCALL_DEFINE2(fchmod, unsigned int, fd, umode_t, mode)
return err;
}
-static int do_fchmodat(int dfd, const char __user *filename, umode_t mode)
+static int do_fchmodat(int dfd, const char __user *filename, umode_t mode,
+ unsigned int flags)
{
struct path path;
int error;
- unsigned int lookup_flags = LOOKUP_FOLLOW;
+ unsigned int lookup_flags;
+
+ if (unlikely(flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)))
+ return -EINVAL;
+
+ lookup_flags = (flags & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW;
+ if (flags & AT_EMPTY_PATH)
+ lookup_flags |= LOOKUP_EMPTY;
+
retry:
error = user_path_at(dfd, filename, lookup_flags, &path);
if (!error) {
@@ -689,15 +698,21 @@ retry:
return error;
}
+SYSCALL_DEFINE4(fchmodat2, int, dfd, const char __user *, filename,
+ umode_t, mode, unsigned int, flags)
+{
+ return do_fchmodat(dfd, filename, mode, flags);
+}
+
SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename,
umode_t, mode)
{
- return do_fchmodat(dfd, filename, mode);
+ return do_fchmodat(dfd, filename, mode, 0);
}
SYSCALL_DEFINE2(chmod, const char __user *, filename, umode_t, mode)
{
- return do_fchmodat(AT_FDCWD, filename, mode);
+ return do_fchmodat(AT_FDCWD, filename, mode, 0);
}
/*
@@ -1150,7 +1165,7 @@ EXPORT_SYMBOL_GPL(kernel_file_open);
* backing_file_open - open a backing file for kernel internal use
* @path: path of the file to open
* @flags: open flags
- * @path: path of the backing file
+ * @real_path: path of the backing file
* @cred: credentials for open
*
* Open a backing file for a stackable filesystem (e.g., overlayfs).
@@ -1503,7 +1518,7 @@ SYSCALL_DEFINE2(creat, const char __user *, pathname, umode_t, mode)
* "id" is the POSIX thread ID. We use the
* files pointer for this..
*/
-int filp_close(struct file *filp, fl_owner_t id)
+static int filp_flush(struct file *filp, fl_owner_t id)
{
int retval = 0;
@@ -1520,10 +1535,18 @@ int filp_close(struct file *filp, fl_owner_t id)
dnotify_flush(filp, id);
locks_remove_posix(filp, id);
}
- fput(filp);
return retval;
}
+int filp_close(struct file *filp, fl_owner_t id)
+{
+ int retval;
+
+ retval = filp_flush(filp, id);
+ fput(filp);
+
+ return retval;
+}
EXPORT_SYMBOL(filp_close);
/*
@@ -1533,7 +1556,20 @@ EXPORT_SYMBOL(filp_close);
*/
SYSCALL_DEFINE1(close, unsigned int, fd)
{
- int retval = close_fd(fd);
+ int retval;
+ struct file *file;
+
+ file = close_fd_get_file(fd);
+ if (!file)
+ return -EBADF;
+
+ retval = filp_flush(file, current->files);
+
+ /*
+ * We're returning to user space. Don't bother
+ * with any delayed fput() cases.
+ */
+ __fput_sync(file);
/* can't restart close syscall because file table entry was cleared */
if (unlikely(retval == -ERESTARTSYS ||
@@ -1546,7 +1582,7 @@ SYSCALL_DEFINE1(close, unsigned int, fd)
}
/**
- * close_range() - Close all file descriptors in a given range.
+ * sys_close_range() - Close all file descriptors in a given range.
*
* @fd: starting file descriptor to close
* @max_fd: last file descriptor to close
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index f0b7f4d51a17..b2457cb97fa0 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -237,7 +237,7 @@ found:
if (IS_ERR(inode))
return ERR_CAST(inode);
if (inode->i_state & I_NEW) {
- inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
ent_oi = OP_I(inode);
ent_oi->type = ent_type;
ent_oi->u = ent_data;
@@ -387,8 +387,7 @@ static int openprom_fill_super(struct super_block *s, struct fs_context *fc)
goto out_no_root;
}
- root_inode->i_mtime = root_inode->i_atime =
- root_inode->i_ctime = current_time(root_inode);
+ root_inode->i_mtime = root_inode->i_atime = inode_set_ctime_current(root_inode);
root_inode->i_op = &openprom_inode_operations;
root_inode->i_fop = &openprom_operations;
root_inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index 9014bbcc8031..085912268442 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -871,7 +871,7 @@ int orangefs_getattr(struct mnt_idmap *idmap, const struct path *path,
ret = orangefs_inode_getattr(inode,
request_mask & STATX_SIZE ? ORANGEFS_GETATTR_SIZE : 0);
if (ret == 0) {
- generic_fillattr(&nop_mnt_idmap, inode, stat);
+ generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
/* override block size reported to stat */
if (!(request_mask & STATX_SIZE))
@@ -900,12 +900,13 @@ int orangefs_permission(struct mnt_idmap *idmap,
return generic_permission(&nop_mnt_idmap, inode, mask);
}
-int orangefs_update_time(struct inode *inode, struct timespec64 *time, int flags)
+int orangefs_update_time(struct inode *inode, int flags)
{
struct iattr iattr;
+
gossip_debug(GOSSIP_INODE_DEBUG, "orangefs_update_time: %pU\n",
get_khandle_from_ino(inode));
- generic_update_time(inode, time, flags);
+ flags = generic_update_time(inode, flags);
memset(&iattr, 0, sizeof iattr);
if (flags & S_ATIME)
iattr.ia_valid |= ATTR_ATIME;
diff --git a/fs/orangefs/namei.c b/fs/orangefs/namei.c
index 77518e248cf7..c9dfd5c6a097 100644
--- a/fs/orangefs/namei.c
+++ b/fs/orangefs/namei.c
@@ -421,7 +421,7 @@ static int orangefs_rename(struct mnt_idmap *idmap,
ret);
if (new_dentry->d_inode)
- new_dentry->d_inode->i_ctime = current_time(new_dentry->d_inode);
+ inode_set_ctime_current(d_inode(new_dentry));
op_release(new_op);
return ret;
diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h
index ce20d3443869..b711654ca18a 100644
--- a/fs/orangefs/orangefs-kernel.h
+++ b/fs/orangefs/orangefs-kernel.h
@@ -370,7 +370,7 @@ int orangefs_getattr(struct mnt_idmap *idmap, const struct path *path,
int orangefs_permission(struct mnt_idmap *idmap,
struct inode *inode, int mask);
-int orangefs_update_time(struct inode *, struct timespec64 *, int);
+int orangefs_update_time(struct inode *, int);
/*
* defined in xattr.c
diff --git a/fs/orangefs/orangefs-utils.c b/fs/orangefs/orangefs-utils.c
index 46b7dcff18ac..0a9fcfdf552f 100644
--- a/fs/orangefs/orangefs-utils.c
+++ b/fs/orangefs/orangefs-utils.c
@@ -361,11 +361,11 @@ again2:
downcall.resp.getattr.attributes.atime;
inode->i_mtime.tv_sec = (time64_t)new_op->
downcall.resp.getattr.attributes.mtime;
- inode->i_ctime.tv_sec = (time64_t)new_op->
- downcall.resp.getattr.attributes.ctime;
+ inode_set_ctime(inode,
+ (time64_t)new_op->downcall.resp.getattr.attributes.ctime,
+ 0);
inode->i_atime.tv_nsec = 0;
inode->i_mtime.tv_nsec = 0;
- inode->i_ctime.tv_nsec = 0;
/* special case: mark the root inode as sticky */
inode->i_mode = type | (is_root_handle(inode) ? S_ISVTX : 0) |
diff --git a/fs/overlayfs/Kconfig b/fs/overlayfs/Kconfig
index 6708e54b0e30..fec5020c3495 100644
--- a/fs/overlayfs/Kconfig
+++ b/fs/overlayfs/Kconfig
@@ -124,3 +124,12 @@ config OVERLAY_FS_METACOPY
that doesn't support this feature will have unexpected results.
If unsure, say N.
+
+config OVERLAY_FS_DEBUG
+ bool "Overlayfs: turn on extra debugging checks"
+ default n
+ depends on OVERLAY_FS
+ help
+ Say Y here to enable extra debugging checks in overlayfs.
+
+ If unsure, say N.
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index 568f743a5584..ada3fcc9c6d5 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -337,7 +337,7 @@ static int ovl_set_timestamps(struct ovl_fs *ofs, struct dentry *upperdentry,
{
struct iattr attr = {
.ia_valid =
- ATTR_ATIME | ATTR_MTIME | ATTR_ATIME_SET | ATTR_MTIME_SET,
+ ATTR_ATIME | ATTR_MTIME | ATTR_ATIME_SET | ATTR_MTIME_SET | ATTR_CTIME,
.ia_atime = stat->atime,
.ia_mtime = stat->mtime,
};
@@ -416,7 +416,7 @@ struct ovl_fh *ovl_encode_real_fh(struct ovl_fs *ofs, struct dentry *real,
if (is_upper)
fh->fb.flags |= OVL_FH_FLAG_PATH_UPPER;
fh->fb.len = sizeof(fh->fb) + buflen;
- if (ofs->config.uuid)
+ if (ovl_origin_uuid(ofs))
fh->fb.uuid = *uuid;
return fh;
@@ -544,6 +544,7 @@ struct ovl_copy_up_ctx {
bool origin;
bool indexed;
bool metacopy;
+ bool metacopy_digest;
};
static int ovl_link_up(struct ovl_copy_up_ctx *c)
@@ -617,7 +618,8 @@ static int ovl_copy_up_metadata(struct ovl_copy_up_ctx *c, struct dentry *temp)
if (err)
return err;
- if (inode->i_flags & OVL_COPY_I_FLAGS_MASK) {
+ if (inode->i_flags & OVL_COPY_I_FLAGS_MASK &&
+ (S_ISREG(c->stat.mode) || S_ISDIR(c->stat.mode))) {
/*
* Copy the fileattr inode flags that are the source of already
* copied i_flags
@@ -641,8 +643,20 @@ static int ovl_copy_up_metadata(struct ovl_copy_up_ctx *c, struct dentry *temp)
}
if (c->metacopy) {
- err = ovl_check_setxattr(ofs, temp, OVL_XATTR_METACOPY,
- NULL, 0, -EOPNOTSUPP);
+ struct path lowerdatapath;
+ struct ovl_metacopy metacopy_data = OVL_METACOPY_INIT;
+
+ ovl_path_lowerdata(c->dentry, &lowerdatapath);
+ if (WARN_ON_ONCE(lowerdatapath.dentry == NULL))
+ return -EIO;
+ err = ovl_get_verity_digest(ofs, &lowerdatapath, &metacopy_data);
+ if (err)
+ return err;
+
+ if (metacopy_data.digest_algo)
+ c->metacopy_digest = true;
+
+ err = ovl_set_metacopy_xattr(ofs, temp, &metacopy_data);
if (err)
return err;
}
@@ -751,9 +765,15 @@ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c)
if (err)
goto cleanup;
- if (!c->metacopy)
- ovl_set_upperdata(d_inode(c->dentry));
inode = d_inode(c->dentry);
+ if (c->metacopy_digest)
+ ovl_set_flag(OVL_HAS_DIGEST, inode);
+ else
+ ovl_clear_flag(OVL_HAS_DIGEST, inode);
+ ovl_clear_flag(OVL_VERIFIED_DIGEST, inode);
+
+ if (!c->metacopy)
+ ovl_set_upperdata(inode);
ovl_inode_update(inode, temp);
if (S_ISDIR(inode->i_mode))
ovl_set_flag(OVL_WHITEOUTS, inode);
@@ -813,6 +833,12 @@ static int ovl_copy_up_tmpfile(struct ovl_copy_up_ctx *c)
if (err)
goto out_fput;
+ if (c->metacopy_digest)
+ ovl_set_flag(OVL_HAS_DIGEST, d_inode(c->dentry));
+ else
+ ovl_clear_flag(OVL_HAS_DIGEST, d_inode(c->dentry));
+ ovl_clear_flag(OVL_VERIFIED_DIGEST, d_inode(c->dentry));
+
if (!c->metacopy)
ovl_set_upperdata(d_inode(c->dentry));
ovl_inode_update(d_inode(c->dentry), dget(temp));
@@ -907,7 +933,7 @@ out:
static bool ovl_need_meta_copy_up(struct dentry *dentry, umode_t mode,
int flags)
{
- struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
+ struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
if (!ofs->config.metacopy)
return false;
@@ -918,6 +944,19 @@ static bool ovl_need_meta_copy_up(struct dentry *dentry, umode_t mode,
if (flags && ((OPEN_FMODE(flags) & FMODE_WRITE) || (flags & O_TRUNC)))
return false;
+ /* Fall back to full copy if no fsverity on source data and we require verity */
+ if (ofs->config.verity_mode == OVL_VERITY_REQUIRE) {
+ struct path lowerdata;
+
+ ovl_path_lowerdata(dentry, &lowerdata);
+
+ if (WARN_ON_ONCE(lowerdata.dentry == NULL) ||
+ ovl_ensure_verity_loaded(&lowerdata) ||
+ !fsverity_active(d_inode(lowerdata.dentry))) {
+ return false;
+ }
+ }
+
return true;
}
@@ -984,6 +1023,8 @@ static int ovl_copy_up_meta_inode_data(struct ovl_copy_up_ctx *c)
if (err)
goto out_free;
+ ovl_clear_flag(OVL_HAS_DIGEST, d_inode(c->dentry));
+ ovl_clear_flag(OVL_VERIFIED_DIGEST, d_inode(c->dentry));
ovl_set_upperdata(d_inode(c->dentry));
out_free:
kfree(capability);
@@ -1078,7 +1119,7 @@ static int ovl_copy_up_flags(struct dentry *dentry, int flags)
* not very important to optimize this case, so do lazy lowerdata lookup
* before any copy up, so we can do it before taking ovl_inode_lock().
*/
- err = ovl_maybe_lookup_lowerdata(dentry);
+ err = ovl_verify_lowerdata(dentry);
if (err)
return err;
diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c
index 35680b6e175b..26b782c53910 100644
--- a/fs/overlayfs/export.c
+++ b/fs/overlayfs/export.c
@@ -174,28 +174,37 @@ static int ovl_connect_layer(struct dentry *dentry)
* U = upper file handle
* L = lower file handle
*
- * (*) Connecting an overlay dir from real lower dentry is not always
+ * (*) Decoding a connected overlay dir from real lower dentry is not always
* possible when there are redirects in lower layers and non-indexed merge dirs.
* To mitigate those case, we may copy up the lower dir ancestor before encode
- * a lower dir file handle.
+ * of a decodable file handle for non-upper dir.
*
* Return 0 for upper file handle, > 0 for lower file handle or < 0 on error.
*/
static int ovl_check_encode_origin(struct dentry *dentry)
{
- struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
+ struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
+ bool decodable = ofs->config.nfs_export;
+
+ /* Lower file handle for non-upper non-decodable */
+ if (!ovl_dentry_upper(dentry) && !decodable)
+ return 1;
/* Upper file handle for pure upper */
if (!ovl_dentry_lower(dentry))
return 0;
/*
- * Upper file handle for non-indexed upper.
- *
* Root is never indexed, so if there's an upper layer, encode upper for
* root.
*/
- if (ovl_dentry_upper(dentry) &&
+ if (dentry == dentry->d_sb->s_root)
+ return 0;
+
+ /*
+ * Upper decodable file handle for non-indexed upper.
+ */
+ if (ovl_dentry_upper(dentry) && decodable &&
!ovl_test_flag(OVL_INDEX, d_inode(dentry)))
return 0;
@@ -205,7 +214,7 @@ static int ovl_check_encode_origin(struct dentry *dentry)
* ovl_connect_layer() will try to make origin's layer "connected" by
* copying up a "connectable" ancestor.
*/
- if (d_is_dir(dentry) && ovl_upper_mnt(ofs))
+ if (d_is_dir(dentry) && ovl_upper_mnt(ofs) && decodable)
return ovl_connect_layer(dentry);
/* Lower file handle for indexed and non-upper dir/non-dir */
@@ -435,7 +444,7 @@ static struct dentry *ovl_lookup_real_inode(struct super_block *sb,
struct dentry *real,
const struct ovl_layer *layer)
{
- struct ovl_fs *ofs = sb->s_fs_info;
+ struct ovl_fs *ofs = OVL_FS(sb);
struct dentry *index = NULL;
struct dentry *this = NULL;
struct inode *inode;
@@ -656,7 +665,7 @@ static struct dentry *ovl_get_dentry(struct super_block *sb,
struct ovl_path *lowerpath,
struct dentry *index)
{
- struct ovl_fs *ofs = sb->s_fs_info;
+ struct ovl_fs *ofs = OVL_FS(sb);
const struct ovl_layer *layer = upper ? &ofs->layers[0] : lowerpath->layer;
struct dentry *real = upper ?: (index ?: lowerpath->dentry);
@@ -681,7 +690,7 @@ static struct dentry *ovl_get_dentry(struct super_block *sb,
static struct dentry *ovl_upper_fh_to_d(struct super_block *sb,
struct ovl_fh *fh)
{
- struct ovl_fs *ofs = sb->s_fs_info;
+ struct ovl_fs *ofs = OVL_FS(sb);
struct dentry *dentry;
struct dentry *upper;
@@ -701,7 +710,7 @@ static struct dentry *ovl_upper_fh_to_d(struct super_block *sb,
static struct dentry *ovl_lower_fh_to_d(struct super_block *sb,
struct ovl_fh *fh)
{
- struct ovl_fs *ofs = sb->s_fs_info;
+ struct ovl_fs *ofs = OVL_FS(sb);
struct ovl_path origin = { };
struct ovl_path *stack = &origin;
struct dentry *dentry = NULL;
@@ -876,3 +885,8 @@ const struct export_operations ovl_export_operations = {
.get_name = ovl_get_name,
.get_parent = ovl_get_parent,
};
+
+/* encode_fh() encodes non-decodable file handles with nfs_export=off */
+const struct export_operations ovl_export_fid_operations = {
+ .encode_fh = ovl_encode_fh,
+};
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index 21245b00722a..8be4dc050d1e 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -19,7 +19,6 @@ struct ovl_aio_req {
struct kiocb iocb;
refcount_t ref;
struct kiocb *orig_iocb;
- struct fd fd;
};
static struct kmem_cache *ovl_aio_request_cachep;
@@ -115,8 +114,8 @@ static int ovl_real_fdget_meta(const struct file *file, struct fd *real,
if (allow_meta) {
ovl_path_real(dentry, &realpath);
} else {
- /* lazy lookup of lowerdata */
- err = ovl_maybe_lookup_lowerdata(dentry);
+ /* lazy lookup and verify of lowerdata */
+ err = ovl_verify_lowerdata(dentry);
if (err)
return err;
@@ -159,8 +158,8 @@ static int ovl_open(struct inode *inode, struct file *file)
struct path realpath;
int err;
- /* lazy lookup of lowerdata */
- err = ovl_maybe_lookup_lowerdata(dentry);
+ /* lazy lookup and verify lowerdata */
+ err = ovl_verify_lowerdata(dentry);
if (err)
return err;
@@ -239,6 +238,7 @@ static loff_t ovl_llseek(struct file *file, loff_t offset, int whence)
static void ovl_file_accessed(struct file *file)
{
struct inode *inode, *upperinode;
+ struct timespec64 ctime, uctime;
if (file->f_flags & O_NOATIME)
return;
@@ -249,10 +249,12 @@ static void ovl_file_accessed(struct file *file)
if (!upperinode)
return;
+ ctime = inode_get_ctime(inode);
+ uctime = inode_get_ctime(upperinode);
if ((!timespec64_equal(&inode->i_mtime, &upperinode->i_mtime) ||
- !timespec64_equal(&inode->i_ctime, &upperinode->i_ctime))) {
+ !timespec64_equal(&ctime, &uctime))) {
inode->i_mtime = upperinode->i_mtime;
- inode->i_ctime = upperinode->i_ctime;
+ inode_set_ctime_to_ts(inode, uctime);
}
touch_atime(&file->f_path);
@@ -277,7 +279,7 @@ static rwf_t ovl_iocb_to_rwf(int ifl)
static inline void ovl_aio_put(struct ovl_aio_req *aio_req)
{
if (refcount_dec_and_test(&aio_req->ref)) {
- fdput(aio_req->fd);
+ fput(aio_req->iocb.ki_filp);
kmem_cache_free(ovl_aio_request_cachep, aio_req);
}
}
@@ -290,10 +292,7 @@ static void ovl_aio_cleanup_handler(struct ovl_aio_req *aio_req)
if (iocb->ki_flags & IOCB_WRITE) {
struct inode *inode = file_inode(orig_iocb->ki_filp);
- /* Actually acquired in ovl_write_iter() */
- __sb_writers_acquired(file_inode(iocb->ki_filp)->i_sb,
- SB_FREEZE_WRITE);
- file_end_write(iocb->ki_filp);
+ kiocb_end_write(iocb);
ovl_copyattr(inode);
}
@@ -342,10 +341,8 @@ static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter)
if (!aio_req)
goto out;
- aio_req->fd = real;
- real.flags = 0;
aio_req->orig_iocb = iocb;
- kiocb_clone(&aio_req->iocb, iocb, real.file);
+ kiocb_clone(&aio_req->iocb, iocb, get_file(real.file));
aio_req->iocb.ki_complete = ovl_aio_rw_complete;
refcount_set(&aio_req->ref, 2);
ret = vfs_iocb_iter_read(real.file, &aio_req->iocb, iter);
@@ -393,6 +390,12 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
if (!ovl_should_sync(OVL_FS(inode->i_sb)))
ifl &= ~(IOCB_DSYNC | IOCB_SYNC);
+ /*
+ * Overlayfs doesn't support deferred completions, don't copy
+ * this property in case it is set by the issuer.
+ */
+ ifl &= ~IOCB_DIO_CALLER_COMP;
+
old_cred = ovl_override_creds(file_inode(file)->i_sb);
if (is_sync_kiocb(iocb)) {
file_start_write(real.file);
@@ -409,17 +412,12 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
if (!aio_req)
goto out;
- file_start_write(real.file);
- /* Pacify lockdep, same trick as done in aio_write() */
- __sb_writers_release(file_inode(real.file)->i_sb,
- SB_FREEZE_WRITE);
- aio_req->fd = real;
- real.flags = 0;
aio_req->orig_iocb = iocb;
- kiocb_clone(&aio_req->iocb, iocb, real.file);
+ kiocb_clone(&aio_req->iocb, iocb, get_file(real.file));
aio_req->iocb.ki_flags = ifl;
aio_req->iocb.ki_complete = ovl_aio_rw_complete;
refcount_set(&aio_req->ref, 2);
+ kiocb_start_write(&aio_req->iocb);
ret = vfs_iocb_iter_write(real.file, &aio_req->iocb, iter);
ovl_aio_put(aio_req);
if (ret != -EIOCBQUEUED)
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index a63e57447be9..83ef66644c21 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -341,7 +341,7 @@ static const char *ovl_get_link(struct dentry *dentry,
bool ovl_is_private_xattr(struct super_block *sb, const char *name)
{
- struct ovl_fs *ofs = sb->s_fs_info;
+ struct ovl_fs *ofs = OVL_FS(sb);
if (ofs->config.userxattr)
return strncmp(name, OVL_XATTR_USER_PREFIX,
@@ -693,10 +693,10 @@ int ovl_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
}
#endif
-int ovl_update_time(struct inode *inode, struct timespec64 *ts, int flags)
+int ovl_update_time(struct inode *inode, int flags)
{
if (flags & S_ATIME) {
- struct ovl_fs *ofs = inode->i_sb->s_fs_info;
+ struct ovl_fs *ofs = OVL_FS(inode->i_sb);
struct path upperpath = {
.mnt = ovl_upper_mnt(ofs),
.dentry = ovl_upperdentry_dereference(OVL_I(inode)),
@@ -1291,7 +1291,7 @@ struct inode *ovl_get_trap_inode(struct super_block *sb, struct dentry *dir)
static bool ovl_hash_bylower(struct super_block *sb, struct dentry *upper,
struct dentry *lower, bool index)
{
- struct ovl_fs *ofs = sb->s_fs_info;
+ struct ovl_fs *ofs = OVL_FS(sb);
/* No, if pure upper */
if (!lower)
@@ -1311,7 +1311,7 @@ static bool ovl_hash_bylower(struct super_block *sb, struct dentry *upper,
return false;
/* No, if non-indexed upper with NFS export */
- if (sb->s_export_op && upper)
+ if (ofs->config.nfs_export && upper)
return false;
/* Otherwise, hash by lower inode for fsnotify */
diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c
index 57adf911735f..80391c687c2a 100644
--- a/fs/overlayfs/namei.c
+++ b/fs/overlayfs/namei.c
@@ -25,7 +25,7 @@ struct ovl_lookup_data {
bool stop;
bool last;
char *redirect;
- bool metacopy;
+ int metacopy;
/* Referring to last redirect xattr */
bool absolute_redirect;
};
@@ -171,8 +171,9 @@ struct dentry *ovl_decode_real_fh(struct ovl_fs *ofs, struct ovl_fh *fh,
* layer where file handle will be decoded.
* In case of uuid=off option just make sure that stored uuid is null.
*/
- if (ofs->config.uuid ? !uuid_equal(&fh->fb.uuid, &mnt->mnt_sb->s_uuid) :
- !uuid_is_null(&fh->fb.uuid))
+ if (ovl_origin_uuid(ofs) ?
+ !uuid_equal(&fh->fb.uuid, &mnt->mnt_sb->s_uuid) :
+ !uuid_is_null(&fh->fb.uuid))
return NULL;
bytes = (fh->fb.len - offsetof(struct ovl_fb, fid));
@@ -270,7 +271,7 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d,
d->stop = true;
goto put_and_out;
}
- err = ovl_check_metacopy_xattr(OVL_FS(d->sb), &path);
+ err = ovl_check_metacopy_xattr(OVL_FS(d->sb), &path, NULL);
if (err < 0)
goto out_err;
@@ -889,8 +890,58 @@ static int ovl_fix_origin(struct ovl_fs *ofs, struct dentry *dentry,
return err;
}
+static int ovl_maybe_validate_verity(struct dentry *dentry)
+{
+ struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
+ struct inode *inode = d_inode(dentry);
+ struct path datapath, metapath;
+ int err;
+
+ if (!ofs->config.verity_mode ||
+ !ovl_is_metacopy_dentry(dentry) ||
+ ovl_test_flag(OVL_VERIFIED_DIGEST, inode))
+ return 0;
+
+ if (!ovl_test_flag(OVL_HAS_DIGEST, inode)) {
+ if (ofs->config.verity_mode == OVL_VERITY_REQUIRE) {
+ pr_warn_ratelimited("metacopy file '%pd' has no digest specified\n",
+ dentry);
+ return -EIO;
+ }
+ return 0;
+ }
+
+ ovl_path_lowerdata(dentry, &datapath);
+ if (!datapath.dentry)
+ return -EIO;
+
+ ovl_path_real(dentry, &metapath);
+ if (!metapath.dentry)
+ return -EIO;
+
+ err = ovl_inode_lock_interruptible(inode);
+ if (err)
+ return err;
+
+ if (!ovl_test_flag(OVL_VERIFIED_DIGEST, inode)) {
+ const struct cred *old_cred;
+
+ old_cred = ovl_override_creds(dentry->d_sb);
+
+ err = ovl_validate_verity(ofs, &metapath, &datapath);
+ if (err == 0)
+ ovl_set_flag(OVL_VERIFIED_DIGEST, inode);
+
+ revert_creds(old_cred);
+ }
+
+ ovl_inode_unlock(inode);
+
+ return err;
+}
+
/* Lazy lookup of lowerdata */
-int ovl_maybe_lookup_lowerdata(struct dentry *dentry)
+static int ovl_maybe_lookup_lowerdata(struct dentry *dentry)
{
struct inode *inode = d_inode(dentry);
const char *redirect = ovl_lowerdata_redirect(inode);
@@ -935,12 +986,23 @@ out_err:
goto out;
}
+int ovl_verify_lowerdata(struct dentry *dentry)
+{
+ int err;
+
+ err = ovl_maybe_lookup_lowerdata(dentry);
+ if (err)
+ return err;
+
+ return ovl_maybe_validate_verity(dentry);
+}
+
struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
unsigned int flags)
{
struct ovl_entry *oe = NULL;
const struct cred *old_cred;
- struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
+ struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
struct ovl_entry *poe = OVL_E(dentry->d_parent);
struct ovl_entry *roe = OVL_E(dentry->d_sb->s_root);
struct ovl_path *stack = NULL, *origin_path = NULL;
@@ -955,6 +1017,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
unsigned int i;
int err;
bool uppermetacopy = false;
+ int metacopy_size = 0;
struct ovl_lookup_data d = {
.sb = dentry->d_sb,
.name = dentry->d_name,
@@ -963,7 +1026,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
.stop = false,
.last = ovl_redirect_follow(ofs) ? false : !ovl_numlower(poe),
.redirect = NULL,
- .metacopy = false,
+ .metacopy = 0,
};
if (dentry->d_name.len > ofs->namelen)
@@ -999,6 +1062,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
if (d.metacopy)
uppermetacopy = true;
+ metacopy_size = d.metacopy;
}
if (d.redirect) {
@@ -1076,6 +1140,9 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
origin = this;
}
+ if (!upperdentry && !d.is_dir && !ctr && d.metacopy)
+ metacopy_size = d.metacopy;
+
if (d.metacopy && ctr) {
/*
* Do not store intermediate metacopy dentries in
@@ -1120,7 +1187,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
/* Defer lookup of lowerdata in data-only layers to first access */
if (d.metacopy && ctr && ofs->numdatalayer && d.absolute_redirect) {
- d.metacopy = false;
+ d.metacopy = 0;
ctr++;
}
@@ -1211,10 +1278,11 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
upperredirect = NULL;
goto out_free_oe;
}
- err = ovl_check_metacopy_xattr(ofs, &upperpath);
+ err = ovl_check_metacopy_xattr(ofs, &upperpath, NULL);
if (err < 0)
goto out_free_oe;
uppermetacopy = err;
+ metacopy_size = err;
}
if (upperdentry || ctr) {
@@ -1236,6 +1304,9 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
goto out_free_oe;
if (upperdentry && !uppermetacopy)
ovl_set_flag(OVL_UPPERDATA, inode);
+
+ if (metacopy_size > OVL_METACOPY_MIN_SIZE)
+ ovl_set_flag(OVL_HAS_DIGEST, inode);
}
ovl_dentry_init_reval(dentry, upperdentry, OVL_I_E(inode));
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index 9402591f12aa..9817b2dcb132 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -7,6 +7,7 @@
#include <linux/kernel.h>
#include <linux/uuid.h>
#include <linux/fs.h>
+#include <linux/fsverity.h>
#include <linux/namei.h>
#include <linux/posix_acl.h>
#include <linux/posix_acl_xattr.h>
@@ -36,6 +37,7 @@ enum ovl_xattr {
OVL_XATTR_IMPURE,
OVL_XATTR_NLINK,
OVL_XATTR_UPPER,
+ OVL_XATTR_UUID,
OVL_XATTR_METACOPY,
OVL_XATTR_PROTATTR,
};
@@ -49,6 +51,8 @@ enum ovl_inode_flag {
OVL_UPPERDATA,
/* Inode number will remain constant over copy up. */
OVL_CONST_INO,
+ OVL_HAS_DIGEST,
+ OVL_VERIFIED_DIGEST,
};
enum ovl_entry_flag {
@@ -65,11 +69,24 @@ enum {
};
enum {
+ OVL_UUID_OFF,
+ OVL_UUID_NULL,
+ OVL_UUID_AUTO,
+ OVL_UUID_ON,
+};
+
+enum {
OVL_XINO_OFF,
OVL_XINO_AUTO,
OVL_XINO_ON,
};
+enum {
+ OVL_VERITY_OFF,
+ OVL_VERITY_ON,
+ OVL_VERITY_REQUIRE,
+};
+
/*
* The tuple (fh,uuid) is a universal unique identifier for a copy up origin,
* where:
@@ -126,6 +143,26 @@ struct ovl_fh {
#define OVL_FH_FID_OFFSET (OVL_FH_WIRE_OFFSET + \
offsetof(struct ovl_fb, fid))
+/* On-disk format for "metacopy" xattr (if non-zero size) */
+struct ovl_metacopy {
+ u8 version; /* 0 */
+ u8 len; /* size of this header + used digest bytes */
+ u8 flags;
+ u8 digest_algo; /* FS_VERITY_HASH_ALG_* constant, 0 for no digest */
+ u8 digest[FS_VERITY_MAX_DIGEST_SIZE]; /* Only the used part on disk */
+} __packed;
+
+#define OVL_METACOPY_MAX_SIZE (sizeof(struct ovl_metacopy))
+#define OVL_METACOPY_MIN_SIZE (OVL_METACOPY_MAX_SIZE - FS_VERITY_MAX_DIGEST_SIZE)
+#define OVL_METACOPY_INIT { 0, OVL_METACOPY_MIN_SIZE }
+
+static inline int ovl_metadata_digest_size(const struct ovl_metacopy *metacopy)
+{
+ if (metacopy->len < OVL_METACOPY_MIN_SIZE)
+ return 0;
+ return (int)metacopy->len - OVL_METACOPY_MIN_SIZE;
+}
+
extern const char *const ovl_xattr_table[][2];
static inline const char *ovl_xattr(struct ovl_fs *ofs, enum ovl_xattr ox)
{
@@ -430,6 +467,8 @@ bool ovl_already_copied_up(struct dentry *dentry, int flags);
bool ovl_path_check_dir_xattr(struct ovl_fs *ofs, const struct path *path,
enum ovl_xattr ox);
bool ovl_path_check_origin_xattr(struct ovl_fs *ofs, const struct path *path);
+bool ovl_init_uuid_xattr(struct super_block *sb, struct ovl_fs *ofs,
+ const struct path *upperpath);
static inline bool ovl_check_origin_xattr(struct ovl_fs *ofs,
struct dentry *upperdentry)
@@ -452,9 +491,20 @@ bool ovl_need_index(struct dentry *dentry);
int ovl_nlink_start(struct dentry *dentry);
void ovl_nlink_end(struct dentry *dentry);
int ovl_lock_rename_workdir(struct dentry *workdir, struct dentry *upperdir);
-int ovl_check_metacopy_xattr(struct ovl_fs *ofs, const struct path *path);
+int ovl_check_metacopy_xattr(struct ovl_fs *ofs, const struct path *path,
+ struct ovl_metacopy *data);
+int ovl_set_metacopy_xattr(struct ovl_fs *ofs, struct dentry *d,
+ struct ovl_metacopy *metacopy);
bool ovl_is_metacopy_dentry(struct dentry *dentry);
char *ovl_get_redirect_xattr(struct ovl_fs *ofs, const struct path *path, int padding);
+int ovl_ensure_verity_loaded(struct path *path);
+int ovl_get_verity_xattr(struct ovl_fs *ofs, const struct path *path,
+ u8 *digest_buf, int *buf_length);
+int ovl_validate_verity(struct ovl_fs *ofs,
+ struct path *metapath,
+ struct path *datapath);
+int ovl_get_verity_digest(struct ovl_fs *ofs, struct path *src,
+ struct ovl_metacopy *metacopy);
int ovl_sync_status(struct ovl_fs *ofs);
static inline void ovl_set_flag(unsigned long flag, struct inode *inode)
@@ -494,6 +544,17 @@ static inline bool ovl_redirect_dir(struct ovl_fs *ofs)
return ofs->config.redirect_mode == OVL_REDIRECT_ON;
}
+static inline bool ovl_origin_uuid(struct ovl_fs *ofs)
+{
+ return ofs->config.uuid != OVL_UUID_OFF;
+}
+
+static inline bool ovl_has_fsid(struct ovl_fs *ofs)
+{
+ return ofs->config.uuid == OVL_UUID_ON ||
+ ofs->config.uuid == OVL_UUID_AUTO;
+}
+
/*
* With xino=auto, we do best effort to keep all inodes on same st_dev and
* d_ino consistent with st_ino.
@@ -574,7 +635,7 @@ struct dentry *ovl_get_index_fh(struct ovl_fs *ofs, struct ovl_fh *fh);
struct dentry *ovl_lookup_index(struct ovl_fs *ofs, struct dentry *upper,
struct dentry *origin, bool verify);
int ovl_path_next(int idx, struct dentry *dentry, struct path *path);
-int ovl_maybe_lookup_lowerdata(struct dentry *dentry);
+int ovl_verify_lowerdata(struct dentry *dentry);
struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
unsigned int flags);
bool ovl_lower_positive(struct dentry *dentry);
@@ -665,7 +726,7 @@ static inline struct posix_acl *ovl_get_acl_path(const struct path *path,
}
#endif
-int ovl_update_time(struct inode *inode, struct timespec64 *ts, int flags);
+int ovl_update_time(struct inode *inode, int flags);
bool ovl_is_private_xattr(struct super_block *sb, const char *name);
struct ovl_inode_params {
@@ -759,6 +820,7 @@ int ovl_set_origin(struct ovl_fs *ofs, struct dentry *lower,
/* export.c */
extern const struct export_operations ovl_export_operations;
+extern const struct export_operations ovl_export_fid_operations;
/* super.c */
int ovl_fill_super(struct super_block *sb, struct fs_context *fc);
diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h
index 306e1ecdc96d..d82d2a043da2 100644
--- a/fs/overlayfs/ovl_entry.h
+++ b/fs/overlayfs/ovl_entry.h
@@ -8,10 +8,12 @@
struct ovl_config {
char *upperdir;
char *workdir;
+ char **lowerdirs;
bool default_permissions;
int redirect_mode;
+ int verity_mode;
bool index;
- bool uuid;
+ int uuid;
bool nfs_export;
int xino;
bool metacopy;
@@ -38,17 +40,8 @@ struct ovl_layer {
int idx;
/* One fsid per unique underlying sb (upper fsid == 0) */
int fsid;
- char *name;
};
-/*
- * ovl_free_fs() relies on @mnt being the first member when unmounting
- * the private mounts created for each layer. Let's check both the
- * offset and type.
- */
-static_assert(offsetof(struct ovl_layer, mnt) == 0);
-static_assert(__same_type(typeof_member(struct ovl_layer, mnt), struct vfsmount *));
-
struct ovl_path {
const struct ovl_layer *layer;
struct dentry *dentry;
@@ -81,6 +74,7 @@ struct ovl_fs {
const struct cred *creator_cred;
bool tmpfile;
bool noxattr;
+ bool nofh;
/* Did we take the inuse lock? */
bool upperdir_locked;
bool workdir_locked;
@@ -115,8 +109,13 @@ static inline struct mnt_idmap *ovl_upper_mnt_idmap(struct ovl_fs *ofs)
return mnt_idmap(ovl_upper_mnt(ofs));
}
+extern struct file_system_type ovl_fs_type;
+
static inline struct ovl_fs *OVL_FS(struct super_block *sb)
{
+ if (IS_ENABLED(CONFIG_OVERLAY_FS_DEBUG))
+ WARN_ON_ONCE(sb->s_type != &ovl_fs_type);
+
return (struct ovl_fs *)sb->s_fs_info;
}
diff --git a/fs/overlayfs/params.c b/fs/overlayfs/params.c
index a63160dbb0f9..f6ff23fd101c 100644
--- a/fs/overlayfs/params.c
+++ b/fs/overlayfs/params.c
@@ -55,6 +55,7 @@ enum {
Opt_userxattr,
Opt_xino,
Opt_metacopy,
+ Opt_verity,
Opt_volatile,
};
@@ -64,6 +65,24 @@ static const struct constant_table ovl_parameter_bool[] = {
{}
};
+static const struct constant_table ovl_parameter_uuid[] = {
+ { "off", OVL_UUID_OFF },
+ { "null", OVL_UUID_NULL },
+ { "auto", OVL_UUID_AUTO },
+ { "on", OVL_UUID_ON },
+ {}
+};
+
+static const char *ovl_uuid_mode(struct ovl_config *config)
+{
+ return ovl_parameter_uuid[config->uuid].name;
+}
+
+static int ovl_uuid_def(void)
+{
+ return OVL_UUID_AUTO;
+}
+
static const struct constant_table ovl_parameter_xino[] = {
{ "off", OVL_XINO_OFF },
{ "auto", OVL_XINO_AUTO },
@@ -101,6 +120,23 @@ static int ovl_redirect_mode_def(void)
OVL_REDIRECT_NOFOLLOW;
}
+static const struct constant_table ovl_parameter_verity[] = {
+ { "off", OVL_VERITY_OFF },
+ { "on", OVL_VERITY_ON },
+ { "require", OVL_VERITY_REQUIRE },
+ {}
+};
+
+static const char *ovl_verity_mode(struct ovl_config *config)
+{
+ return ovl_parameter_verity[config->verity_mode].name;
+}
+
+static int ovl_verity_mode_def(void)
+{
+ return OVL_VERITY_OFF;
+}
+
#define fsparam_string_empty(NAME, OPT) \
__fsparam(fs_param_is_string, NAME, OPT, fs_param_can_be_empty, NULL)
@@ -111,15 +147,44 @@ const struct fs_parameter_spec ovl_parameter_spec[] = {
fsparam_flag("default_permissions", Opt_default_permissions),
fsparam_enum("redirect_dir", Opt_redirect_dir, ovl_parameter_redirect_dir),
fsparam_enum("index", Opt_index, ovl_parameter_bool),
- fsparam_enum("uuid", Opt_uuid, ovl_parameter_bool),
+ fsparam_enum("uuid", Opt_uuid, ovl_parameter_uuid),
fsparam_enum("nfs_export", Opt_nfs_export, ovl_parameter_bool),
fsparam_flag("userxattr", Opt_userxattr),
fsparam_enum("xino", Opt_xino, ovl_parameter_xino),
fsparam_enum("metacopy", Opt_metacopy, ovl_parameter_bool),
+ fsparam_enum("verity", Opt_verity, ovl_parameter_verity),
fsparam_flag("volatile", Opt_volatile),
{}
};
+static char *ovl_next_opt(char **s)
+{
+ char *sbegin = *s;
+ char *p;
+
+ if (sbegin == NULL)
+ return NULL;
+
+ for (p = sbegin; *p; p++) {
+ if (*p == '\\') {
+ p++;
+ if (!*p)
+ break;
+ } else if (*p == ',') {
+ *p = '\0';
+ *s = p + 1;
+ return sbegin;
+ }
+ }
+ *s = NULL;
+ return sbegin;
+}
+
+static int ovl_parse_monolithic(struct fs_context *fc, void *data)
+{
+ return vfs_parse_monolithic_sep(fc, data, ovl_next_opt);
+}
+
static ssize_t ovl_parse_param_split_lowerdirs(char *str)
{
ssize_t nr_layers = 1, nr_colons = 0;
@@ -127,7 +192,8 @@ static ssize_t ovl_parse_param_split_lowerdirs(char *str)
for (s = d = str;; s++, d++) {
if (*s == '\\') {
- s++;
+ /* keep esc chars in split lowerdir */
+ *d++ = *s++;
} else if (*s == ':') {
bool next_colon = (*(s + 1) == ':');
@@ -202,7 +268,7 @@ static void ovl_unescape(char *s)
}
}
-static int ovl_mount_dir(const char *name, struct path *path)
+static int ovl_mount_dir(const char *name, struct path *path, bool upper)
{
int err = -ENOMEM;
char *tmp = kstrdup(name, GFP_KERNEL);
@@ -211,7 +277,7 @@ static int ovl_mount_dir(const char *name, struct path *path)
ovl_unescape(tmp);
err = ovl_mount_dir_noesc(tmp, path);
- if (!err && path->dentry->d_flags & DCACHE_OP_REAL) {
+ if (!err && upper && path->dentry->d_flags & DCACHE_OP_REAL) {
pr_err("filesystem on '%s' not supported as upperdir\n",
tmp);
path_put_init(path);
@@ -232,7 +298,7 @@ static int ovl_parse_param_upperdir(const char *name, struct fs_context *fc,
struct path path;
char *dup;
- err = ovl_mount_dir(name, &path);
+ err = ovl_mount_dir(name, &path, true);
if (err)
return err;
@@ -284,12 +350,6 @@ static void ovl_parse_param_drop_lowerdir(struct ovl_fs_context *ctx)
* Set "/lower1", "/lower2", and "/lower3" as lower layers and
* "/data1" and "/data2" as data lower layers. Any existing lower
* layers are replaced.
- * (2) lowerdir=:/lower4
- * Append "/lower4" to current stack of lower layers. This requires
- * that there already is at least one lower layer configured.
- * (3) lowerdir=::/lower5
- * Append data "/lower5" as data lower layer. This requires that
- * there's at least one regular lower layer present.
*/
static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc)
{
@@ -311,49 +371,9 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc)
return 0;
}
- if (strncmp(name, "::", 2) == 0) {
- /*
- * This is a data layer.
- * There must be at least one regular lower layer
- * specified.
- */
- if (ctx->nr == 0) {
- pr_err("data lower layers without regular lower layers not allowed");
- return -EINVAL;
- }
-
- /* Skip the leading "::". */
- name += 2;
- data_layer = true;
- /*
- * A data layer is automatically an append as there
- * must've been at least one regular lower layer.
- */
- append = true;
- } else if (*name == ':') {
- /*
- * This is a regular lower layer.
- * If users want to append a layer enforce that they
- * have already specified a first layer before. It's
- * better to be strict.
- */
- if (ctx->nr == 0) {
- pr_err("cannot append layer if no previous layer has been specified");
- return -EINVAL;
- }
-
- /*
- * Once a sequence of data layers has started regular
- * lower layers are forbidden.
- */
- if (ctx->nr_data > 0) {
- pr_err("regular lower layers cannot follow data lower layers");
- return -EINVAL;
- }
-
- /* Skip the leading ":". */
- name++;
- append = true;
+ if (*name == ':') {
+ pr_err("cannot append lower layer");
+ return -EINVAL;
}
dup = kstrdup(name, GFP_KERNEL);
@@ -435,7 +455,7 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc)
l = &ctx->lower[nr];
memset(l, 0, sizeof(*l));
- err = ovl_mount_dir_noesc(dup_iter, &l->path);
+ err = ovl_mount_dir(dup_iter, &l->path, false);
if (err)
goto out_put;
@@ -572,6 +592,9 @@ static int ovl_parse_param(struct fs_context *fc, struct fs_parameter *param)
config->metacopy = result.uint_32;
ctx->set.metacopy = true;
break;
+ case Opt_verity:
+ config->verity_mode = result.uint_32;
+ break;
case Opt_volatile:
config->ovl_volatile = true;
break;
@@ -622,7 +645,7 @@ static void ovl_free(struct fs_context *fc)
static int ovl_reconfigure(struct fs_context *fc)
{
struct super_block *sb = fc->root->d_sb;
- struct ovl_fs *ofs = sb->s_fs_info;
+ struct ovl_fs *ofs = OVL_FS(sb);
struct super_block *upper_sb;
int ret = 0;
@@ -642,6 +665,7 @@ static int ovl_reconfigure(struct fs_context *fc)
}
static const struct fs_context_operations ovl_context_ops = {
+ .parse_monolithic = ovl_parse_monolithic,
.parse_param = ovl_parse_param,
.get_tree = ovl_get_tree,
.reconfigure = ovl_reconfigure,
@@ -679,7 +703,7 @@ int ovl_init_fs_context(struct fs_context *fc)
ofs->config.redirect_mode = ovl_redirect_mode_def();
ofs->config.index = ovl_index_def;
- ofs->config.uuid = true;
+ ofs->config.uuid = ovl_uuid_def();
ofs->config.nfs_export = ovl_nfs_export_def;
ofs->config.xino = ovl_xino_def();
ofs->config.metacopy = ovl_metacopy_def;
@@ -712,12 +736,12 @@ void ovl_free_fs(struct ovl_fs *ofs)
if (ofs->upperdir_locked)
ovl_inuse_unlock(ovl_upper_mnt(ofs)->mnt_root);
- /* Hack! Reuse ofs->layers as a vfsmount array before freeing it */
- mounts = (struct vfsmount **) ofs->layers;
+ /* Reuse ofs->config.lowerdirs as a vfsmount array before freeing it */
+ mounts = (struct vfsmount **) ofs->config.lowerdirs;
for (i = 0; i < ofs->numlayer; i++) {
iput(ofs->layers[i].trap);
+ kfree(ofs->config.lowerdirs[i]);
mounts[i] = ofs->layers[i].mnt;
- kfree(ofs->layers[i].name);
}
kern_unmount_array(mounts, ofs->numlayer);
kfree(ofs->layers);
@@ -725,6 +749,7 @@ void ovl_free_fs(struct ovl_fs *ofs)
free_anon_bdev(ofs->fs[i].pseudo_dev);
kfree(ofs->fs);
+ kfree(ofs->config.lowerdirs);
kfree(ofs->config.upperdir);
kfree(ofs->config.workdir);
if (ofs->creator_cred)
@@ -762,6 +787,23 @@ int ovl_fs_params_verify(const struct ovl_fs_context *ctx,
config->ovl_volatile = false;
}
+ if (!config->upperdir && config->uuid == OVL_UUID_ON) {
+ pr_info("option \"uuid=on\" requires an upper fs, falling back to uuid=null.\n");
+ config->uuid = OVL_UUID_NULL;
+ }
+
+ /* Resolve verity -> metacopy dependency */
+ if (config->verity_mode && !config->metacopy) {
+ /* Don't allow explicit specified conflicting combinations */
+ if (set.metacopy) {
+ pr_err("conflicting options: metacopy=off,verity=%s\n",
+ ovl_verity_mode(config));
+ return -EINVAL;
+ }
+ /* Otherwise automatically enable metacopy. */
+ config->metacopy = true;
+ }
+
/*
* This is to make the logic below simpler. It doesn't make any other
* difference, since redirect_dir=on is only used for upper.
@@ -769,13 +811,18 @@ int ovl_fs_params_verify(const struct ovl_fs_context *ctx,
if (!config->upperdir && config->redirect_mode == OVL_REDIRECT_FOLLOW)
config->redirect_mode = OVL_REDIRECT_ON;
- /* Resolve metacopy -> redirect_dir dependency */
+ /* Resolve verity -> metacopy -> redirect_dir dependency */
if (config->metacopy && config->redirect_mode != OVL_REDIRECT_ON) {
if (set.metacopy && set.redirect) {
pr_err("conflicting options: metacopy=on,redirect_dir=%s\n",
ovl_redirect_mode(config));
return -EINVAL;
}
+ if (config->verity_mode && set.redirect) {
+ pr_err("conflicting options: verity=%s,redirect_dir=%s\n",
+ ovl_verity_mode(config), ovl_redirect_mode(config));
+ return -EINVAL;
+ }
if (set.redirect) {
/*
* There was an explicit redirect_dir=... that resulted
@@ -812,7 +859,7 @@ int ovl_fs_params_verify(const struct ovl_fs_context *ctx,
}
}
- /* Resolve nfs_export -> !metacopy dependency */
+ /* Resolve nfs_export -> !metacopy && !verity dependency */
if (config->nfs_export && config->metacopy) {
if (set.nfs_export && set.metacopy) {
pr_err("conflicting options: nfs_export=on,metacopy=on\n");
@@ -825,6 +872,14 @@ int ovl_fs_params_verify(const struct ovl_fs_context *ctx,
*/
pr_info("disabling nfs_export due to metacopy=on\n");
config->nfs_export = false;
+ } else if (config->verity_mode) {
+ /*
+ * There was an explicit verity=.. that resulted
+ * in this conflict.
+ */
+ pr_info("disabling nfs_export due to verity=%s\n",
+ ovl_verity_mode(config));
+ config->nfs_export = false;
} else {
/*
* There was an explicit nfs_export=on that resulted
@@ -836,7 +891,7 @@ int ovl_fs_params_verify(const struct ovl_fs_context *ctx,
}
- /* Resolve userxattr -> !redirect && !metacopy dependency */
+ /* Resolve userxattr -> !redirect && !metacopy && !verity dependency */
if (config->userxattr) {
if (set.redirect &&
config->redirect_mode != OVL_REDIRECT_NOFOLLOW) {
@@ -848,6 +903,11 @@ int ovl_fs_params_verify(const struct ovl_fs_context *ctx,
pr_err("conflicting options: userxattr,metacopy=on\n");
return -EINVAL;
}
+ if (config->verity_mode) {
+ pr_err("conflicting options: userxattr,verity=%s\n",
+ ovl_verity_mode(config));
+ return -EINVAL;
+ }
/*
* Silently disable default setting of redirect and metacopy.
* This shall be the default in the future as well: these
@@ -872,18 +932,25 @@ int ovl_fs_params_verify(const struct ovl_fs_context *ctx,
int ovl_show_options(struct seq_file *m, struct dentry *dentry)
{
struct super_block *sb = dentry->d_sb;
- struct ovl_fs *ofs = sb->s_fs_info;
+ struct ovl_fs *ofs = OVL_FS(sb);
size_t nr, nr_merged_lower = ofs->numlayer - ofs->numdatalayer;
- const struct ovl_layer *data_layers = &ofs->layers[nr_merged_lower];
-
- /* ofs->layers[0] is the upper layer */
- seq_printf(m, ",lowerdir=%s", ofs->layers[1].name);
- /* dump regular lower layers */
- for (nr = 2; nr < nr_merged_lower; nr++)
- seq_printf(m, ":%s", ofs->layers[nr].name);
- /* dump data lower layers */
- for (nr = 0; nr < ofs->numdatalayer; nr++)
- seq_printf(m, "::%s", data_layers[nr].name);
+
+ /*
+ * lowerdirs[] starts from offset 1, then
+ * >= 0 regular lower layers prefixed with : and
+ * >= 0 data-only lower layers prefixed with ::
+ *
+ * we need to escase comma and space like seq_show_option() does and
+ * we also need to escape the colon separator from lowerdir paths.
+ */
+ seq_puts(m, ",lowerdir=");
+ for (nr = 1; nr < ofs->numlayer; nr++) {
+ if (nr > 1)
+ seq_putc(m, ':');
+ if (nr >= nr_merged_lower)
+ seq_putc(m, ':');
+ seq_escape(m, ofs->config.lowerdirs[nr], ":, \t\n\\");
+ }
if (ofs->config.upperdir) {
seq_show_option(m, "upperdir", ofs->config.upperdir);
seq_show_option(m, "workdir", ofs->config.workdir);
@@ -895,8 +962,8 @@ int ovl_show_options(struct seq_file *m, struct dentry *dentry)
ovl_redirect_mode(&ofs->config));
if (ofs->config.index != ovl_index_def)
seq_printf(m, ",index=%s", ofs->config.index ? "on" : "off");
- if (!ofs->config.uuid)
- seq_puts(m, ",uuid=off");
+ if (ofs->config.uuid != ovl_uuid_def())
+ seq_printf(m, ",uuid=%s", ovl_uuid_mode(&ofs->config));
if (ofs->config.nfs_export != ovl_nfs_export_def)
seq_printf(m, ",nfs_export=%s", ofs->config.nfs_export ?
"on" : "off");
@@ -909,5 +976,8 @@ int ovl_show_options(struct seq_file *m, struct dentry *dentry)
seq_puts(m, ",volatile");
if (ofs->config.userxattr)
seq_puts(m, ",userxattr");
+ if (ofs->config.verity_mode != ovl_verity_mode_def())
+ seq_printf(m, ",verity=%s",
+ ovl_verity_mode(&ofs->config));
return 0;
}
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index cc8977498c48..3fa2416264a4 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -32,6 +32,7 @@ static struct dentry *ovl_d_real(struct dentry *dentry,
const struct inode *inode)
{
struct dentry *real = NULL, *lower;
+ int err;
/* It's an overlay file */
if (inode && d_inode(dentry) == inode)
@@ -58,7 +59,9 @@ static struct dentry *ovl_d_real(struct dentry *dentry,
* uprobes on offset within the file, so lowerdata should be available
* when setting the uprobe.
*/
- ovl_maybe_lookup_lowerdata(dentry);
+ err = ovl_verify_lowerdata(dentry);
+ if (err)
+ goto bug;
lower = ovl_dentry_lowerdata(dentry);
if (!lower)
goto bug;
@@ -101,8 +104,8 @@ static int ovl_revalidate_real(struct dentry *d, unsigned int flags, bool weak)
static int ovl_dentry_revalidate_common(struct dentry *dentry,
unsigned int flags, bool weak)
{
- struct ovl_entry *oe = OVL_E(dentry);
- struct ovl_path *lowerstack = ovl_lowerstack(oe);
+ struct ovl_entry *oe;
+ struct ovl_path *lowerstack;
struct inode *inode = d_inode_rcu(dentry);
struct dentry *upper;
unsigned int i;
@@ -112,6 +115,8 @@ static int ovl_dentry_revalidate_common(struct dentry *dentry,
if (!inode)
return -ECHILD;
+ oe = OVL_I_E(inode);
+ lowerstack = ovl_lowerstack(oe);
upper = ovl_i_dentry_upper(inode);
if (upper)
ret = ovl_revalidate_real(upper, flags, weak);
@@ -164,6 +169,7 @@ static void ovl_free_inode(struct inode *inode)
struct ovl_inode *oi = OVL_I(inode);
kfree(oi->redirect);
+ kfree(oi->oe);
mutex_destroy(&oi->lock);
kmem_cache_free(ovl_inode_cachep, oi);
}
@@ -173,7 +179,7 @@ static void ovl_destroy_inode(struct inode *inode)
struct ovl_inode *oi = OVL_I(inode);
dput(oi->__upperdentry);
- ovl_free_entry(oi->oe);
+ ovl_stack_put(ovl_lowerstack(oi->oe), ovl_numlower(oi->oe));
if (S_ISDIR(inode->i_mode))
ovl_dir_cache_free(inode);
else
@@ -182,7 +188,7 @@ static void ovl_destroy_inode(struct inode *inode)
static void ovl_put_super(struct super_block *sb)
{
- struct ovl_fs *ofs = sb->s_fs_info;
+ struct ovl_fs *ofs = OVL_FS(sb);
if (ofs)
ovl_free_fs(ofs);
@@ -191,7 +197,7 @@ static void ovl_put_super(struct super_block *sb)
/* Sync real dirty inodes in upper filesystem (if it exists) */
static int ovl_sync_fs(struct super_block *sb, int wait)
{
- struct ovl_fs *ofs = sb->s_fs_info;
+ struct ovl_fs *ofs = OVL_FS(sb);
struct super_block *upper_sb;
int ret;
@@ -239,8 +245,9 @@ static int ovl_sync_fs(struct super_block *sb, int wait)
*/
static int ovl_statfs(struct dentry *dentry, struct kstatfs *buf)
{
- struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
- struct dentry *root_dentry = dentry->d_sb->s_root;
+ struct super_block *sb = dentry->d_sb;
+ struct ovl_fs *ofs = OVL_FS(sb);
+ struct dentry *root_dentry = sb->s_root;
struct path path;
int err;
@@ -250,6 +257,8 @@ static int ovl_statfs(struct dentry *dentry, struct kstatfs *buf)
if (!err) {
buf->f_namelen = ofs->namelen;
buf->f_type = OVERLAYFS_SUPER_MAGIC;
+ if (ovl_has_fsid(ofs))
+ buf->f_fsid = uuid_to_fsid(sb->s_uuid.b);
}
return err;
@@ -397,6 +406,7 @@ static int ovl_lower_dir(const char *name, struct path *path,
pr_warn("fs on '%s' does not support file handles, falling back to index=off,nfs_export=off.\n",
name);
}
+ ofs->nofh |= !fh_type;
/*
* Decoding origin file handle is required for persistent st_ino.
* Without persistent st_ino, xino=auto falls back to xino=off.
@@ -562,11 +572,6 @@ static int ovl_get_upper(struct super_block *sb, struct ovl_fs *ofs,
upper_layer->idx = 0;
upper_layer->fsid = 0;
- err = -ENOMEM;
- upper_layer->name = kstrdup(ofs->config.upperdir, GFP_KERNEL);
- if (!upper_layer->name)
- goto out;
-
/*
* Inherit SB_NOSEC flag from upperdir.
*
@@ -770,6 +775,10 @@ static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs,
ofs->config.index = false;
pr_warn("...falling back to index=off.\n");
}
+ if (ovl_has_fsid(ofs)) {
+ ofs->config.uuid = OVL_UUID_NULL;
+ pr_warn("...falling back to uuid=null.\n");
+ }
/*
* xattr support is required for persistent st_ino.
* Without persistent st_ino, xino=auto falls back to xino=off.
@@ -815,6 +824,7 @@ static int ovl_make_workdir(struct super_block *sb, struct ovl_fs *ofs,
ofs->config.index = false;
pr_warn("upper fs does not support file handles, falling back to index=off.\n");
}
+ ofs->nofh |= !fh_type;
/* Check if upper fs has 32bit inode numbers */
if (fh_type != FILEID_INO32_GEN)
@@ -1110,7 +1120,8 @@ static int ovl_get_layers(struct super_block *sb, struct ovl_fs *ofs,
layers[ofs->numlayer].idx = ofs->numlayer;
layers[ofs->numlayer].fsid = fsid;
layers[ofs->numlayer].fs = &ofs->fs[fsid];
- layers[ofs->numlayer].name = l->name;
+ /* Store for printing lowerdir=... in ovl_show_options() */
+ ofs->config.lowerdirs[ofs->numlayer] = l->name;
l->name = NULL;
ofs->numlayer++;
ofs->fs[fsid].is_lower = true;
@@ -1355,8 +1366,16 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc)
if (!layers)
goto out_err;
+ ofs->config.lowerdirs = kcalloc(ctx->nr + 1, sizeof(char *), GFP_KERNEL);
+ if (!ofs->config.lowerdirs) {
+ kfree(layers);
+ goto out_err;
+ }
ofs->layers = layers;
- /* Layer 0 is reserved for upper even if there's no upper */
+ /*
+ * Layer 0 is reserved for upper even if there's no upper.
+ * For consistency, config.lowerdirs[0] is NULL.
+ */
ofs->numlayer = 1;
sb->s_stack_depth = 0;
@@ -1416,9 +1435,12 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc)
if (!ovl_upper_mnt(ofs))
sb->s_flags |= SB_RDONLY;
- if (!ofs->config.uuid && ofs->numfs > 1) {
- pr_warn("The uuid=off requires a single fs for lower and upper, falling back to uuid=on.\n");
- ofs->config.uuid = true;
+ if (!ovl_origin_uuid(ofs) && ofs->numfs > 1) {
+ pr_warn("The uuid=off requires a single fs for lower and upper, falling back to uuid=null.\n");
+ ofs->config.uuid = OVL_UUID_NULL;
+ } else if (ovl_has_fsid(ofs) && ovl_upper_mnt(ofs)) {
+ /* Use per instance persistent uuid/fsid */
+ ovl_init_uuid_xattr(sb, ofs, &ctx->upper);
}
if (!ovl_force_readonly(ofs) && ofs->config.index) {
@@ -1449,8 +1471,15 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc)
ofs->config.nfs_export = false;
}
+ /*
+ * Support encoding decodable file handles with nfs_export=on
+ * and encoding non-decodable file handles with nfs_export=off
+ * if all layers support file handles.
+ */
if (ofs->config.nfs_export)
sb->s_export_op = &ovl_export_operations;
+ else if (!ofs->nofh)
+ sb->s_export_op = &ovl_export_fid_operations;
/* Never override disk quota limits or use reserved space */
cap_lower(cred->cap_effective, CAP_SYS_RESOURCE);
@@ -1479,7 +1508,7 @@ out_err:
return err;
}
-static struct file_system_type ovl_fs_type = {
+struct file_system_type ovl_fs_type = {
.owner = THIS_MODULE,
.name = "overlay",
.init_fs_context = ovl_init_fs_context,
diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
index 7ef9e13c404a..89e0d60d35b6 100644
--- a/fs/overlayfs/util.c
+++ b/fs/overlayfs/util.c
@@ -10,6 +10,7 @@
#include <linux/cred.h>
#include <linux/xattr.h>
#include <linux/exportfs.h>
+#include <linux/file.h>
#include <linux/fileattr.h>
#include <linux/uuid.h>
#include <linux/namei.h>
@@ -18,25 +19,25 @@
int ovl_want_write(struct dentry *dentry)
{
- struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
+ struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
return mnt_want_write(ovl_upper_mnt(ofs));
}
void ovl_drop_write(struct dentry *dentry)
{
- struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
+ struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
mnt_drop_write(ovl_upper_mnt(ofs));
}
struct dentry *ovl_workdir(struct dentry *dentry)
{
- struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
+ struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
return ofs->workdir;
}
const struct cred *ovl_override_creds(struct super_block *sb)
{
- struct ovl_fs *ofs = sb->s_fs_info;
+ struct ovl_fs *ofs = OVL_FS(sb);
return override_creds(ofs->creator_cred);
}
@@ -62,7 +63,7 @@ int ovl_can_decode_fh(struct super_block *sb)
struct dentry *ovl_indexdir(struct super_block *sb)
{
- struct ovl_fs *ofs = sb->s_fs_info;
+ struct ovl_fs *ofs = OVL_FS(sb);
return ofs->indexdir;
}
@@ -70,7 +71,7 @@ struct dentry *ovl_indexdir(struct super_block *sb)
/* Index all files on copy up. For now only enabled for NFS export */
bool ovl_index_all(struct super_block *sb)
{
- struct ovl_fs *ofs = sb->s_fs_info;
+ struct ovl_fs *ofs = OVL_FS(sb);
return ofs->config.nfs_export && ofs->config.index;
}
@@ -78,7 +79,7 @@ bool ovl_index_all(struct super_block *sb)
/* Verify lower origin on lookup. For now only enabled for NFS export */
bool ovl_verify_lower(struct super_block *sb)
{
- struct ovl_fs *ofs = sb->s_fs_info;
+ struct ovl_fs *ofs = OVL_FS(sb);
return ofs->config.nfs_export && ofs->config.index;
}
@@ -203,7 +204,7 @@ enum ovl_path_type ovl_path_type(struct dentry *dentry)
void ovl_path_upper(struct dentry *dentry, struct path *path)
{
- struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
+ struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
path->mnt = ovl_upper_mnt(ofs);
path->dentry = ovl_dentry_upper(dentry);
@@ -675,6 +676,65 @@ bool ovl_path_check_origin_xattr(struct ovl_fs *ofs, const struct path *path)
return false;
}
+/*
+ * Load persistent uuid from xattr into s_uuid if found, or store a new
+ * random generated value in s_uuid and in xattr.
+ */
+bool ovl_init_uuid_xattr(struct super_block *sb, struct ovl_fs *ofs,
+ const struct path *upperpath)
+{
+ bool set = false;
+ int res;
+
+ /* Try to load existing persistent uuid */
+ res = ovl_path_getxattr(ofs, upperpath, OVL_XATTR_UUID, sb->s_uuid.b,
+ UUID_SIZE);
+ if (res == UUID_SIZE)
+ return true;
+
+ if (res != -ENODATA)
+ goto fail;
+
+ /*
+ * With uuid=auto, if uuid xattr is found, it will be used.
+ * If uuid xattrs is not found, generate a persistent uuid only on mount
+ * of new overlays where upper root dir is not yet marked as impure.
+ * An upper dir is marked as impure on copy up or lookup of its subdirs.
+ */
+ if (ofs->config.uuid == OVL_UUID_AUTO) {
+ res = ovl_path_getxattr(ofs, upperpath, OVL_XATTR_IMPURE, NULL,
+ 0);
+ if (res > 0) {
+ /* Any mount of old overlay - downgrade to uuid=null */
+ ofs->config.uuid = OVL_UUID_NULL;
+ return true;
+ } else if (res == -ENODATA) {
+ /* First mount of new overlay - upgrade to uuid=on */
+ ofs->config.uuid = OVL_UUID_ON;
+ } else if (res < 0) {
+ goto fail;
+ }
+
+ }
+
+ /* Generate overlay instance uuid */
+ uuid_gen(&sb->s_uuid);
+
+ /* Try to store persistent uuid */
+ set = true;
+ res = ovl_setxattr(ofs, upperpath->dentry, OVL_XATTR_UUID, sb->s_uuid.b,
+ UUID_SIZE);
+ if (res == 0)
+ return true;
+
+fail:
+ memset(sb->s_uuid.b, 0, UUID_SIZE);
+ ofs->config.uuid = OVL_UUID_NULL;
+ pr_warn("failed to %s uuid (%pd2, err=%i); falling back to uuid=null.\n",
+ set ? "set" : "get", upperpath->dentry, res);
+ return false;
+}
+
bool ovl_path_check_dir_xattr(struct ovl_fs *ofs, const struct path *path,
enum ovl_xattr ox)
{
@@ -697,6 +757,7 @@ bool ovl_path_check_dir_xattr(struct ovl_fs *ofs, const struct path *path,
#define OVL_XATTR_IMPURE_POSTFIX "impure"
#define OVL_XATTR_NLINK_POSTFIX "nlink"
#define OVL_XATTR_UPPER_POSTFIX "upper"
+#define OVL_XATTR_UUID_POSTFIX "uuid"
#define OVL_XATTR_METACOPY_POSTFIX "metacopy"
#define OVL_XATTR_PROTATTR_POSTFIX "protattr"
@@ -711,6 +772,7 @@ const char *const ovl_xattr_table[][2] = {
OVL_XATTR_TAB_ENTRY(OVL_XATTR_IMPURE),
OVL_XATTR_TAB_ENTRY(OVL_XATTR_NLINK),
OVL_XATTR_TAB_ENTRY(OVL_XATTR_UPPER),
+ OVL_XATTR_TAB_ENTRY(OVL_XATTR_UUID),
OVL_XATTR_TAB_ENTRY(OVL_XATTR_METACOPY),
OVL_XATTR_TAB_ENTRY(OVL_XATTR_PROTATTR),
};
@@ -1054,8 +1116,12 @@ err:
return -EIO;
}
-/* err < 0, 0 if no metacopy xattr, 1 if metacopy xattr found */
-int ovl_check_metacopy_xattr(struct ovl_fs *ofs, const struct path *path)
+/*
+ * err < 0, 0 if no metacopy xattr, metacopy data size if xattr found.
+ * an empty xattr returns OVL_METACOPY_MIN_SIZE to distinguish from no xattr value.
+ */
+int ovl_check_metacopy_xattr(struct ovl_fs *ofs, const struct path *path,
+ struct ovl_metacopy *data)
{
int res;
@@ -1063,7 +1129,8 @@ int ovl_check_metacopy_xattr(struct ovl_fs *ofs, const struct path *path)
if (!S_ISREG(d_inode(path->dentry)->i_mode))
return 0;
- res = ovl_path_getxattr(ofs, path, OVL_XATTR_METACOPY, NULL, 0);
+ res = ovl_path_getxattr(ofs, path, OVL_XATTR_METACOPY,
+ data, data ? OVL_METACOPY_MAX_SIZE : 0);
if (res < 0) {
if (res == -ENODATA || res == -EOPNOTSUPP)
return 0;
@@ -1077,12 +1144,48 @@ int ovl_check_metacopy_xattr(struct ovl_fs *ofs, const struct path *path)
goto out;
}
- return 1;
+ if (res == 0) {
+ /* Emulate empty data for zero size metacopy xattr */
+ res = OVL_METACOPY_MIN_SIZE;
+ if (data) {
+ memset(data, 0, res);
+ data->len = res;
+ }
+ } else if (res < OVL_METACOPY_MIN_SIZE) {
+ pr_warn_ratelimited("metacopy file '%pd' has too small xattr\n",
+ path->dentry);
+ return -EIO;
+ } else if (data) {
+ if (data->version != 0) {
+ pr_warn_ratelimited("metacopy file '%pd' has unsupported version\n",
+ path->dentry);
+ return -EIO;
+ }
+ if (res != data->len) {
+ pr_warn_ratelimited("metacopy file '%pd' has invalid xattr size\n",
+ path->dentry);
+ return -EIO;
+ }
+ }
+
+ return res;
out:
pr_warn_ratelimited("failed to get metacopy (%i)\n", res);
return res;
}
+int ovl_set_metacopy_xattr(struct ovl_fs *ofs, struct dentry *d, struct ovl_metacopy *metacopy)
+{
+ size_t len = metacopy->len;
+
+ /* If no flags or digest fall back to empty metacopy file */
+ if (metacopy->version == 0 && metacopy->flags == 0 && metacopy->digest_algo == 0)
+ len = 0;
+
+ return ovl_check_setxattr(ofs, d, OVL_XATTR_METACOPY,
+ metacopy, len, -EOPNOTSUPP);
+}
+
bool ovl_is_metacopy_dentry(struct dentry *dentry)
{
struct ovl_entry *oe = OVL_E(dentry);
@@ -1145,6 +1248,112 @@ err_free:
return ERR_PTR(res);
}
+/* Call with mounter creds as it may open the file */
+int ovl_ensure_verity_loaded(struct path *datapath)
+{
+ struct inode *inode = d_inode(datapath->dentry);
+ struct file *filp;
+
+ if (!fsverity_active(inode) && IS_VERITY(inode)) {
+ /*
+ * If this inode was not yet opened, the verity info hasn't been
+ * loaded yet, so we need to do that here to force it into memory.
+ */
+ filp = kernel_file_open(datapath, O_RDONLY, inode, current_cred());
+ if (IS_ERR(filp))
+ return PTR_ERR(filp);
+ fput(filp);
+ }
+
+ return 0;
+}
+
+int ovl_validate_verity(struct ovl_fs *ofs,
+ struct path *metapath,
+ struct path *datapath)
+{
+ struct ovl_metacopy metacopy_data;
+ u8 actual_digest[FS_VERITY_MAX_DIGEST_SIZE];
+ int xattr_digest_size, digest_size;
+ int xattr_size, err;
+ u8 verity_algo;
+
+ if (!ofs->config.verity_mode ||
+ /* Verity only works on regular files */
+ !S_ISREG(d_inode(metapath->dentry)->i_mode))
+ return 0;
+
+ xattr_size = ovl_check_metacopy_xattr(ofs, metapath, &metacopy_data);
+ if (xattr_size < 0)
+ return xattr_size;
+
+ if (!xattr_size || !metacopy_data.digest_algo) {
+ if (ofs->config.verity_mode == OVL_VERITY_REQUIRE) {
+ pr_warn_ratelimited("metacopy file '%pd' has no digest specified\n",
+ metapath->dentry);
+ return -EIO;
+ }
+ return 0;
+ }
+
+ xattr_digest_size = ovl_metadata_digest_size(&metacopy_data);
+
+ err = ovl_ensure_verity_loaded(datapath);
+ if (err < 0) {
+ pr_warn_ratelimited("lower file '%pd' failed to load fs-verity info\n",
+ datapath->dentry);
+ return -EIO;
+ }
+
+ digest_size = fsverity_get_digest(d_inode(datapath->dentry), actual_digest,
+ &verity_algo, NULL);
+ if (digest_size == 0) {
+ pr_warn_ratelimited("lower file '%pd' has no fs-verity digest\n", datapath->dentry);
+ return -EIO;
+ }
+
+ if (xattr_digest_size != digest_size ||
+ metacopy_data.digest_algo != verity_algo ||
+ memcmp(metacopy_data.digest, actual_digest, xattr_digest_size) != 0) {
+ pr_warn_ratelimited("lower file '%pd' has the wrong fs-verity digest\n",
+ datapath->dentry);
+ return -EIO;
+ }
+
+ return 0;
+}
+
+int ovl_get_verity_digest(struct ovl_fs *ofs, struct path *src,
+ struct ovl_metacopy *metacopy)
+{
+ int err, digest_size;
+
+ if (!ofs->config.verity_mode || !S_ISREG(d_inode(src->dentry)->i_mode))
+ return 0;
+
+ err = ovl_ensure_verity_loaded(src);
+ if (err < 0) {
+ pr_warn_ratelimited("lower file '%pd' failed to load fs-verity info\n",
+ src->dentry);
+ return -EIO;
+ }
+
+ digest_size = fsverity_get_digest(d_inode(src->dentry),
+ metacopy->digest, &metacopy->digest_algo, NULL);
+ if (digest_size == 0 ||
+ WARN_ON_ONCE(digest_size > FS_VERITY_MAX_DIGEST_SIZE)) {
+ if (ofs->config.verity_mode == OVL_VERITY_REQUIRE) {
+ pr_warn_ratelimited("lower file '%pd' has no fs-verity digest\n",
+ src->dentry);
+ return -EIO;
+ }
+ return 0;
+ }
+
+ metacopy->len += digest_size;
+ return 0;
+}
+
/*
* ovl_sync_status() - Check fs sync status for volatile mounts
*
@@ -1202,6 +1411,6 @@ void ovl_copyattr(struct inode *inode)
inode->i_mode = realinode->i_mode;
inode->i_atime = realinode->i_atime;
inode->i_mtime = realinode->i_mtime;
- inode->i_ctime = realinode->i_ctime;
+ inode_set_ctime_to_ts(inode, inode_get_ctime(realinode));
i_size_write(inode, i_size_read(realinode));
}
diff --git a/fs/pipe.c b/fs/pipe.c
index 2d88f73f585a..139190165a1c 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -489,7 +489,7 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
head = pipe->head;
if (!pipe_full(head, pipe->tail, pipe->max_usage)) {
unsigned int mask = pipe->ring_size - 1;
- struct pipe_buffer *buf = &pipe->bufs[head & mask];
+ struct pipe_buffer *buf;
struct page *page = pipe->tmp_page;
int copied;
@@ -537,7 +537,6 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
break;
}
ret += copied;
- buf->offset = 0;
buf->len = copied;
if (!iov_iter_count(from))
@@ -899,7 +898,7 @@ static struct inode * get_pipe_inode(void)
inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR;
inode->i_uid = current_fsuid();
inode->i_gid = current_fsgid();
- inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
return inode;
@@ -1236,7 +1235,7 @@ const struct file_operations pipefifo_fops = {
* Currently we rely on the pipe array holding a power-of-2 number
* of pages. Returns 0 on error.
*/
-unsigned int round_pipe_size(unsigned long size)
+unsigned int round_pipe_size(unsigned int size)
{
if (size > (1U << 31))
return 0;
@@ -1319,7 +1318,7 @@ int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots)
* Allocate a new array of pipe buffers and copy the info over. Returns the
* pipe size if successful, or return -ERROR on error.
*/
-static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
+static long pipe_set_size(struct pipe_inode_info *pipe, unsigned int arg)
{
unsigned long user_bufs;
unsigned int nr_slots, size;
@@ -1387,7 +1386,7 @@ struct pipe_inode_info *get_pipe_info(struct file *file, bool for_splice)
return pipe;
}
-long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
+long pipe_fcntl(struct file *file, unsigned int cmd, unsigned int arg)
{
struct pipe_inode_info *pipe;
long ret;
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 7fa1b738bbab..a05fe94970ce 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -1027,7 +1027,7 @@ int simple_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
return error;
}
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
if (IS_I_VERSION(inode))
inode_inc_iversion(inode);
set_cached_acl(inode, type, acl);
diff --git a/fs/proc/array.c b/fs/proc/array.c
index d35bbf35a874..2c2efbe685d8 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -431,6 +431,11 @@ static inline void task_untag_mask(struct seq_file *m, struct mm_struct *mm)
seq_printf(m, "untag_mask:\t%#lx\n", mm_untag_mask(mm));
}
+__weak void arch_proc_pid_thread_features(struct seq_file *m,
+ struct task_struct *task)
+{
+}
+
int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *task)
{
@@ -455,6 +460,7 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
task_cpus_allowed(m, task);
cpuset_task_status_allowed(m, task);
task_context_switch_counts(m, task);
+ arch_proc_pid_thread_features(m, task);
return 0;
}
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 9df3f4839662..ffd54617c354 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1902,7 +1902,7 @@ struct inode *proc_pid_make_inode(struct super_block *sb,
ei = PROC_I(inode);
inode->i_mode = mode;
inode->i_ino = get_next_ino();
- inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
inode->i_op = &proc_def_inode_operations;
/*
@@ -1966,7 +1966,7 @@ int pid_getattr(struct mnt_idmap *idmap, const struct path *path,
struct proc_fs_info *fs_info = proc_sb_info(inode->i_sb);
struct task_struct *task;
- generic_fillattr(&nop_mnt_idmap, inode, stat);
+ generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
stat->uid = GLOBAL_ROOT_UID;
stat->gid = GLOBAL_ROOT_GID;
@@ -3207,6 +3207,7 @@ static int proc_pid_ksm_stat(struct seq_file *m, struct pid_namespace *ns,
mm = get_task_mm(task);
if (mm) {
seq_printf(m, "ksm_rmap_items %lu\n", mm->ksm_rmap_items);
+ seq_printf(m, "ksm_zero_pages %lu\n", mm->ksm_zero_pages);
seq_printf(m, "ksm_merging_pages %lu\n", mm->ksm_merging_pages);
seq_printf(m, "ksm_process_profit %ld\n", ksm_process_profit(mm));
mmput(mm);
@@ -3583,7 +3584,8 @@ static int proc_tid_comm_permission(struct mnt_idmap *idmap,
}
static const struct inode_operations proc_tid_comm_inode_operations = {
- .permission = proc_tid_comm_permission,
+ .setattr = proc_setattr,
+ .permission = proc_tid_comm_permission,
};
/*
@@ -3813,11 +3815,10 @@ static struct task_struct *first_tid(struct pid *pid, int tid, loff_t f_pos,
/* If we haven't found our starting place yet start
* with the leader and walk nr threads forward.
*/
- pos = task = task->group_leader;
- do {
+ for_each_thread(task, pos) {
if (!nr--)
goto found;
- } while_each_thread(task, pos);
+ };
fail:
pos = NULL;
goto out;
@@ -3899,7 +3900,7 @@ static int proc_task_getattr(struct mnt_idmap *idmap,
{
struct inode *inode = d_inode(path->dentry);
struct task_struct *p = get_proc_task(inode);
- generic_fillattr(&nop_mnt_idmap, inode, stat);
+ generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
if (p) {
stat->nlink += get_nr_threads(p);
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index b3140deebbbf..6276b3938842 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -352,7 +352,7 @@ static int proc_fd_getattr(struct mnt_idmap *idmap,
struct inode *inode = d_inode(path->dentry);
int rv = 0;
- generic_fillattr(&nop_mnt_idmap, inode, stat);
+ generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
/* If it's a directory, put the number of open fds there */
if (S_ISDIR(inode->i_mode)) {
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 42ae38ff6e7e..775ce0bcf08c 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -146,7 +146,7 @@ static int proc_getattr(struct mnt_idmap *idmap,
}
}
- generic_fillattr(&nop_mnt_idmap, inode, stat);
+ generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
return 0;
}
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 67b09a1d9433..532dc9d240f7 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -660,7 +660,7 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de)
inode->i_private = de->data;
inode->i_ino = de->low_ino;
- inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
PROC_I(inode)->pde = de;
if (is_empty_pde(de)) {
make_empty_dir_inode(inode);
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 9dda7e54b2d0..9a8f32f21ff5 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -289,9 +289,7 @@ struct proc_maps_private {
struct inode *inode;
struct task_struct *task;
struct mm_struct *mm;
-#ifdef CONFIG_MMU
struct vma_iterator iter;
-#endif
#ifdef CONFIG_NUMA
struct mempolicy *task_mempolicy;
#endif
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 8dca4d6d96c7..45af9a989d40 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -17,6 +17,7 @@
#ifdef CONFIG_CMA
#include <linux/cma.h>
#endif
+#include <linux/zswap.h>
#include <asm/page.h>
#include "internal.h"
@@ -132,17 +133,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
show_val_kb(m, "VmallocChunk: ", 0ul);
show_val_kb(m, "Percpu: ", pcpu_nr_pages());
-#ifdef CONFIG_MEMTEST
- if (early_memtest_done) {
- unsigned long early_memtest_bad_size_kb;
-
- early_memtest_bad_size_kb = early_memtest_bad_size>>10;
- if (early_memtest_bad_size && !early_memtest_bad_size_kb)
- early_memtest_bad_size_kb = 1;
- /* When 0 is reported, it means there actually was a successful test */
- seq_printf(m, "EarlyMemtestBad: %5lu kB\n", early_memtest_bad_size_kb);
- }
-#endif
+ memtest_report_meminfo(m);
#ifdef CONFIG_MEMORY_FAILURE
seq_printf(m, "HardwareCorrupted: %5lu kB\n",
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index a0c0419872e3..2ba31b6d68c0 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -308,7 +308,7 @@ static int proc_tgid_net_getattr(struct mnt_idmap *idmap,
net = get_proc_task_net(inode);
- generic_fillattr(&nop_mnt_idmap, inode, stat);
+ generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
if (net != NULL) {
stat->nlink = net->proc_net->nlink;
@@ -321,6 +321,7 @@ static int proc_tgid_net_getattr(struct mnt_idmap *idmap,
const struct inode_operations proc_net_inode_operations = {
.lookup = proc_tgid_net_lookup,
.getattr = proc_tgid_net_getattr,
+ .setattr = proc_setattr,
};
static int proc_tgid_net_readdir(struct file *file, struct dir_context *ctx)
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 5ea42653126e..c88854df0b62 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -19,8 +19,9 @@
#include <linux/kmemleak.h>
#include "internal.h"
-#define list_for_each_table_entry(entry, table) \
- for ((entry) = (table); (entry)->procname; (entry)++)
+#define list_for_each_table_entry(entry, header) \
+ entry = header->ctl_table; \
+ for (size_t i = 0 ; i < header->ctl_table_size && entry->procname; ++i, entry++)
static const struct dentry_operations proc_sys_dentry_operations;
static const struct file_operations proc_sys_file_operations;
@@ -43,7 +44,7 @@ static struct ctl_table sysctl_mount_point[] = {
*/
struct ctl_table_header *register_sysctl_mount_point(const char *path)
{
- return register_sysctl(path, sysctl_mount_point);
+ return register_sysctl_sz(path, sysctl_mount_point, 0);
}
EXPORT_SYMBOL(register_sysctl_mount_point);
@@ -188,9 +189,10 @@ static void erase_entry(struct ctl_table_header *head, struct ctl_table *entry)
static void init_header(struct ctl_table_header *head,
struct ctl_table_root *root, struct ctl_table_set *set,
- struct ctl_node *node, struct ctl_table *table)
+ struct ctl_node *node, struct ctl_table *table, size_t table_size)
{
head->ctl_table = table;
+ head->ctl_table_size = table_size;
head->ctl_table_arg = table;
head->used = 0;
head->count = 1;
@@ -204,7 +206,7 @@ static void init_header(struct ctl_table_header *head,
if (node) {
struct ctl_table *entry;
- list_for_each_table_entry(entry, table) {
+ list_for_each_table_entry(entry, head) {
node->header = head;
node++;
}
@@ -215,7 +217,7 @@ static void erase_header(struct ctl_table_header *head)
{
struct ctl_table *entry;
- list_for_each_table_entry(entry, head->ctl_table)
+ list_for_each_table_entry(entry, head)
erase_entry(head, entry);
}
@@ -242,7 +244,7 @@ static int insert_header(struct ctl_dir *dir, struct ctl_table_header *header)
err = insert_links(header);
if (err)
goto fail_links;
- list_for_each_table_entry(entry, header->ctl_table) {
+ list_for_each_table_entry(entry, header) {
err = insert_entry(header, entry);
if (err)
goto fail;
@@ -463,7 +465,7 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
head->count++;
spin_unlock(&sysctl_lock);
- inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
inode->i_mode = table->mode;
if (!S_ISDIR(table->mode)) {
inode->i_mode |= S_IFREG;
@@ -849,7 +851,7 @@ static int proc_sys_getattr(struct mnt_idmap *idmap,
if (IS_ERR(head))
return PTR_ERR(head);
- generic_fillattr(&nop_mnt_idmap, inode, stat);
+ generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
if (table)
stat->mode = (stat->mode & S_IFMT) | table->mode;
@@ -973,7 +975,7 @@ static struct ctl_dir *new_dir(struct ctl_table_set *set,
memcpy(new_name, name, namelen);
table[0].procname = new_name;
table[0].mode = S_IFDIR|S_IRUGO|S_IXUGO;
- init_header(&new->header, set->dir.header.root, set, node, table);
+ init_header(&new->header, set->dir.header.root, set, node, table, 1);
return new;
}
@@ -1125,11 +1127,11 @@ static int sysctl_check_table_array(const char *path, struct ctl_table *table)
return err;
}
-static int sysctl_check_table(const char *path, struct ctl_table *table)
+static int sysctl_check_table(const char *path, struct ctl_table_header *header)
{
struct ctl_table *entry;
int err = 0;
- list_for_each_table_entry(entry, table) {
+ list_for_each_table_entry(entry, header) {
if ((entry->proc_handler == proc_dostring) ||
(entry->proc_handler == proc_dobool) ||
(entry->proc_handler == proc_dointvec) ||
@@ -1159,8 +1161,7 @@ static int sysctl_check_table(const char *path, struct ctl_table *table)
return err;
}
-static struct ctl_table_header *new_links(struct ctl_dir *dir, struct ctl_table *table,
- struct ctl_table_root *link_root)
+static struct ctl_table_header *new_links(struct ctl_dir *dir, struct ctl_table_header *head)
{
struct ctl_table *link_table, *entry, *link;
struct ctl_table_header *links;
@@ -1170,7 +1171,7 @@ static struct ctl_table_header *new_links(struct ctl_dir *dir, struct ctl_table
name_bytes = 0;
nr_entries = 0;
- list_for_each_table_entry(entry, table) {
+ list_for_each_table_entry(entry, head) {
nr_entries++;
name_bytes += strlen(entry->procname) + 1;
}
@@ -1189,31 +1190,33 @@ static struct ctl_table_header *new_links(struct ctl_dir *dir, struct ctl_table
link_name = (char *)&link_table[nr_entries + 1];
link = link_table;
- list_for_each_table_entry(entry, table) {
+ list_for_each_table_entry(entry, head) {
int len = strlen(entry->procname) + 1;
memcpy(link_name, entry->procname, len);
link->procname = link_name;
link->mode = S_IFLNK|S_IRWXUGO;
- link->data = link_root;
+ link->data = head->root;
link_name += len;
link++;
}
- init_header(links, dir->header.root, dir->header.set, node, link_table);
+ init_header(links, dir->header.root, dir->header.set, node, link_table,
+ head->ctl_table_size);
links->nreg = nr_entries;
return links;
}
static bool get_links(struct ctl_dir *dir,
- struct ctl_table *table, struct ctl_table_root *link_root)
+ struct ctl_table_header *header,
+ struct ctl_table_root *link_root)
{
- struct ctl_table_header *head;
+ struct ctl_table_header *tmp_head;
struct ctl_table *entry, *link;
/* Are there links available for every entry in table? */
- list_for_each_table_entry(entry, table) {
+ list_for_each_table_entry(entry, header) {
const char *procname = entry->procname;
- link = find_entry(&head, dir, procname, strlen(procname));
+ link = find_entry(&tmp_head, dir, procname, strlen(procname));
if (!link)
return false;
if (S_ISDIR(link->mode) && S_ISDIR(entry->mode))
@@ -1224,10 +1227,10 @@ static bool get_links(struct ctl_dir *dir,
}
/* The checks passed. Increase the registration count on the links */
- list_for_each_table_entry(entry, table) {
+ list_for_each_table_entry(entry, header) {
const char *procname = entry->procname;
- link = find_entry(&head, dir, procname, strlen(procname));
- head->nreg++;
+ link = find_entry(&tmp_head, dir, procname, strlen(procname));
+ tmp_head->nreg++;
}
return true;
}
@@ -1246,13 +1249,13 @@ static int insert_links(struct ctl_table_header *head)
if (IS_ERR(core_parent))
return 0;
- if (get_links(core_parent, head->ctl_table, head->root))
+ if (get_links(core_parent, head, head->root))
return 0;
core_parent->header.nreg++;
spin_unlock(&sysctl_lock);
- links = new_links(core_parent, head->ctl_table, head->root);
+ links = new_links(core_parent, head);
spin_lock(&sysctl_lock);
err = -ENOMEM;
@@ -1260,7 +1263,7 @@ static int insert_links(struct ctl_table_header *head)
goto out;
err = 0;
- if (get_links(core_parent, head->ctl_table, head->root)) {
+ if (get_links(core_parent, head, head->root)) {
kfree(links);
goto out;
}
@@ -1310,6 +1313,7 @@ static struct ctl_dir *sysctl_mkdir_p(struct ctl_dir *dir, const char *path)
* should not be free'd after registration. So it should not be
* used on stack. It can either be a global or dynamically allocated
* by the caller and free'd later after sysctl unregistration.
+ * @table_size : The number of elements in table
*
* Register a sysctl table hierarchy. @table should be a filled in ctl_table
* array. A completely 0 filled entry terminates the table.
@@ -1352,26 +1356,21 @@ static struct ctl_dir *sysctl_mkdir_p(struct ctl_dir *dir, const char *path)
*/
struct ctl_table_header *__register_sysctl_table(
struct ctl_table_set *set,
- const char *path, struct ctl_table *table)
+ const char *path, struct ctl_table *table, size_t table_size)
{
struct ctl_table_root *root = set->dir.header.root;
struct ctl_table_header *header;
struct ctl_dir *dir;
- struct ctl_table *entry;
struct ctl_node *node;
- int nr_entries = 0;
-
- list_for_each_table_entry(entry, table)
- nr_entries++;
header = kzalloc(sizeof(struct ctl_table_header) +
- sizeof(struct ctl_node)*nr_entries, GFP_KERNEL_ACCOUNT);
+ sizeof(struct ctl_node)*table_size, GFP_KERNEL_ACCOUNT);
if (!header)
return NULL;
node = (struct ctl_node *)(header + 1);
- init_header(header, root, set, node, table);
- if (sysctl_check_table(path, table))
+ init_header(header, root, set, node, table, table_size);
+ if (sysctl_check_table(path, header))
goto fail;
spin_lock(&sysctl_lock);
@@ -1401,7 +1400,7 @@ fail:
}
/**
- * register_sysctl - register a sysctl table
+ * register_sysctl_sz - register a sysctl table
* @path: The path to the directory the sysctl table is in. If the path
* doesn't exist we will create it for you.
* @table: the table structure. The calller must ensure the life of the @table
@@ -1411,18 +1410,20 @@ fail:
* to call unregister_sysctl_table() and can instead use something like
* register_sysctl_init() which does not care for the result of the syctl
* registration.
+ * @table_size: The number of elements in table.
*
* Register a sysctl table. @table should be a filled in ctl_table
* array. A completely 0 filled entry terminates the table.
*
* See __register_sysctl_table for more details.
*/
-struct ctl_table_header *register_sysctl(const char *path, struct ctl_table *table)
+struct ctl_table_header *register_sysctl_sz(const char *path, struct ctl_table *table,
+ size_t table_size)
{
return __register_sysctl_table(&sysctl_table_root.default_set,
- path, table);
+ path, table, table_size);
}
-EXPORT_SYMBOL(register_sysctl);
+EXPORT_SYMBOL(register_sysctl_sz);
/**
* __register_sysctl_init() - register sysctl table to path
@@ -1433,6 +1434,7 @@ EXPORT_SYMBOL(register_sysctl);
* lifetime use of the sysctl.
* @table_name: The name of sysctl table, only used for log printing when
* registration fails
+ * @table_size: The number of elements in table
*
* The sysctl interface is used by userspace to query or modify at runtime
* a predefined value set on a variable. These variables however have default
@@ -1445,12 +1447,12 @@ EXPORT_SYMBOL(register_sysctl);
* Context: if your base directory does not exist it will be created for you.
*/
void __init __register_sysctl_init(const char *path, struct ctl_table *table,
- const char *table_name)
+ const char *table_name, size_t table_size)
{
- struct ctl_table_header *hdr = register_sysctl(path, table);
+ struct ctl_table_header *hdr = register_sysctl_sz(path, table, table_size);
if (unlikely(!hdr)) {
- pr_err("failed when register_sysctl %s to %s\n", table_name, path);
+ pr_err("failed when register_sysctl_sz %s to %s\n", table_name, path);
return;
}
kmemleak_not_leak(hdr);
@@ -1471,7 +1473,7 @@ static void put_links(struct ctl_table_header *header)
if (IS_ERR(core_parent))
return;
- list_for_each_table_entry(entry, header->ctl_table) {
+ list_for_each_table_entry(entry, header) {
struct ctl_table_header *link_head;
struct ctl_table *link;
const char *name = entry->procname;
@@ -1535,7 +1537,7 @@ void setup_sysctl_set(struct ctl_table_set *set,
{
memset(set, 0, sizeof(*set));
set->is_seen = is_seen;
- init_header(&set->dir.header, root, set, NULL, root_table);
+ init_header(&set->dir.header, root, set, NULL, root_table, 1);
}
void retire_sysctl_set(struct ctl_table_set *set)
diff --git a/fs/proc/root.c b/fs/proc/root.c
index a86e65a608da..9191248f2dac 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -314,7 +314,8 @@ static int proc_root_getattr(struct mnt_idmap *idmap,
const struct path *path, struct kstat *stat,
u32 request_mask, unsigned int query_flags)
{
- generic_fillattr(&nop_mnt_idmap, d_inode(path->dentry), stat);
+ generic_fillattr(&nop_mnt_idmap, request_mask, d_inode(path->dentry),
+ stat);
stat->nlink = proc_root.nlink + nr_processes();
return 0;
}
diff --git a/fs/proc/self.c b/fs/proc/self.c
index 72cd69bcaf4a..ecc4da8d265e 100644
--- a/fs/proc/self.c
+++ b/fs/proc/self.c
@@ -46,7 +46,7 @@ int proc_setup_self(struct super_block *s)
struct inode *inode = new_inode(s);
if (inode) {
inode->i_ino = self_inum;
- inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
inode->i_mode = S_IFLNK | S_IRWXUGO;
inode->i_uid = GLOBAL_ROOT_UID;
inode->i_gid = GLOBAL_ROOT_GID;
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 507cd4e59d07..3dd5be96691b 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -4,6 +4,7 @@
#include <linux/hugetlb.h>
#include <linux/huge_mm.h>
#include <linux/mount.h>
+#include <linux/ksm.h>
#include <linux/seq_file.h>
#include <linux/highmem.h>
#include <linux/ptrace.h>
@@ -236,21 +237,6 @@ static int do_maps_open(struct inode *inode, struct file *file,
sizeof(struct proc_maps_private));
}
-/*
- * Indicate if the VMA is a stack for the given task; for
- * /proc/PID/maps that is the stack of the main task.
- */
-static int is_stack(struct vm_area_struct *vma)
-{
- /*
- * We make no effort to guess what a given thread considers to be
- * its "stack". It's not even well-defined for programs written
- * languages like Go.
- */
- return vma->vm_start <= vma->vm_mm->start_stack &&
- vma->vm_end >= vma->vm_mm->start_stack;
-}
-
static void show_vma_header_prefix(struct seq_file *m,
unsigned long start, unsigned long end,
vm_flags_t flags, unsigned long long pgoff,
@@ -327,13 +313,12 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
goto done;
}
- if (vma->vm_start <= mm->brk &&
- vma->vm_end >= mm->start_brk) {
+ if (vma_is_initial_heap(vma)) {
name = "[heap]";
goto done;
}
- if (is_stack(vma)) {
+ if (vma_is_initial_stack(vma)) {
name = "[stack]";
goto done;
}
@@ -412,6 +397,7 @@ struct mem_size_stats {
unsigned long swap;
unsigned long shared_hugetlb;
unsigned long private_hugetlb;
+ unsigned long ksm;
u64 pss;
u64 pss_anon;
u64 pss_file;
@@ -468,6 +454,9 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page,
mss->lazyfree += size;
}
+ if (PageKsm(page))
+ mss->ksm += size;
+
mss->resident += size;
/* Accumulate the size in pages that have been accessed. */
if (young || page_is_young(page) || PageReferenced(page))
@@ -587,8 +576,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
bool migration = false;
if (pmd_present(*pmd)) {
- /* FOLL_DUMP will return -EFAULT on huge zero page */
- page = follow_trans_huge_pmd(vma, addr, pmd, FOLL_DUMP);
+ page = vm_normal_page_pmd(vma, addr, *pmd);
} else if (unlikely(thp_migration_supported() && is_swap_pmd(*pmd))) {
swp_entry_t entry = pmd_to_swp_entry(*pmd);
@@ -709,6 +697,9 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
[ilog2(VM_UFFD_MINOR)] = "ui",
#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */
+#ifdef CONFIG_X86_USER_SHADOW_STACK
+ [ilog2(VM_SHADOW_STACK)] = "ss",
+#endif
};
size_t i;
@@ -758,12 +749,14 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
static const struct mm_walk_ops smaps_walk_ops = {
.pmd_entry = smaps_pte_range,
.hugetlb_entry = smaps_hugetlb_range,
+ .walk_lock = PGWALK_RDLOCK,
};
static const struct mm_walk_ops smaps_shmem_walk_ops = {
.pmd_entry = smaps_pte_range,
.hugetlb_entry = smaps_hugetlb_range,
.pte_hole = smaps_pte_hole,
+ .walk_lock = PGWALK_RDLOCK,
};
/*
@@ -837,6 +830,7 @@ static void __show_smap(struct seq_file *m, const struct mem_size_stats *mss,
SEQ_PUT_DEC(" kB\nPrivate_Dirty: ", mss->private_dirty);
SEQ_PUT_DEC(" kB\nReferenced: ", mss->referenced);
SEQ_PUT_DEC(" kB\nAnonymous: ", mss->anonymous);
+ SEQ_PUT_DEC(" kB\nKSM: ", mss->ksm);
SEQ_PUT_DEC(" kB\nLazyFree: ", mss->lazyfree);
SEQ_PUT_DEC(" kB\nAnonHugePages: ", mss->anonymous_thp);
SEQ_PUT_DEC(" kB\nShmemPmdMapped: ", mss->shmem_thp);
@@ -870,7 +864,7 @@ static int show_smap(struct seq_file *m, void *v)
__show_smap(m, &mss, false);
- seq_printf(m, "THPeligible: %d\n",
+ seq_printf(m, "THPeligible: %8u\n",
hugepage_vma_check(vma, vma->vm_flags, true, false, true));
if (arch_pkeys_enabled())
@@ -1245,6 +1239,7 @@ static int clear_refs_test_walk(unsigned long start, unsigned long end,
static const struct mm_walk_ops clear_refs_walk_ops = {
.pmd_entry = clear_refs_pte_range,
.test_walk = clear_refs_test_walk,
+ .walk_lock = PGWALK_WRLOCK,
};
static ssize_t clear_refs_write(struct file *file, const char __user *buf,
@@ -1622,6 +1617,7 @@ static const struct mm_walk_ops pagemap_ops = {
.pmd_entry = pagemap_pmd_range,
.pte_hole = pagemap_pte_hole,
.hugetlb_entry = pagemap_hugetlb_range,
+ .walk_lock = PGWALK_RDLOCK,
};
/*
@@ -1935,6 +1931,7 @@ static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask,
static const struct mm_walk_ops show_numa_ops = {
.hugetlb_entry = gather_hugetlb_stats,
.pmd_entry = gather_pte_stats,
+ .walk_lock = PGWALK_RDLOCK,
};
/*
@@ -1971,9 +1968,9 @@ static int show_numa_map(struct seq_file *m, void *v)
if (file) {
seq_puts(m, " file=");
seq_file_path(m, file, "\n\t= ");
- } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
+ } else if (vma_is_initial_heap(vma)) {
seq_puts(m, " heap");
- } else if (is_stack(vma)) {
+ } else if (vma_is_initial_stack(vma)) {
seq_puts(m, " stack");
}
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 2c8b62265981..7cebd397cc26 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -121,19 +121,6 @@ unsigned long task_statm(struct mm_struct *mm,
return size;
}
-static int is_stack(struct vm_area_struct *vma)
-{
- struct mm_struct *mm = vma->vm_mm;
-
- /*
- * We make no effort to guess what a given thread considers to be
- * its "stack". It's not even well-defined for programs written
- * languages like Go.
- */
- return vma->vm_start <= mm->start_stack &&
- vma->vm_end >= mm->start_stack;
-}
-
/*
* display a single VMA to a sequenced file
*/
@@ -171,7 +158,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
if (file) {
seq_pad(m, ' ');
seq_file_path(m, file, "");
- } else if (mm && is_stack(vma)) {
+ } else if (mm && vma_is_initial_stack(vma)) {
seq_pad(m, ' ');
seq_puts(m, "[stack]");
}
@@ -188,15 +175,28 @@ static int show_map(struct seq_file *m, void *_p)
return nommu_vma_show(m, _p);
}
-static void *m_start(struct seq_file *m, loff_t *pos)
+static struct vm_area_struct *proc_get_vma(struct proc_maps_private *priv,
+ loff_t *ppos)
+{
+ struct vm_area_struct *vma = vma_next(&priv->iter);
+
+ if (vma) {
+ *ppos = vma->vm_start;
+ } else {
+ *ppos = -1UL;
+ }
+
+ return vma;
+}
+
+static void *m_start(struct seq_file *m, loff_t *ppos)
{
struct proc_maps_private *priv = m->private;
+ unsigned long last_addr = *ppos;
struct mm_struct *mm;
- struct vm_area_struct *vma;
- unsigned long addr = *pos;
- /* See m_next(). Zero at the start or after lseek. */
- if (addr == -1UL)
+ /* See proc_get_vma(). Zero at the start or after lseek. */
+ if (last_addr == -1UL)
return NULL;
/* pin the task and mm whilst we play with them */
@@ -205,44 +205,41 @@ static void *m_start(struct seq_file *m, loff_t *pos)
return ERR_PTR(-ESRCH);
mm = priv->mm;
- if (!mm || !mmget_not_zero(mm))
+ if (!mm || !mmget_not_zero(mm)) {
+ put_task_struct(priv->task);
+ priv->task = NULL;
return NULL;
+ }
if (mmap_read_lock_killable(mm)) {
mmput(mm);
+ put_task_struct(priv->task);
+ priv->task = NULL;
return ERR_PTR(-EINTR);
}
- /* start the next element from addr */
- vma = find_vma(mm, addr);
- if (vma)
- return vma;
+ vma_iter_init(&priv->iter, mm, last_addr);
- mmap_read_unlock(mm);
- mmput(mm);
- return NULL;
+ return proc_get_vma(priv, ppos);
}
-static void m_stop(struct seq_file *m, void *_vml)
+static void m_stop(struct seq_file *m, void *v)
{
struct proc_maps_private *priv = m->private;
+ struct mm_struct *mm = priv->mm;
- if (!IS_ERR_OR_NULL(_vml)) {
- mmap_read_unlock(priv->mm);
- mmput(priv->mm);
- }
- if (priv->task) {
- put_task_struct(priv->task);
- priv->task = NULL;
- }
+ if (!priv->task)
+ return;
+
+ mmap_read_unlock(mm);
+ mmput(mm);
+ put_task_struct(priv->task);
+ priv->task = NULL;
}
-static void *m_next(struct seq_file *m, void *_p, loff_t *pos)
+static void *m_next(struct seq_file *m, void *_p, loff_t *ppos)
{
- struct vm_area_struct *vma = _p;
-
- *pos = vma->vm_end;
- return find_vma(vma->vm_mm, vma->vm_end);
+ return proc_get_vma(m->private, ppos);
}
static const struct seq_operations proc_pid_maps_ops = {
diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c
index a553273fbd41..63ac1f93289f 100644
--- a/fs/proc/thread_self.c
+++ b/fs/proc/thread_self.c
@@ -46,7 +46,7 @@ int proc_setup_thread_self(struct super_block *s)
struct inode *inode = new_inode(s);
if (inode) {
inode->i_ino = thread_self_inum;
- inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
inode->i_mode = S_IFLNK | S_IRWXUGO;
inode->i_uid = GLOBAL_ROOT_UID;
inode->i_gid = GLOBAL_ROOT_GID;
diff --git a/fs/pstore/Kconfig b/fs/pstore/Kconfig
index c49d554cc9ae..3acc38600cd1 100644
--- a/fs/pstore/Kconfig
+++ b/fs/pstore/Kconfig
@@ -1,7 +1,6 @@
# SPDX-License-Identifier: GPL-2.0-only
config PSTORE
tristate "Persistent store support"
- select CRYPTO if PSTORE_COMPRESS
default n
help
This option enables generic access to platform level
@@ -22,99 +21,18 @@ config PSTORE_DEFAULT_KMSG_BYTES
Defines default size of pstore kernel log storage.
Can be enlarged if needed, not recommended to shrink it.
-config PSTORE_DEFLATE_COMPRESS
- tristate "DEFLATE (ZLIB) compression"
- default y
- depends on PSTORE
- select CRYPTO_DEFLATE
- help
- This option enables DEFLATE (also known as ZLIB) compression
- algorithm support.
-
-config PSTORE_LZO_COMPRESS
- tristate "LZO compression"
- depends on PSTORE
- select CRYPTO_LZO
- help
- This option enables LZO compression algorithm support.
-
-config PSTORE_LZ4_COMPRESS
- tristate "LZ4 compression"
- depends on PSTORE
- select CRYPTO_LZ4
- help
- This option enables LZ4 compression algorithm support.
-
-config PSTORE_LZ4HC_COMPRESS
- tristate "LZ4HC compression"
- depends on PSTORE
- select CRYPTO_LZ4HC
- help
- This option enables LZ4HC (high compression) mode algorithm.
-
-config PSTORE_842_COMPRESS
- bool "842 compression"
- depends on PSTORE
- select CRYPTO_842
- help
- This option enables 842 compression algorithm support.
-
-config PSTORE_ZSTD_COMPRESS
- bool "zstd compression"
- depends on PSTORE
- select CRYPTO_ZSTD
- help
- This option enables zstd compression algorithm support.
-
config PSTORE_COMPRESS
- def_bool y
+ bool "Pstore compression (deflate)"
depends on PSTORE
- depends on PSTORE_DEFLATE_COMPRESS || PSTORE_LZO_COMPRESS || \
- PSTORE_LZ4_COMPRESS || PSTORE_LZ4HC_COMPRESS || \
- PSTORE_842_COMPRESS || PSTORE_ZSTD_COMPRESS
-
-choice
- prompt "Default pstore compression algorithm"
- depends on PSTORE_COMPRESS
+ select ZLIB_INFLATE
+ select ZLIB_DEFLATE
+ default y
help
- This option chooses the default active compression algorithm.
- This change be changed at boot with "pstore.compress=..." on
- the kernel command line.
-
- Currently, pstore has support for 6 compression algorithms:
- deflate, lzo, lz4, lz4hc, 842 and zstd.
-
- The default compression algorithm is deflate.
-
- config PSTORE_DEFLATE_COMPRESS_DEFAULT
- bool "deflate" if PSTORE_DEFLATE_COMPRESS
-
- config PSTORE_LZO_COMPRESS_DEFAULT
- bool "lzo" if PSTORE_LZO_COMPRESS
-
- config PSTORE_LZ4_COMPRESS_DEFAULT
- bool "lz4" if PSTORE_LZ4_COMPRESS
-
- config PSTORE_LZ4HC_COMPRESS_DEFAULT
- bool "lz4hc" if PSTORE_LZ4HC_COMPRESS
-
- config PSTORE_842_COMPRESS_DEFAULT
- bool "842" if PSTORE_842_COMPRESS
-
- config PSTORE_ZSTD_COMPRESS_DEFAULT
- bool "zstd" if PSTORE_ZSTD_COMPRESS
-
-endchoice
-
-config PSTORE_COMPRESS_DEFAULT
- string
- depends on PSTORE_COMPRESS
- default "deflate" if PSTORE_DEFLATE_COMPRESS_DEFAULT
- default "lzo" if PSTORE_LZO_COMPRESS_DEFAULT
- default "lz4" if PSTORE_LZ4_COMPRESS_DEFAULT
- default "lz4hc" if PSTORE_LZ4HC_COMPRESS_DEFAULT
- default "842" if PSTORE_842_COMPRESS_DEFAULT
- default "zstd" if PSTORE_ZSTD_COMPRESS_DEFAULT
+ Whether pstore records should be compressed before being written to
+ the backing store. This is implemented using the zlib 'deflate'
+ algorithm, using the library implementation instead of using the full
+ blown crypto API. This reduces the risk of secondary oopses or other
+ problems while pstore is recording panic metadata.
config PSTORE_CONSOLE
bool "Log kernel console messages"
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index ffbadb8b3032..585360706b33 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -54,7 +54,7 @@ static void free_pstore_private(struct pstore_private *private)
if (!private)
return;
if (private->record) {
- kfree(private->record->buf);
+ kvfree(private->record->buf);
kfree(private->record->priv);
kfree(private->record);
}
@@ -223,7 +223,7 @@ static struct inode *pstore_get_inode(struct super_block *sb)
struct inode *inode = new_inode(sb);
if (inode) {
inode->i_ino = get_next_ino();
- inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
}
return inode;
}
@@ -390,7 +390,7 @@ int pstore_mkfile(struct dentry *root, struct pstore_record *record)
inode->i_private = private;
if (record->time.tv_sec)
- inode->i_mtime = inode->i_ctime = record->time;
+ inode->i_mtime = inode_set_ctime_to_ts(inode, record->time);
d_add(dentry, inode);
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index cbc0b468c1ab..e5bca9a004cc 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -14,24 +14,17 @@
#include <linux/init.h>
#include <linux/kmsg_dump.h>
#include <linux/console.h>
+#include <linux/mm.h>
#include <linux/module.h>
#include <linux/pstore.h>
-#if IS_ENABLED(CONFIG_PSTORE_LZO_COMPRESS)
-#include <linux/lzo.h>
-#endif
-#if IS_ENABLED(CONFIG_PSTORE_LZ4_COMPRESS) || IS_ENABLED(CONFIG_PSTORE_LZ4HC_COMPRESS)
-#include <linux/lz4.h>
-#endif
-#if IS_ENABLED(CONFIG_PSTORE_ZSTD_COMPRESS)
-#include <linux/zstd.h>
-#endif
-#include <linux/crypto.h>
#include <linux/string.h>
#include <linux/timer.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/jiffies.h>
+#include <linux/vmalloc.h>
#include <linux/workqueue.h>
+#include <linux/zlib.h>
#include "internal.h"
@@ -80,12 +73,21 @@ static char *backend;
module_param(backend, charp, 0444);
MODULE_PARM_DESC(backend, "specific backend to use");
-static char *compress =
-#ifdef CONFIG_PSTORE_COMPRESS_DEFAULT
- CONFIG_PSTORE_COMPRESS_DEFAULT;
-#else
- NULL;
-#endif
+/*
+ * pstore no longer implements compression via the crypto API, and only
+ * supports zlib deflate compression implemented using the zlib library
+ * interface. This removes additional complexity which is hard to justify for a
+ * diagnostic facility that has to operate in conditions where the system may
+ * have become unstable. Zlib deflate is comparatively small in terms of code
+ * size, and compresses ASCII text comparatively well. In terms of compression
+ * speed, deflate is not the best performer but for recording the log output on
+ * a kernel panic, this is not considered critical.
+ *
+ * The only remaining arguments supported by the compress= module parameter are
+ * 'deflate' and 'none'. To retain compatibility with existing installations,
+ * all other values are logged and replaced with 'deflate'.
+ */
+static char *compress = "deflate";
module_param(compress, charp, 0444);
MODULE_PARM_DESC(compress, "compression to use");
@@ -94,16 +96,16 @@ unsigned long kmsg_bytes = CONFIG_PSTORE_DEFAULT_KMSG_BYTES;
module_param(kmsg_bytes, ulong, 0444);
MODULE_PARM_DESC(kmsg_bytes, "amount of kernel log to snapshot (in bytes)");
-/* Compression parameters */
-static struct crypto_comp *tfm;
+static void *compress_workspace;
-struct pstore_zbackend {
- int (*zbufsize)(size_t size);
- const char *name;
-};
+/*
+ * Compression is only used for dmesg output, which consists of low-entropy
+ * ASCII text, and so we can assume worst-case 60%.
+ */
+#define DMESG_COMP_PERCENT 60
static char *big_oops_buf;
-static size_t big_oops_buf_sz;
+static size_t max_compressed_size;
void pstore_set_kmsg_bytes(int bytes)
{
@@ -168,206 +170,93 @@ static bool pstore_cannot_block_path(enum kmsg_dump_reason reason)
}
}
-#if IS_ENABLED(CONFIG_PSTORE_DEFLATE_COMPRESS)
-static int zbufsize_deflate(size_t size)
-{
- size_t cmpr;
-
- switch (size) {
- /* buffer range for efivars */
- case 1000 ... 2000:
- cmpr = 56;
- break;
- case 2001 ... 3000:
- cmpr = 54;
- break;
- case 3001 ... 3999:
- cmpr = 52;
- break;
- /* buffer range for nvram, erst */
- case 4000 ... 10000:
- cmpr = 45;
- break;
- default:
- cmpr = 60;
- break;
- }
-
- return (size * 100) / cmpr;
-}
-#endif
-
-#if IS_ENABLED(CONFIG_PSTORE_LZO_COMPRESS)
-static int zbufsize_lzo(size_t size)
-{
- return lzo1x_worst_compress(size);
-}
-#endif
-
-#if IS_ENABLED(CONFIG_PSTORE_LZ4_COMPRESS) || IS_ENABLED(CONFIG_PSTORE_LZ4HC_COMPRESS)
-static int zbufsize_lz4(size_t size)
-{
- return LZ4_compressBound(size);
-}
-#endif
-
-#if IS_ENABLED(CONFIG_PSTORE_842_COMPRESS)
-static int zbufsize_842(size_t size)
-{
- return size;
-}
-#endif
-
-#if IS_ENABLED(CONFIG_PSTORE_ZSTD_COMPRESS)
-static int zbufsize_zstd(size_t size)
-{
- return zstd_compress_bound(size);
-}
-#endif
-
-static const struct pstore_zbackend *zbackend __ro_after_init;
-
-static const struct pstore_zbackend zbackends[] = {
-#if IS_ENABLED(CONFIG_PSTORE_DEFLATE_COMPRESS)
- {
- .zbufsize = zbufsize_deflate,
- .name = "deflate",
- },
-#endif
-#if IS_ENABLED(CONFIG_PSTORE_LZO_COMPRESS)
- {
- .zbufsize = zbufsize_lzo,
- .name = "lzo",
- },
-#endif
-#if IS_ENABLED(CONFIG_PSTORE_LZ4_COMPRESS)
- {
- .zbufsize = zbufsize_lz4,
- .name = "lz4",
- },
-#endif
-#if IS_ENABLED(CONFIG_PSTORE_LZ4HC_COMPRESS)
- {
- .zbufsize = zbufsize_lz4,
- .name = "lz4hc",
- },
-#endif
-#if IS_ENABLED(CONFIG_PSTORE_842_COMPRESS)
- {
- .zbufsize = zbufsize_842,
- .name = "842",
- },
-#endif
-#if IS_ENABLED(CONFIG_PSTORE_ZSTD_COMPRESS)
- {
- .zbufsize = zbufsize_zstd,
- .name = "zstd",
- },
-#endif
- { }
-};
-
static int pstore_compress(const void *in, void *out,
unsigned int inlen, unsigned int outlen)
{
+ struct z_stream_s zstream = {
+ .next_in = in,
+ .avail_in = inlen,
+ .next_out = out,
+ .avail_out = outlen,
+ .workspace = compress_workspace,
+ };
int ret;
if (!IS_ENABLED(CONFIG_PSTORE_COMPRESS))
return -EINVAL;
- ret = crypto_comp_compress(tfm, in, inlen, out, &outlen);
- if (ret) {
- pr_err("crypto_comp_compress failed, ret = %d!\n", ret);
- return ret;
- }
+ ret = zlib_deflateInit2(&zstream, Z_DEFAULT_COMPRESSION, Z_DEFLATED,
+ -MAX_WBITS, DEF_MEM_LEVEL, Z_DEFAULT_STRATEGY);
+ if (ret != Z_OK)
+ return -EINVAL;
- return outlen;
+ ret = zlib_deflate(&zstream, Z_FINISH);
+ if (ret != Z_STREAM_END)
+ return -EINVAL;
+
+ ret = zlib_deflateEnd(&zstream);
+ if (ret != Z_OK)
+ pr_warn_once("zlib_deflateEnd() failed: %d\n", ret);
+
+ return zstream.total_out;
}
static void allocate_buf_for_compression(void)
{
- struct crypto_comp *ctx;
- int size;
+ size_t compressed_size;
char *buf;
- /* Skip if not built-in or compression backend not selected yet. */
- if (!IS_ENABLED(CONFIG_PSTORE_COMPRESS) || !zbackend)
- return;
-
- /* Skip if no pstore backend yet or compression init already done. */
- if (!psinfo || tfm)
- return;
-
- if (!crypto_has_comp(zbackend->name, 0, 0)) {
- pr_err("Unknown compression: %s\n", zbackend->name);
+ /* Skip if not built-in or compression disabled. */
+ if (!IS_ENABLED(CONFIG_PSTORE_COMPRESS) || !compress ||
+ !strcmp(compress, "none")) {
+ compress = NULL;
return;
}
- size = zbackend->zbufsize(psinfo->bufsize);
- if (size <= 0) {
- pr_err("Invalid compression size for %s: %d\n",
- zbackend->name, size);
- return;
+ if (strcmp(compress, "deflate")) {
+ pr_err("Unsupported compression '%s', falling back to deflate\n",
+ compress);
+ compress = "deflate";
}
- buf = kmalloc(size, GFP_KERNEL);
+ /*
+ * The compression buffer only needs to be as large as the maximum
+ * uncompressed record size, since any record that would be expanded by
+ * compression is just stored uncompressed.
+ */
+ compressed_size = (psinfo->bufsize * 100) / DMESG_COMP_PERCENT;
+ buf = kvzalloc(compressed_size, GFP_KERNEL);
if (!buf) {
- pr_err("Failed %d byte compression buffer allocation for: %s\n",
- size, zbackend->name);
+ pr_err("Failed %zu byte compression buffer allocation for: %s\n",
+ psinfo->bufsize, compress);
return;
}
- ctx = crypto_alloc_comp(zbackend->name, 0, 0);
- if (IS_ERR_OR_NULL(ctx)) {
- kfree(buf);
- pr_err("crypto_alloc_comp('%s') failed: %ld\n", zbackend->name,
- PTR_ERR(ctx));
+ compress_workspace =
+ vmalloc(zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL));
+ if (!compress_workspace) {
+ pr_err("Failed to allocate zlib deflate workspace\n");
+ kvfree(buf);
return;
}
/* A non-NULL big_oops_buf indicates compression is available. */
- tfm = ctx;
- big_oops_buf_sz = size;
big_oops_buf = buf;
+ max_compressed_size = compressed_size;
- pr_info("Using crash dump compression: %s\n", zbackend->name);
+ pr_info("Using crash dump compression: %s\n", compress);
}
static void free_buf_for_compression(void)
{
- if (IS_ENABLED(CONFIG_PSTORE_COMPRESS) && tfm) {
- crypto_free_comp(tfm);
- tfm = NULL;
+ if (IS_ENABLED(CONFIG_PSTORE_COMPRESS) && compress_workspace) {
+ vfree(compress_workspace);
+ compress_workspace = NULL;
}
- kfree(big_oops_buf);
- big_oops_buf = NULL;
- big_oops_buf_sz = 0;
-}
-/*
- * Called when compression fails, since the printk buffer
- * would be fetched for compression calling it again when
- * compression fails would have moved the iterator of
- * printk buffer which results in fetching old contents.
- * Copy the recent messages from big_oops_buf to psinfo->buf
- */
-static size_t copy_kmsg_to_buffer(int hsize, size_t len)
-{
- size_t total_len;
- size_t diff;
-
- total_len = hsize + len;
-
- if (total_len > psinfo->bufsize) {
- diff = total_len - psinfo->bufsize + hsize;
- memcpy(psinfo->buf, big_oops_buf, hsize);
- memcpy(psinfo->buf + hsize, big_oops_buf + diff,
- psinfo->bufsize - hsize);
- total_len = psinfo->bufsize;
- } else
- memcpy(psinfo->buf, big_oops_buf, total_len);
-
- return total_len;
+ kvfree(big_oops_buf);
+ big_oops_buf = NULL;
+ max_compressed_size = 0;
}
void pstore_record_init(struct pstore_record *record,
@@ -426,13 +315,8 @@ static void pstore_dump(struct kmsg_dumper *dumper,
record.part = part;
record.buf = psinfo->buf;
- if (big_oops_buf) {
- dst = big_oops_buf;
- dst_size = big_oops_buf_sz;
- } else {
- dst = psinfo->buf;
- dst_size = psinfo->bufsize;
- }
+ dst = big_oops_buf ?: psinfo->buf;
+ dst_size = max_compressed_size ?: psinfo->bufsize;
/* Write dump header. */
header_size = snprintf(dst, dst_size, "%s#%d Part%u\n", why,
@@ -453,8 +337,15 @@ static void pstore_dump(struct kmsg_dumper *dumper,
record.compressed = true;
record.size = zipped_len;
} else {
- record.size = copy_kmsg_to_buffer(header_size,
- dump_size);
+ /*
+ * Compression failed, so the buffer is most
+ * likely filled with binary data that does not
+ * compress as well as ASCII text. Copy as much
+ * of the uncompressed data as possible into
+ * the pstore record, and discard the rest.
+ */
+ record.size = psinfo->bufsize;
+ memcpy(psinfo->buf, dst, psinfo->bufsize);
}
} else {
record.size = header_size + dump_size;
@@ -549,7 +440,7 @@ static int pstore_write_user_compat(struct pstore_record *record,
if (record->buf)
return -EINVAL;
- record->buf = memdup_user(buf, record->size);
+ record->buf = vmemdup_user(buf, record->size);
if (IS_ERR(record->buf)) {
ret = PTR_ERR(record->buf);
goto out;
@@ -557,7 +448,7 @@ static int pstore_write_user_compat(struct pstore_record *record,
ret = record->psi->write(record);
- kfree(record->buf);
+ kvfree(record->buf);
out:
record->buf = NULL;
@@ -681,11 +572,13 @@ void pstore_unregister(struct pstore_info *psi)
}
EXPORT_SYMBOL_GPL(pstore_unregister);
-static void decompress_record(struct pstore_record *record)
+static void decompress_record(struct pstore_record *record,
+ struct z_stream_s *zstream)
{
int ret;
int unzipped_len;
char *unzipped, *workspace;
+ size_t max_uncompressed_size;
if (!IS_ENABLED(CONFIG_PSTORE_COMPRESS) || !record->compressed)
return;
@@ -697,40 +590,51 @@ static void decompress_record(struct pstore_record *record)
}
/* Missing compression buffer means compression was not initialized. */
- if (!big_oops_buf) {
+ if (!zstream->workspace) {
pr_warn("no decompression method initialized!\n");
return;
}
+ ret = zlib_inflateReset(zstream);
+ if (ret != Z_OK) {
+ pr_err("zlib_inflateReset() failed, ret = %d!\n", ret);
+ return;
+ }
+
/* Allocate enough space to hold max decompression and ECC. */
- unzipped_len = big_oops_buf_sz;
- workspace = kmalloc(unzipped_len + record->ecc_notice_size,
- GFP_KERNEL);
+ max_uncompressed_size = 3 * psinfo->bufsize;
+ workspace = kvzalloc(max_uncompressed_size + record->ecc_notice_size,
+ GFP_KERNEL);
if (!workspace)
return;
- /* After decompression "unzipped_len" is almost certainly smaller. */
- ret = crypto_comp_decompress(tfm, record->buf, record->size,
- workspace, &unzipped_len);
- if (ret) {
- pr_err("crypto_comp_decompress failed, ret = %d!\n", ret);
- kfree(workspace);
+ zstream->next_in = record->buf;
+ zstream->avail_in = record->size;
+ zstream->next_out = workspace;
+ zstream->avail_out = max_uncompressed_size;
+
+ ret = zlib_inflate(zstream, Z_FINISH);
+ if (ret != Z_STREAM_END) {
+ pr_err_ratelimited("zlib_inflate() failed, ret = %d!\n", ret);
+ kvfree(workspace);
return;
}
+ unzipped_len = zstream->total_out;
+
/* Append ECC notice to decompressed buffer. */
memcpy(workspace + unzipped_len, record->buf + record->size,
record->ecc_notice_size);
/* Copy decompressed contents into an minimum-sized allocation. */
- unzipped = kmemdup(workspace, unzipped_len + record->ecc_notice_size,
- GFP_KERNEL);
- kfree(workspace);
+ unzipped = kvmemdup(workspace, unzipped_len + record->ecc_notice_size,
+ GFP_KERNEL);
+ kvfree(workspace);
if (!unzipped)
return;
/* Swap out compressed contents with decompressed contents. */
- kfree(record->buf);
+ kvfree(record->buf);
record->buf = unzipped;
record->size = unzipped_len;
record->compressed = false;
@@ -747,10 +651,17 @@ void pstore_get_backend_records(struct pstore_info *psi,
{
int failed = 0;
unsigned int stop_loop = 65536;
+ struct z_stream_s zstream = {};
if (!psi || !root)
return;
+ if (IS_ENABLED(CONFIG_PSTORE_COMPRESS) && compress) {
+ zstream.workspace = kvmalloc(zlib_inflate_workspacesize(),
+ GFP_KERNEL);
+ zlib_inflateInit2(&zstream, -DEF_WBITS);
+ }
+
mutex_lock(&psi->read_mutex);
if (psi->open && psi->open(psi))
goto out;
@@ -779,11 +690,11 @@ void pstore_get_backend_records(struct pstore_info *psi,
break;
}
- decompress_record(record);
+ decompress_record(record, &zstream);
rc = pstore_mkfile(root, record);
if (rc) {
/* pstore_mkfile() did not take record, so free it. */
- kfree(record->buf);
+ kvfree(record->buf);
kfree(record->priv);
kfree(record);
if (rc != -EEXIST || !quiet)
@@ -795,6 +706,12 @@ void pstore_get_backend_records(struct pstore_info *psi,
out:
mutex_unlock(&psi->read_mutex);
+ if (IS_ENABLED(CONFIG_PSTORE_COMPRESS) && compress) {
+ if (zlib_inflateEnd(&zstream) != Z_OK)
+ pr_warn("zlib_inflateEnd() failed\n");
+ kvfree(zstream.workspace);
+ }
+
if (failed)
pr_warn("failed to create %d record(s) from '%s'\n",
failed, psi->name);
@@ -818,34 +735,10 @@ static void pstore_timefunc(struct timer_list *unused)
pstore_timer_kick();
}
-static void __init pstore_choose_compression(void)
-{
- const struct pstore_zbackend *step;
-
- if (!compress)
- return;
-
- for (step = zbackends; step->name; step++) {
- if (!strcmp(compress, step->name)) {
- zbackend = step;
- return;
- }
- }
-}
-
static int __init pstore_init(void)
{
int ret;
- pstore_choose_compression();
-
- /*
- * Check if any pstore backends registered earlier but did not
- * initialize compression because crypto was not ready. If so,
- * initialize compression now.
- */
- allocate_buf_for_compression();
-
ret = pstore_init_fs();
if (ret)
free_buf_for_compression();
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index 2f625e1fa8d8..d36702c7ab3c 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -20,6 +20,7 @@
#include <linux/compiler.h>
#include <linux/of.h>
#include <linux/of_address.h>
+#include <linux/mm.h>
#include "internal.h"
#include "ram_internal.h"
@@ -268,7 +269,7 @@ static ssize_t ramoops_pstore_read(struct pstore_record *record)
/* ECC correction notice */
record->ecc_notice_size = persistent_ram_ecc_string(prz, NULL, 0);
- record->buf = kmalloc(size + record->ecc_notice_size + 1, GFP_KERNEL);
+ record->buf = kvzalloc(size + record->ecc_notice_size + 1, GFP_KERNEL);
if (record->buf == NULL) {
size = -ENOMEM;
goto out;
@@ -282,7 +283,7 @@ static ssize_t ramoops_pstore_read(struct pstore_record *record)
out:
if (free_prz) {
- kfree(prz->old_log);
+ kvfree(prz->old_log);
kfree(prz);
}
@@ -833,7 +834,7 @@ static int ramoops_probe(struct platform_device *pdev)
*/
if (cxt->pstore.flags & PSTORE_FLAGS_DMESG) {
cxt->pstore.bufsize = cxt->dprzs[0]->buffer_size;
- cxt->pstore.buf = kzalloc(cxt->pstore.bufsize, GFP_KERNEL);
+ cxt->pstore.buf = kvzalloc(cxt->pstore.bufsize, GFP_KERNEL);
if (!cxt->pstore.buf) {
pr_err("cannot allocate pstore crash dump buffer\n");
err = -ENOMEM;
@@ -866,7 +867,7 @@ static int ramoops_probe(struct platform_device *pdev)
return 0;
fail_buf:
- kfree(cxt->pstore.buf);
+ kvfree(cxt->pstore.buf);
fail_clear:
cxt->pstore.bufsize = 0;
fail_init:
@@ -881,7 +882,7 @@ static void ramoops_remove(struct platform_device *pdev)
pstore_unregister(&cxt->pstore);
- kfree(cxt->pstore.buf);
+ kvfree(cxt->pstore.buf);
cxt->pstore.bufsize = 0;
ramoops_free_przs(cxt);
diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c
index 85aaf0fc6d7d..650e437b55e6 100644
--- a/fs/pstore/ram_core.c
+++ b/fs/pstore/ram_core.c
@@ -17,6 +17,7 @@
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/vmalloc.h>
+#include <linux/mm.h>
#include <asm/page.h>
#include "ram_internal.h"
@@ -24,12 +25,10 @@
/**
* struct persistent_ram_buffer - persistent circular RAM buffer
*
- * @sig:
- * signature to indicate header (PERSISTENT_RAM_SIG xor PRZ-type value)
- * @start:
- * offset into @data where the beginning of the stored bytes begin
- * @size:
- * number of valid bytes stored in @data
+ * @sig: Signature to indicate header (PERSISTENT_RAM_SIG xor PRZ-type value)
+ * @start: First valid byte in the buffer.
+ * @size: Number of valid bytes in the buffer.
+ * @data: The contents of the buffer.
*/
struct persistent_ram_buffer {
uint32_t sig;
@@ -301,7 +300,7 @@ void persistent_ram_save_old(struct persistent_ram_zone *prz)
if (!prz->old_log) {
persistent_ram_ecc_old(prz);
- prz->old_log = kmalloc(size, GFP_KERNEL);
+ prz->old_log = kvzalloc(size, GFP_KERNEL);
}
if (!prz->old_log) {
pr_err("failed to allocate buffer\n");
@@ -385,7 +384,7 @@ void *persistent_ram_old(struct persistent_ram_zone *prz)
void persistent_ram_free_old(struct persistent_ram_zone *prz)
{
- kfree(prz->old_log);
+ kvfree(prz->old_log);
prz->old_log = NULL;
prz->old_log_size = 0;
}
@@ -519,7 +518,7 @@ static int persistent_ram_post_init(struct persistent_ram_zone *prz, u32 sig,
sig ^= PERSISTENT_RAM_SIG;
if (prz->buffer->sig == sig) {
- if (buffer_size(prz) == 0) {
+ if (buffer_size(prz) == 0 && buffer_start(prz) == 0) {
pr_debug("found existing empty buffer\n");
return 0;
}
diff --git a/fs/qnx4/Kconfig b/fs/qnx4/Kconfig
index 45b5b98376c4..a2eb826e76c6 100644
--- a/fs/qnx4/Kconfig
+++ b/fs/qnx4/Kconfig
@@ -2,6 +2,7 @@
config QNX4FS_FS
tristate "QNX4 file system support (read only)"
depends on BLOCK
+ select BUFFER_HEAD
help
This is the file system used by the real-time operating systems
QNX 4 and QNX 6 (the latter is also called QNX RTP).
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index 391ea402920d..a7171f5532a1 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -305,8 +305,7 @@ struct inode *qnx4_iget(struct super_block *sb, unsigned long ino)
inode->i_mtime.tv_nsec = 0;
inode->i_atime.tv_sec = le32_to_cpu(raw_inode->di_atime);
inode->i_atime.tv_nsec = 0;
- inode->i_ctime.tv_sec = le32_to_cpu(raw_inode->di_ctime);
- inode->i_ctime.tv_nsec = 0;
+ inode_set_ctime(inode, le32_to_cpu(raw_inode->di_ctime), 0);
inode->i_blocks = le32_to_cpu(raw_inode->di_first_xtnt.xtnt_size);
memcpy(qnx4_inode, raw_inode, QNX4_DIR_ENTRY_SIZE);
diff --git a/fs/qnx6/Kconfig b/fs/qnx6/Kconfig
index 6a9d6bce1586..8e865d72204e 100644
--- a/fs/qnx6/Kconfig
+++ b/fs/qnx6/Kconfig
@@ -2,6 +2,7 @@
config QNX6FS_FS
tristate "QNX6 file system support (read only)"
depends on BLOCK && CRC32
+ select BUFFER_HEAD
help
This is the file system used by the real-time operating systems
QNX 6 (also called QNX RTP).
diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c
index 85b2fa3b211c..21f90d519f1a 100644
--- a/fs/qnx6/inode.c
+++ b/fs/qnx6/inode.c
@@ -562,8 +562,7 @@ struct inode *qnx6_iget(struct super_block *sb, unsigned ino)
inode->i_mtime.tv_nsec = 0;
inode->i_atime.tv_sec = fs32_to_cpu(sbi, raw_inode->di_atime);
inode->i_atime.tv_nsec = 0;
- inode->i_ctime.tv_sec = fs32_to_cpu(sbi, raw_inode->di_ctime);
- inode->i_ctime.tv_nsec = 0;
+ inode_set_ctime(inode, fs32_to_cpu(sbi, raw_inode->di_ctime), 0);
/* calc blocks based on 512 byte blocksize */
inode->i_blocks = (inode->i_size + 511) >> 9;
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index e3e4f4047657..31e897ad5e6a 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -225,18 +225,26 @@ static void put_quota_format(struct quota_format_type *fmt)
/*
* Dquot List Management:
- * The quota code uses four lists for dquot management: the inuse_list,
- * free_dquots, dqi_dirty_list, and dquot_hash[] array. A single dquot
- * structure may be on some of those lists, depending on its current state.
+ * The quota code uses five lists for dquot management: the inuse_list,
+ * releasing_dquots, free_dquots, dqi_dirty_list, and dquot_hash[] array.
+ * A single dquot structure may be on some of those lists, depending on
+ * its current state.
*
* All dquots are placed to the end of inuse_list when first created, and this
* list is used for invalidate operation, which must look at every dquot.
*
- * Unused dquots (dq_count == 0) are added to the free_dquots list when freed,
- * and this list is searched whenever we need an available dquot. Dquots are
- * removed from the list as soon as they are used again, and
- * dqstats.free_dquots gives the number of dquots on the list. When
- * dquot is invalidated it's completely released from memory.
+ * When the last reference of a dquot is dropped, the dquot is added to
+ * releasing_dquots. We'll then queue work item which will call
+ * synchronize_srcu() and after that perform the final cleanup of all the
+ * dquots on the list. Each cleaned up dquot is moved to free_dquots list.
+ * Both releasing_dquots and free_dquots use the dq_free list_head in the dquot
+ * struct.
+ *
+ * Unused and cleaned up dquots are in the free_dquots list and this list is
+ * searched whenever we need an available dquot. Dquots are removed from the
+ * list as soon as they are used again and dqstats.free_dquots gives the number
+ * of dquots on the list. When dquot is invalidated it's completely released
+ * from memory.
*
* Dirty dquots are added to the dqi_dirty_list of quota_info when mark
* dirtied, and this list is searched when writing dirty dquots back to
@@ -250,6 +258,7 @@ static void put_quota_format(struct quota_format_type *fmt)
static LIST_HEAD(inuse_list);
static LIST_HEAD(free_dquots);
+static LIST_HEAD(releasing_dquots);
static unsigned int dq_hash_bits, dq_hash_mask;
static struct hlist_head *dquot_hash;
@@ -260,6 +269,9 @@ static qsize_t inode_get_rsv_space(struct inode *inode);
static qsize_t __inode_get_rsv_space(struct inode *inode);
static int __dquot_initialize(struct inode *inode, int type);
+static void quota_release_workfn(struct work_struct *work);
+static DECLARE_DELAYED_WORK(quota_release_work, quota_release_workfn);
+
static inline unsigned int
hashfn(const struct super_block *sb, struct kqid qid)
{
@@ -305,12 +317,21 @@ static inline void put_dquot_last(struct dquot *dquot)
dqstats_inc(DQST_FREE_DQUOTS);
}
+static inline void put_releasing_dquots(struct dquot *dquot)
+{
+ list_add_tail(&dquot->dq_free, &releasing_dquots);
+ set_bit(DQ_RELEASING_B, &dquot->dq_flags);
+}
+
static inline void remove_free_dquot(struct dquot *dquot)
{
if (list_empty(&dquot->dq_free))
return;
list_del_init(&dquot->dq_free);
- dqstats_dec(DQST_FREE_DQUOTS);
+ if (!test_bit(DQ_RELEASING_B, &dquot->dq_flags))
+ dqstats_dec(DQST_FREE_DQUOTS);
+ else
+ clear_bit(DQ_RELEASING_B, &dquot->dq_flags);
}
static inline void put_inuse(struct dquot *dquot)
@@ -336,6 +357,11 @@ static void wait_on_dquot(struct dquot *dquot)
mutex_unlock(&dquot->dq_lock);
}
+static inline int dquot_active(struct dquot *dquot)
+{
+ return test_bit(DQ_ACTIVE_B, &dquot->dq_flags);
+}
+
static inline int dquot_dirty(struct dquot *dquot)
{
return test_bit(DQ_MOD_B, &dquot->dq_flags);
@@ -351,14 +377,14 @@ int dquot_mark_dquot_dirty(struct dquot *dquot)
{
int ret = 1;
- if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags))
+ if (!dquot_active(dquot))
return 0;
if (sb_dqopt(dquot->dq_sb)->flags & DQUOT_NOLIST_DIRTY)
return test_and_set_bit(DQ_MOD_B, &dquot->dq_flags);
/* If quota is dirty already, we don't have to acquire dq_list_lock */
- if (test_bit(DQ_MOD_B, &dquot->dq_flags))
+ if (dquot_dirty(dquot))
return 1;
spin_lock(&dq_list_lock);
@@ -440,7 +466,7 @@ int dquot_acquire(struct dquot *dquot)
smp_mb__before_atomic();
set_bit(DQ_READ_B, &dquot->dq_flags);
/* Instantiate dquot if needed */
- if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags) && !dquot->dq_off) {
+ if (!dquot_active(dquot) && !dquot->dq_off) {
ret = dqopt->ops[dquot->dq_id.type]->commit_dqblk(dquot);
/* Write the info if needed */
if (info_dirty(&dqopt->info[dquot->dq_id.type])) {
@@ -482,7 +508,7 @@ int dquot_commit(struct dquot *dquot)
goto out_lock;
/* Inactive dquot can be only if there was error during read/init
* => we have better not writing it */
- if (test_bit(DQ_ACTIVE_B, &dquot->dq_flags))
+ if (dquot_active(dquot))
ret = dqopt->ops[dquot->dq_id.type]->commit_dqblk(dquot);
else
ret = -EIO;
@@ -547,6 +573,8 @@ static void invalidate_dquots(struct super_block *sb, int type)
struct dquot *dquot, *tmp;
restart:
+ flush_delayed_work(&quota_release_work);
+
spin_lock(&dq_list_lock);
list_for_each_entry_safe(dquot, tmp, &inuse_list, dq_inuse) {
if (dquot->dq_sb != sb)
@@ -574,6 +602,15 @@ restart:
goto restart;
}
/*
+ * The last user already dropped its reference but dquot didn't
+ * get fully cleaned up yet. Restart the scan which flushes the
+ * work cleaning up released dquots.
+ */
+ if (test_bit(DQ_RELEASING_B, &dquot->dq_flags)) {
+ spin_unlock(&dq_list_lock);
+ goto restart;
+ }
+ /*
* Quota now has no users and it has been written on last
* dqput()
*/
@@ -597,7 +634,7 @@ int dquot_scan_active(struct super_block *sb,
spin_lock(&dq_list_lock);
list_for_each_entry(dquot, &inuse_list, dq_inuse) {
- if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags))
+ if (!dquot_active(dquot))
continue;
if (dquot->dq_sb != sb)
continue;
@@ -612,7 +649,7 @@ int dquot_scan_active(struct super_block *sb,
* outstanding call and recheck the DQ_ACTIVE_B after that.
*/
wait_on_dquot(dquot);
- if (test_bit(DQ_ACTIVE_B, &dquot->dq_flags)) {
+ if (dquot_active(dquot)) {
ret = fn(dquot, priv);
if (ret < 0)
goto out;
@@ -628,6 +665,18 @@ out:
}
EXPORT_SYMBOL(dquot_scan_active);
+static inline int dquot_write_dquot(struct dquot *dquot)
+{
+ int ret = dquot->dq_sb->dq_op->write_dquot(dquot);
+ if (ret < 0) {
+ quota_error(dquot->dq_sb, "Can't write quota structure "
+ "(error %d). Quota may get out of sync!", ret);
+ /* Clear dirty bit anyway to avoid infinite loop. */
+ clear_dquot_dirty(dquot);
+ }
+ return ret;
+}
+
/* Write all dquot structures to quota files */
int dquot_writeback_dquots(struct super_block *sb, int type)
{
@@ -651,23 +700,23 @@ int dquot_writeback_dquots(struct super_block *sb, int type)
dquot = list_first_entry(&dirty, struct dquot,
dq_dirty);
- WARN_ON(!test_bit(DQ_ACTIVE_B, &dquot->dq_flags));
+ WARN_ON(!dquot_active(dquot));
+ /* If the dquot is releasing we should not touch it */
+ if (test_bit(DQ_RELEASING_B, &dquot->dq_flags)) {
+ spin_unlock(&dq_list_lock);
+ flush_delayed_work(&quota_release_work);
+ spin_lock(&dq_list_lock);
+ continue;
+ }
/* Now we have active dquot from which someone is
* holding reference so we can safely just increase
* use count */
dqgrab(dquot);
spin_unlock(&dq_list_lock);
- err = sb->dq_op->write_dquot(dquot);
- if (err) {
- /*
- * Clear dirty bit anyway to avoid infinite
- * loop here.
- */
- clear_dquot_dirty(dquot);
- if (!ret)
- ret = err;
- }
+ err = dquot_write_dquot(dquot);
+ if (err && !ret)
+ ret = err;
dqput(dquot);
spin_lock(&dq_list_lock);
}
@@ -761,12 +810,52 @@ static struct shrinker dqcache_shrinker = {
};
/*
+ * Safely release dquot and put reference to dquot.
+ */
+static void quota_release_workfn(struct work_struct *work)
+{
+ struct dquot *dquot;
+ struct list_head rls_head;
+
+ spin_lock(&dq_list_lock);
+ /* Exchange the list head to avoid livelock. */
+ list_replace_init(&releasing_dquots, &rls_head);
+ spin_unlock(&dq_list_lock);
+ synchronize_srcu(&dquot_srcu);
+
+restart:
+ spin_lock(&dq_list_lock);
+ while (!list_empty(&rls_head)) {
+ dquot = list_first_entry(&rls_head, struct dquot, dq_free);
+ WARN_ON_ONCE(atomic_read(&dquot->dq_count));
+ /*
+ * Note that DQ_RELEASING_B protects us from racing with
+ * invalidate_dquots() calls so we are safe to work with the
+ * dquot even after we drop dq_list_lock.
+ */
+ if (dquot_dirty(dquot)) {
+ spin_unlock(&dq_list_lock);
+ /* Commit dquot before releasing */
+ dquot_write_dquot(dquot);
+ goto restart;
+ }
+ if (dquot_active(dquot)) {
+ spin_unlock(&dq_list_lock);
+ dquot->dq_sb->dq_op->release_dquot(dquot);
+ goto restart;
+ }
+ /* Dquot is inactive and clean, now move it to free list */
+ remove_free_dquot(dquot);
+ put_dquot_last(dquot);
+ }
+ spin_unlock(&dq_list_lock);
+}
+
+/*
* Put reference to dquot
*/
void dqput(struct dquot *dquot)
{
- int ret;
-
if (!dquot)
return;
#ifdef CONFIG_QUOTA_DEBUG
@@ -778,7 +867,7 @@ void dqput(struct dquot *dquot)
}
#endif
dqstats_inc(DQST_DROPS);
-we_slept:
+
spin_lock(&dq_list_lock);
if (atomic_read(&dquot->dq_count) > 1) {
/* We have more than one user... nothing to do */
@@ -790,35 +879,16 @@ we_slept:
spin_unlock(&dq_list_lock);
return;
}
+
/* Need to release dquot? */
- if (dquot_dirty(dquot)) {
- spin_unlock(&dq_list_lock);
- /* Commit dquot before releasing */
- ret = dquot->dq_sb->dq_op->write_dquot(dquot);
- if (ret < 0) {
- quota_error(dquot->dq_sb, "Can't write quota structure"
- " (error %d). Quota may get out of sync!",
- ret);
- /*
- * We clear dirty bit anyway, so that we avoid
- * infinite loop here
- */
- clear_dquot_dirty(dquot);
- }
- goto we_slept;
- }
- if (test_bit(DQ_ACTIVE_B, &dquot->dq_flags)) {
- spin_unlock(&dq_list_lock);
- dquot->dq_sb->dq_op->release_dquot(dquot);
- goto we_slept;
- }
- atomic_dec(&dquot->dq_count);
#ifdef CONFIG_QUOTA_DEBUG
/* sanity check */
BUG_ON(!list_empty(&dquot->dq_free));
#endif
- put_dquot_last(dquot);
+ put_releasing_dquots(dquot);
+ atomic_dec(&dquot->dq_count);
spin_unlock(&dq_list_lock);
+ queue_delayed_work(system_unbound_wq, &quota_release_work, 1);
}
EXPORT_SYMBOL(dqput);
@@ -905,10 +975,10 @@ we_slept:
dqstats_inc(DQST_LOOKUPS);
}
/* Wait for dq_lock - after this we know that either dquot_release() is
- * already finished or it will be canceled due to dq_count > 1 test */
+ * already finished or it will be canceled due to dq_count > 0 test */
wait_on_dquot(dquot);
/* Read the dquot / allocate space in quota file */
- if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags)) {
+ if (!dquot_active(dquot)) {
int err;
err = sb->dq_op->acquire_dquot(dquot);
@@ -1014,59 +1084,7 @@ out:
return err;
}
-/*
- * Remove references to dquots from inode and add dquot to list for freeing
- * if we have the last reference to dquot
- */
-static void remove_inode_dquot_ref(struct inode *inode, int type,
- struct list_head *tofree_head)
-{
- struct dquot **dquots = i_dquot(inode);
- struct dquot *dquot = dquots[type];
-
- if (!dquot)
- return;
-
- dquots[type] = NULL;
- if (list_empty(&dquot->dq_free)) {
- /*
- * The inode still has reference to dquot so it can't be in the
- * free list
- */
- spin_lock(&dq_list_lock);
- list_add(&dquot->dq_free, tofree_head);
- spin_unlock(&dq_list_lock);
- } else {
- /*
- * Dquot is already in a list to put so we won't drop the last
- * reference here.
- */
- dqput(dquot);
- }
-}
-
-/*
- * Free list of dquots
- * Dquots are removed from inodes and no new references can be got so we are
- * the only ones holding reference
- */
-static void put_dquot_list(struct list_head *tofree_head)
-{
- struct list_head *act_head;
- struct dquot *dquot;
-
- act_head = tofree_head->next;
- while (act_head != tofree_head) {
- dquot = list_entry(act_head, struct dquot, dq_free);
- act_head = act_head->next;
- /* Remove dquot from the list so we won't have problems... */
- list_del_init(&dquot->dq_free);
- dqput(dquot);
- }
-}
-
-static void remove_dquot_ref(struct super_block *sb, int type,
- struct list_head *tofree_head)
+static void remove_dquot_ref(struct super_block *sb, int type)
{
struct inode *inode;
#ifdef CONFIG_QUOTA_DEBUG
@@ -1083,11 +1101,16 @@ static void remove_dquot_ref(struct super_block *sb, int type,
*/
spin_lock(&dq_data_lock);
if (!IS_NOQUOTA(inode)) {
+ struct dquot **dquots = i_dquot(inode);
+ struct dquot *dquot = dquots[type];
+
#ifdef CONFIG_QUOTA_DEBUG
if (unlikely(inode_get_rsv_space(inode) > 0))
reserved = 1;
#endif
- remove_inode_dquot_ref(inode, type, tofree_head);
+ dquots[type] = NULL;
+ if (dquot)
+ dqput(dquot);
}
spin_unlock(&dq_data_lock);
}
@@ -1104,13 +1127,8 @@ static void remove_dquot_ref(struct super_block *sb, int type,
/* Gather all references from inodes and drop them */
static void drop_dquot_ref(struct super_block *sb, int type)
{
- LIST_HEAD(tofree_head);
-
- if (sb->dq_op) {
- remove_dquot_ref(sb, type, &tofree_head);
- synchronize_srcu(&dquot_srcu);
- put_dquot_list(&tofree_head);
- }
+ if (sb->dq_op)
+ remove_dquot_ref(sb, type);
}
static inline
@@ -1425,7 +1443,7 @@ static int info_bdq_free(struct dquot *dquot, qsize_t space)
return QUOTA_NL_NOWARN;
}
-static int dquot_active(const struct inode *inode)
+static int inode_quota_active(const struct inode *inode)
{
struct super_block *sb = inode->i_sb;
@@ -1448,7 +1466,7 @@ static int __dquot_initialize(struct inode *inode, int type)
qsize_t rsv;
int ret = 0;
- if (!dquot_active(inode))
+ if (!inode_quota_active(inode))
return 0;
dquots = i_dquot(inode);
@@ -1556,7 +1574,7 @@ bool dquot_initialize_needed(struct inode *inode)
struct dquot **dquots;
int i;
- if (!dquot_active(inode))
+ if (!inode_quota_active(inode))
return false;
dquots = i_dquot(inode);
@@ -1667,7 +1685,7 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags)
int reserve = flags & DQUOT_SPACE_RESERVE;
struct dquot **dquots;
- if (!dquot_active(inode)) {
+ if (!inode_quota_active(inode)) {
if (reserve) {
spin_lock(&inode->i_lock);
*inode_reserved_space(inode) += number;
@@ -1737,7 +1755,7 @@ int dquot_alloc_inode(struct inode *inode)
struct dquot_warn warn[MAXQUOTAS];
struct dquot * const *dquots;
- if (!dquot_active(inode))
+ if (!inode_quota_active(inode))
return 0;
for (cnt = 0; cnt < MAXQUOTAS; cnt++)
warn[cnt].w_type = QUOTA_NL_NOWARN;
@@ -1780,7 +1798,7 @@ int dquot_claim_space_nodirty(struct inode *inode, qsize_t number)
struct dquot **dquots;
int cnt, index;
- if (!dquot_active(inode)) {
+ if (!inode_quota_active(inode)) {
spin_lock(&inode->i_lock);
*inode_reserved_space(inode) -= number;
__inode_add_bytes(inode, number);
@@ -1822,7 +1840,7 @@ void dquot_reclaim_space_nodirty(struct inode *inode, qsize_t number)
struct dquot **dquots;
int cnt, index;
- if (!dquot_active(inode)) {
+ if (!inode_quota_active(inode)) {
spin_lock(&inode->i_lock);
*inode_reserved_space(inode) += number;
__inode_sub_bytes(inode, number);
@@ -1866,7 +1884,7 @@ void __dquot_free_space(struct inode *inode, qsize_t number, int flags)
struct dquot **dquots;
int reserve = flags & DQUOT_SPACE_RESERVE, index;
- if (!dquot_active(inode)) {
+ if (!inode_quota_active(inode)) {
if (reserve) {
spin_lock(&inode->i_lock);
*inode_reserved_space(inode) -= number;
@@ -1921,7 +1939,7 @@ void dquot_free_inode(struct inode *inode)
struct dquot * const *dquots;
int index;
- if (!dquot_active(inode))
+ if (!inode_quota_active(inode))
return;
dquots = i_dquot(inode);
@@ -2093,7 +2111,7 @@ int dquot_transfer(struct mnt_idmap *idmap, struct inode *inode,
struct super_block *sb = inode->i_sb;
int ret;
- if (!dquot_active(inode))
+ if (!inode_quota_active(inode))
return 0;
if (i_uid_needs_update(idmap, iattr, inode)) {
@@ -2359,15 +2377,14 @@ int dquot_load_quota_sb(struct super_block *sb, int type, int format_id,
struct quota_info *dqopt = sb_dqopt(sb);
int error;
+ lockdep_assert_held_write(&sb->s_umount);
+
/* Just unsuspend quotas? */
BUG_ON(flags & DQUOT_SUSPENDED);
- /* s_umount should be held in exclusive mode */
- if (WARN_ON_ONCE(down_read_trylock(&sb->s_umount)))
- up_read(&sb->s_umount);
if (!fmt)
return -ESRCH;
- if (!sb->s_op->quota_write || !sb->s_op->quota_read ||
+ if (!sb->dq_op || !sb->s_qcop ||
(type == PRJQUOTA && sb->dq_op->get_projid == NULL)) {
error = -EINVAL;
goto out_fmt;
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index fef477c78107..18e8387cab41 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -65,7 +65,7 @@ struct inode *ramfs_get_inode(struct super_block *sb,
inode->i_mapping->a_ops = &ram_aops;
mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
mapping_set_unevictable(inode->i_mapping);
- inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
switch (mode & S_IFMT) {
default:
init_special_inode(inode, mode, dev);
@@ -105,7 +105,7 @@ ramfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
d_instantiate(dentry, inode);
dget(dentry); /* Extra count - pin the dentry in core */
error = 0;
- dir->i_mtime = dir->i_ctime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
}
return error;
}
@@ -138,7 +138,7 @@ static int ramfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
if (!error) {
d_instantiate(dentry, inode);
dget(dentry);
- dir->i_mtime = dir->i_ctime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
} else
iput(inode);
}
diff --git a/fs/read_write.c b/fs/read_write.c
index b07de77ef126..4771701c896b 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -71,7 +71,7 @@ EXPORT_SYMBOL(vfs_setpos);
* @file: file structure to seek on
* @offset: file offset to seek to
* @whence: type of seek
- * @size: max size of this file in file system
+ * @maxsize: max size of this file in file system
* @eof: offset used for SEEK_END position
*
* This is a variant of generic_file_llseek that allows passing in a custom
diff --git a/fs/reiserfs/Kconfig b/fs/reiserfs/Kconfig
index 4d22ecfe0fab..0e6fe26458fe 100644
--- a/fs/reiserfs/Kconfig
+++ b/fs/reiserfs/Kconfig
@@ -1,6 +1,7 @@
# SPDX-License-Identifier: GPL-2.0-only
config REISERFS_FS
tristate "Reiserfs support (deprecated)"
+ select BUFFER_HEAD
select CRC32
select LEGACY_DIRECT_IO
help
diff --git a/fs/reiserfs/fix_node.c b/fs/reiserfs/fix_node.c
index fefe87e1c099..6c13a8d9a73c 100644
--- a/fs/reiserfs/fix_node.c
+++ b/fs/reiserfs/fix_node.c
@@ -2252,8 +2252,9 @@ static int get_virtual_node_size(struct super_block *sb, struct buffer_head *bh)
return sizeof(struct virtual_node) +
max(max_num_of_items * sizeof(struct virtual_item),
- sizeof(struct virtual_item) + sizeof(struct direntry_uarea) +
- (max_num_of_entries - 1) * sizeof(__u16));
+ sizeof(struct virtual_item) +
+ struct_size_t(struct direntry_uarea, entry_sizes,
+ max_num_of_entries));
}
/*
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 77bd3b27059f..86e55d4bb10d 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -1259,9 +1259,8 @@ static void init_inode(struct inode *inode, struct treepath *path)
inode->i_size = sd_v1_size(sd);
inode->i_atime.tv_sec = sd_v1_atime(sd);
inode->i_mtime.tv_sec = sd_v1_mtime(sd);
- inode->i_ctime.tv_sec = sd_v1_ctime(sd);
+ inode_set_ctime(inode, sd_v1_ctime(sd), 0);
inode->i_atime.tv_nsec = 0;
- inode->i_ctime.tv_nsec = 0;
inode->i_mtime.tv_nsec = 0;
inode->i_blocks = sd_v1_blocks(sd);
@@ -1314,8 +1313,7 @@ static void init_inode(struct inode *inode, struct treepath *path)
i_gid_write(inode, sd_v2_gid(sd));
inode->i_mtime.tv_sec = sd_v2_mtime(sd);
inode->i_atime.tv_sec = sd_v2_atime(sd);
- inode->i_ctime.tv_sec = sd_v2_ctime(sd);
- inode->i_ctime.tv_nsec = 0;
+ inode_set_ctime(inode, sd_v2_ctime(sd), 0);
inode->i_mtime.tv_nsec = 0;
inode->i_atime.tv_nsec = 0;
inode->i_blocks = sd_v2_blocks(sd);
@@ -1374,7 +1372,7 @@ static void inode2sd(void *sd, struct inode *inode, loff_t size)
set_sd_v2_gid(sd_v2, i_gid_read(inode));
set_sd_v2_mtime(sd_v2, inode->i_mtime.tv_sec);
set_sd_v2_atime(sd_v2, inode->i_atime.tv_sec);
- set_sd_v2_ctime(sd_v2, inode->i_ctime.tv_sec);
+ set_sd_v2_ctime(sd_v2, inode_get_ctime(inode).tv_sec);
set_sd_v2_blocks(sd_v2, to_fake_used_blocks(inode, SD_V2_SIZE));
if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
set_sd_v2_rdev(sd_v2, new_encode_dev(inode->i_rdev));
@@ -1394,7 +1392,7 @@ static void inode2sd_v1(void *sd, struct inode *inode, loff_t size)
set_sd_v1_nlink(sd_v1, inode->i_nlink);
set_sd_v1_size(sd_v1, size);
set_sd_v1_atime(sd_v1, inode->i_atime.tv_sec);
- set_sd_v1_ctime(sd_v1, inode->i_ctime.tv_sec);
+ set_sd_v1_ctime(sd_v1, inode_get_ctime(inode).tv_sec);
set_sd_v1_mtime(sd_v1, inode->i_mtime.tv_sec);
if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
@@ -1986,7 +1984,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
/* uid and gid must already be set by the caller for quota init */
- inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
inode->i_size = i_size;
inode->i_blocks = 0;
inode->i_bytes = 0;
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
index 6bf9b54e58ca..dd33f8cc6eda 100644
--- a/fs/reiserfs/ioctl.c
+++ b/fs/reiserfs/ioctl.c
@@ -55,7 +55,7 @@ int reiserfs_fileattr_set(struct mnt_idmap *idmap,
}
sd_attrs_to_i_attrs(flags, inode);
REISERFS_I(inode)->i_attrs = flags;
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
mark_inode_dirty(inode);
err = 0;
unlock:
@@ -107,7 +107,7 @@ long reiserfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
err = -EFAULT;
goto setversion_out;
}
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
mark_inode_dirty(inode);
setversion_out:
mnt_drop_write_file(filp);
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 479aa4a57602..015bfe4e4524 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -2326,7 +2326,7 @@ static struct buffer_head *reiserfs_breada(struct block_device *dev,
int i, j;
bh = __getblk(dev, block, bufsize);
- if (buffer_uptodate(bh))
+ if (!bh || buffer_uptodate(bh))
return (bh);
if (block + BUFNR > max_block) {
@@ -2336,6 +2336,8 @@ static struct buffer_head *reiserfs_breada(struct block_device *dev,
j = 1;
for (i = 1; i < blocks; i++) {
bh = __getblk(dev, block + i, bufsize);
+ if (!bh)
+ break;
if (buffer_uptodate(bh)) {
brelse(bh);
break;
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 52240cc891cf..9c5704be2435 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -572,7 +572,7 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th,
}
dir->i_size += paste_size;
- dir->i_mtime = dir->i_ctime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
if (!S_ISDIR(inode->i_mode) && visible)
/* reiserfs_mkdir or reiserfs_rename will do that by itself */
reiserfs_update_sd(th, dir);
@@ -966,7 +966,8 @@ static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry)
inode->i_nlink);
clear_nlink(inode);
- inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_to_ts(dir,
+ inode_set_ctime_current(inode));
reiserfs_update_sd(&th, inode);
DEC_DIR_INODE_NLINK(dir)
@@ -1070,11 +1071,11 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry)
inc_nlink(inode);
goto end_unlink;
}
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
reiserfs_update_sd(&th, inode);
dir->i_size -= (de.de_entrylen + DEH_SIZE);
- dir->i_ctime = dir->i_mtime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
reiserfs_update_sd(&th, dir);
if (!savelink)
@@ -1250,7 +1251,7 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir,
return err ? err : retval;
}
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
reiserfs_update_sd(&th, inode);
ihold(inode);
@@ -1325,7 +1326,6 @@ static int reiserfs_rename(struct mnt_idmap *idmap,
int jbegin_count;
umode_t old_inode_mode;
unsigned long savelink = 1;
- struct timespec64 ctime;
if (flags & ~RENAME_NOREPLACE)
return -EINVAL;
@@ -1576,14 +1576,11 @@ static int reiserfs_rename(struct mnt_idmap *idmap,
mark_de_hidden(old_de.de_deh + old_de.de_entry_num);
journal_mark_dirty(&th, old_de.de_bh);
- ctime = current_time(old_dir);
- old_dir->i_ctime = old_dir->i_mtime = ctime;
- new_dir->i_ctime = new_dir->i_mtime = ctime;
/*
* thanks to Alex Adriaanse <alex_a@caltech.edu> for patch
* which adds ctime update of renamed object
*/
- old_inode->i_ctime = ctime;
+ simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
if (new_dentry_inode) {
/* adjust link number of the victim */
@@ -1592,7 +1589,6 @@ static int reiserfs_rename(struct mnt_idmap *idmap,
} else {
drop_nlink(new_dentry_inode);
}
- new_dentry_inode->i_ctime = ctime;
savelink = new_dentry_inode->i_nlink;
}
diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h
index 55e85256aae8..7d12b8c5b2fa 100644
--- a/fs/reiserfs/reiserfs.h
+++ b/fs/reiserfs/reiserfs.h
@@ -2373,7 +2373,7 @@ struct virtual_node {
struct direntry_uarea {
int flags;
__u16 entry_count;
- __u16 entry_sizes[1];
+ __u16 entry_sizes[];
} __attribute__ ((__packed__));
/***************************************************************************
@@ -2699,7 +2699,7 @@ struct reiserfs_iget_args {
#define get_journal_desc_magic(bh) (bh->b_data + bh->b_size - 12)
#define journal_trans_half(blocksize) \
- ((blocksize - sizeof (struct reiserfs_journal_desc) + sizeof (__u32) - 12) / sizeof (__u32))
+ ((blocksize - sizeof(struct reiserfs_journal_desc) - 12) / sizeof(__u32))
/* journal.c see journal.c for all the comments here */
@@ -2711,7 +2711,7 @@ struct reiserfs_journal_desc {
__le32 j_len;
__le32 j_mount_id; /* mount id of this trans */
- __le32 j_realblock[1]; /* real locations for each block */
+ __le32 j_realblock[]; /* real locations for each block */
};
#define get_desc_trans_id(d) le32_to_cpu((d)->j_trans_id)
@@ -2726,7 +2726,7 @@ struct reiserfs_journal_desc {
struct reiserfs_journal_commit {
__le32 j_trans_id; /* must match j_trans_id from the desc block */
__le32 j_len; /* ditto */
- __le32 j_realblock[1]; /* real locations for each block */
+ __le32 j_realblock[]; /* real locations for each block */
};
#define get_commit_trans_id(c) le32_to_cpu((c)->j_trans_id)
diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c
index ce5003986789..3676e02a0232 100644
--- a/fs/reiserfs/stree.c
+++ b/fs/reiserfs/stree.c
@@ -2004,7 +2004,7 @@ int reiserfs_do_truncate(struct reiserfs_transaction_handle *th,
if (update_timestamps) {
inode->i_mtime = current_time(inode);
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
}
reiserfs_update_sd(th, inode);
@@ -2029,7 +2029,7 @@ update_and_out:
if (update_timestamps) {
/* this is truncate, not file closing */
inode->i_mtime = current_time(inode);
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
}
reiserfs_update_sd(th, inode);
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 929acce6e731..7eaf36b3de12 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -2587,7 +2587,7 @@ out:
return err;
if (inode->i_size < off + len - towrite)
i_size_write(inode, off + len - towrite);
- inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
mark_inode_dirty(inode);
return len - towrite;
}
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 651027967159..6000964c2b80 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -466,12 +466,13 @@ int reiserfs_commit_write(struct file *f, struct page *page,
static void update_ctime(struct inode *inode)
{
struct timespec64 now = current_time(inode);
+ struct timespec64 ctime = inode_get_ctime(inode);
if (inode_unhashed(inode) || !inode->i_nlink ||
- timespec64_equal(&inode->i_ctime, &now))
+ timespec64_equal(&ctime, &now))
return;
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_to_ts(inode, now);
mark_inode_dirty(inode);
}
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index 138060452678..064264992b49 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -285,7 +285,7 @@ __reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode,
if (error == -ENODATA) {
error = 0;
if (type == ACL_TYPE_ACCESS) {
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
mark_inode_dirty(inode);
}
}
diff --git a/fs/romfs/Kconfig b/fs/romfs/Kconfig
index 8eb87008b55a..f24a96a331af 100644
--- a/fs/romfs/Kconfig
+++ b/fs/romfs/Kconfig
@@ -57,6 +57,7 @@ endchoice
config ROMFS_ON_BLOCK
bool
default y if ROMFS_BACKED_BY_BLOCK || ROMFS_BACKED_BY_BOTH
+ select BUFFER_HEAD
config ROMFS_ON_MTD
bool
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index c59b230d55b4..5c35f6c76037 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -322,8 +322,7 @@ static struct inode *romfs_iget(struct super_block *sb, unsigned long pos)
set_nlink(i, 1); /* Hard to decide.. */
i->i_size = be32_to_cpu(ri.size);
- i->i_mtime.tv_sec = i->i_atime.tv_sec = i->i_ctime.tv_sec = 0;
- i->i_mtime.tv_nsec = i->i_atime.tv_nsec = i->i_ctime.tv_nsec = 0;
+ i->i_mtime = i->i_atime = inode_set_ctime(i, 0, 0);
/* set up mode and ops */
mode = romfs_modemap[nextfh & ROMFH_TYPE];
@@ -583,16 +582,18 @@ static int romfs_init_fs_context(struct fs_context *fc)
*/
static void romfs_kill_sb(struct super_block *sb)
{
+ generic_shutdown_super(sb);
+
#ifdef CONFIG_ROMFS_ON_MTD
if (sb->s_mtd) {
- kill_mtd_super(sb);
- return;
+ put_mtd_device(sb->s_mtd);
+ sb->s_mtd = NULL;
}
#endif
#ifdef CONFIG_ROMFS_ON_BLOCK
if (sb->s_bdev) {
- kill_block_super(sb);
- return;
+ sync_blockdev(sb->s_bdev);
+ blkdev_put(sb->s_bdev, sb);
}
#endif
}
diff --git a/fs/smb/client/Kconfig b/fs/smb/client/Kconfig
index 4c0d53bf931a..2927bd174a88 100644
--- a/fs/smb/client/Kconfig
+++ b/fs/smb/client/Kconfig
@@ -3,6 +3,7 @@ config CIFS
tristate "SMB3 and CIFS support (advanced network filesystem)"
depends on INET
select NLS
+ select NLS_UCS2_UTILS
select CRYPTO
select CRYPTO_MD5
select CRYPTO_SHA256
diff --git a/fs/smb/client/Makefile b/fs/smb/client/Makefile
index 304a7f6cc13a..0b07eb94c93b 100644
--- a/fs/smb/client/Makefile
+++ b/fs/smb/client/Makefile
@@ -11,7 +11,8 @@ cifs-y := trace.o cifsfs.o cifs_debug.o connect.o dir.o file.o \
readdir.o ioctl.o sess.o export.o unc.o winucase.o \
smb2ops.o smb2maperror.o smb2transport.o \
smb2misc.o smb2pdu.o smb2inode.o smb2file.o cifsacl.o fs_context.o \
- dns_resolve.o cifs_spnego_negtokeninit.asn1.o asn1.o
+ dns_resolve.o cifs_spnego_negtokeninit.asn1.o asn1.o \
+ namespace.o
$(obj)/asn1.o: $(obj)/cifs_spnego_negtokeninit.asn1.h
@@ -21,7 +22,7 @@ cifs-$(CONFIG_CIFS_XATTR) += xattr.o
cifs-$(CONFIG_CIFS_UPCALL) += cifs_spnego.o
-cifs-$(CONFIG_CIFS_DFS_UPCALL) += cifs_dfs_ref.o dfs_cache.o dfs.o
+cifs-$(CONFIG_CIFS_DFS_UPCALL) += dfs_cache.o dfs.o
cifs-$(CONFIG_CIFS_SWN_UPCALL) += netlink.o cifs_swn.o
diff --git a/fs/smb/client/cached_dir.c b/fs/smb/client/cached_dir.c
index fe483f163dbc..fe1bf5b6e0cb 100644
--- a/fs/smb/client/cached_dir.c
+++ b/fs/smb/client/cached_dir.c
@@ -15,10 +15,12 @@
static struct cached_fid *init_cached_dir(const char *path);
static void free_cached_dir(struct cached_fid *cfid);
static void smb2_close_cached_fid(struct kref *ref);
+static void cfids_laundromat_worker(struct work_struct *work);
static struct cached_fid *find_or_create_cached_dir(struct cached_fids *cfids,
const char *path,
- bool lookup_only)
+ bool lookup_only,
+ __u32 max_cached_dirs)
{
struct cached_fid *cfid;
@@ -43,7 +45,7 @@ static struct cached_fid *find_or_create_cached_dir(struct cached_fids *cfids,
spin_unlock(&cfids->cfid_list_lock);
return NULL;
}
- if (cfids->num_entries >= MAX_CACHED_FIDS) {
+ if (cfids->num_entries >= max_cached_dirs) {
spin_unlock(&cfids->cfid_list_lock);
return NULL;
}
@@ -145,7 +147,7 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon,
const char *npath;
if (tcon == NULL || tcon->cfids == NULL || tcon->nohandlecache ||
- is_smb1_server(tcon->ses->server))
+ is_smb1_server(tcon->ses->server) || (dir_cache_timeout == 0))
return -EOPNOTSUPP;
ses = tcon->ses;
@@ -162,21 +164,24 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon,
if (!utf16_path)
return -ENOMEM;
- cfid = find_or_create_cached_dir(cfids, path, lookup_only);
+ cfid = find_or_create_cached_dir(cfids, path, lookup_only, tcon->max_cached_dirs);
if (cfid == NULL) {
kfree(utf16_path);
return -ENOENT;
}
/*
- * At this point we either have a lease already and we can just
- * return it. If not we are guaranteed to be the only thread accessing
- * this cfid.
+ * Return cached fid if it has a lease. Otherwise, it is either a new
+ * entry or laundromat worker removed it from @cfids->entries. Caller
+ * will put last reference if the latter.
*/
+ spin_lock(&cfids->cfid_list_lock);
if (cfid->has_lease) {
+ spin_unlock(&cfids->cfid_list_lock);
*ret_cfid = cfid;
kfree(utf16_path);
return 0;
}
+ spin_unlock(&cfids->cfid_list_lock);
/*
* Skip any prefix paths in @path as lookup_positive_unlocked() ends up
@@ -218,7 +223,7 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon,
.tcon = tcon,
.path = path,
.create_options = cifs_create_options(cifs_sb, CREATE_NOT_FILE),
- .desired_access = FILE_READ_ATTRIBUTES,
+ .desired_access = FILE_READ_DATA | FILE_READ_ATTRIBUTES,
.disposition = FILE_OPEN,
.fid = pfid,
};
@@ -293,9 +298,11 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon,
goto oshr_free;
}
}
+ spin_lock(&cfids->cfid_list_lock);
cfid->dentry = dentry;
cfid->time = jiffies;
cfid->has_lease = true;
+ spin_unlock(&cfids->cfid_list_lock);
oshr_free:
kfree(utf16_path);
@@ -304,24 +311,28 @@ oshr_free:
free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base);
free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base);
spin_lock(&cfids->cfid_list_lock);
- if (rc && !cfid->has_lease) {
- if (cfid->on_list) {
- list_del(&cfid->entry);
- cfid->on_list = false;
- cfids->num_entries--;
+ if (!cfid->has_lease) {
+ if (rc) {
+ if (cfid->on_list) {
+ list_del(&cfid->entry);
+ cfid->on_list = false;
+ cfids->num_entries--;
+ }
+ rc = -ENOENT;
+ } else {
+ /*
+ * We are guaranteed to have two references at this
+ * point. One for the caller and one for a potential
+ * lease. Release the Lease-ref so that the directory
+ * will be closed when the caller closes the cached
+ * handle.
+ */
+ spin_unlock(&cfids->cfid_list_lock);
+ kref_put(&cfid->refcount, smb2_close_cached_fid);
+ goto out;
}
- rc = -ENOENT;
}
spin_unlock(&cfids->cfid_list_lock);
- if (!rc && !cfid->has_lease) {
- /*
- * We are guaranteed to have two references at this point.
- * One for the caller and one for a potential lease.
- * Release the Lease-ref so that the directory will be closed
- * when the caller closes the cached handle.
- */
- kref_put(&cfid->refcount, smb2_close_cached_fid);
- }
if (rc) {
if (cfid->is_open)
SMB2_close(0, cfid->tcon, cfid->fid.persistent_fid,
@@ -329,7 +340,7 @@ oshr_free:
free_cached_dir(cfid);
cfid = NULL;
}
-
+out:
if (rc == 0) {
*ret_cfid = cfid;
atomic_inc(&tcon->num_remote_opens);
@@ -451,6 +462,9 @@ void invalidate_all_cached_dirs(struct cifs_tcon *tcon)
struct cached_fid *cfid, *q;
LIST_HEAD(entry);
+ if (cfids == NULL)
+ return;
+
spin_lock(&cfids->cfid_list_lock);
list_for_each_entry_safe(cfid, q, &cfids->entries, entry) {
list_move(&cfid->entry, &entry);
@@ -568,53 +582,51 @@ static void free_cached_dir(struct cached_fid *cfid)
kfree(cfid);
}
-static int
-cifs_cfids_laundromat_thread(void *p)
+static void cfids_laundromat_worker(struct work_struct *work)
{
- struct cached_fids *cfids = p;
+ struct cached_fids *cfids;
struct cached_fid *cfid, *q;
- struct list_head entry;
+ LIST_HEAD(entry);
- while (!kthread_should_stop()) {
- ssleep(1);
- INIT_LIST_HEAD(&entry);
- if (kthread_should_stop())
- return 0;
- spin_lock(&cfids->cfid_list_lock);
- list_for_each_entry_safe(cfid, q, &cfids->entries, entry) {
- if (time_after(jiffies, cfid->time + HZ * 30)) {
- list_del(&cfid->entry);
- list_add(&cfid->entry, &entry);
- cfids->num_entries--;
- }
- }
- spin_unlock(&cfids->cfid_list_lock);
+ cfids = container_of(work, struct cached_fids, laundromat_work.work);
- list_for_each_entry_safe(cfid, q, &entry, entry) {
+ spin_lock(&cfids->cfid_list_lock);
+ list_for_each_entry_safe(cfid, q, &cfids->entries, entry) {
+ if (cfid->time &&
+ time_after(jiffies, cfid->time + HZ * dir_cache_timeout)) {
cfid->on_list = false;
- list_del(&cfid->entry);
+ list_move(&cfid->entry, &entry);
+ cfids->num_entries--;
+ /* To prevent race with smb2_cached_lease_break() */
+ kref_get(&cfid->refcount);
+ }
+ }
+ spin_unlock(&cfids->cfid_list_lock);
+
+ list_for_each_entry_safe(cfid, q, &entry, entry) {
+ list_del(&cfid->entry);
+ /*
+ * Cancel and wait for the work to finish in case we are racing
+ * with it.
+ */
+ cancel_work_sync(&cfid->lease_break);
+ if (cfid->has_lease) {
/*
- * Cancel, and wait for the work to finish in
- * case we are racing with it.
+ * Our lease has not yet been cancelled from the server
+ * so we need to drop the reference.
*/
- cancel_work_sync(&cfid->lease_break);
- if (cfid->has_lease) {
- /*
- * We lease has not yet been cancelled from
- * the server so we need to drop the reference.
- */
- spin_lock(&cfids->cfid_list_lock);
- cfid->has_lease = false;
- spin_unlock(&cfids->cfid_list_lock);
- kref_put(&cfid->refcount, smb2_close_cached_fid);
- }
+ spin_lock(&cfids->cfid_list_lock);
+ cfid->has_lease = false;
+ spin_unlock(&cfids->cfid_list_lock);
+ kref_put(&cfid->refcount, smb2_close_cached_fid);
}
+ /* Drop the extra reference opened above */
+ kref_put(&cfid->refcount, smb2_close_cached_fid);
}
-
- return 0;
+ queue_delayed_work(cifsiod_wq, &cfids->laundromat_work,
+ dir_cache_timeout * HZ);
}
-
struct cached_fids *init_cached_dirs(void)
{
struct cached_fids *cfids;
@@ -625,19 +637,10 @@ struct cached_fids *init_cached_dirs(void)
spin_lock_init(&cfids->cfid_list_lock);
INIT_LIST_HEAD(&cfids->entries);
- /*
- * since we're in a cifs function already, we know that
- * this will succeed. No need for try_module_get().
- */
- __module_get(THIS_MODULE);
- cfids->laundromat = kthread_run(cifs_cfids_laundromat_thread,
- cfids, "cifsd-cfid-laundromat");
- if (IS_ERR(cfids->laundromat)) {
- cifs_dbg(VFS, "Failed to start cfids laundromat thread.\n");
- kfree(cfids);
- module_put(THIS_MODULE);
- return NULL;
- }
+ INIT_DELAYED_WORK(&cfids->laundromat_work, cfids_laundromat_worker);
+ queue_delayed_work(cifsiod_wq, &cfids->laundromat_work,
+ dir_cache_timeout * HZ);
+
return cfids;
}
@@ -650,11 +653,10 @@ void free_cached_dirs(struct cached_fids *cfids)
struct cached_fid *cfid, *q;
LIST_HEAD(entry);
- if (cfids->laundromat) {
- kthread_stop(cfids->laundromat);
- cfids->laundromat = NULL;
- module_put(THIS_MODULE);
- }
+ if (cfids == NULL)
+ return;
+
+ cancel_delayed_work_sync(&cfids->laundromat_work);
spin_lock(&cfids->cfid_list_lock);
list_for_each_entry_safe(cfid, q, &cfids->entries, entry) {
diff --git a/fs/smb/client/cached_dir.h b/fs/smb/client/cached_dir.h
index facc9b154d00..81ba0fd5cc16 100644
--- a/fs/smb/client/cached_dir.h
+++ b/fs/smb/client/cached_dir.h
@@ -49,7 +49,7 @@ struct cached_fid {
struct cached_dirents dirents;
};
-#define MAX_CACHED_FIDS 16
+/* default MAX_CACHED_FIDS is 16 */
struct cached_fids {
/* Must be held when:
* - accessing the cfids->entries list
@@ -57,7 +57,7 @@ struct cached_fids {
spinlock_t cfid_list_lock;
int num_entries;
struct list_head entries;
- struct task_struct *laundromat;
+ struct delayed_work laundromat_work;
};
extern struct cached_fids *init_cached_dirs(void);
diff --git a/fs/smb/client/cifs_debug.c b/fs/smb/client/cifs_debug.c
index aec6e9137474..76922fcc4bc6 100644
--- a/fs/smb/client/cifs_debug.c
+++ b/fs/smb/client/cifs_debug.c
@@ -331,7 +331,7 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
spin_lock(&cifs_tcp_ses_lock);
list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) {
/* channel info will be printed as a part of sessions below */
- if (CIFS_SERVER_IS_CHAN(server))
+ if (SERVER_IS_CHAN(server))
continue;
c++;
diff --git a/fs/smb/client/cifs_unicode.c b/fs/smb/client/cifs_unicode.c
index e7582dd79179..79d99a913944 100644
--- a/fs/smb/client/cifs_unicode.c
+++ b/fs/smb/client/cifs_unicode.c
@@ -8,7 +8,6 @@
#include <linux/slab.h>
#include "cifs_fs_sb.h"
#include "cifs_unicode.h"
-#include "cifs_uniupr.h"
#include "cifspdu.h"
#include "cifsglob.h"
#include "cifs_debug.h"
diff --git a/fs/smb/client/cifs_unicode.h b/fs/smb/client/cifs_unicode.h
index 80b3d845419f..e137a0dfbbe9 100644
--- a/fs/smb/client/cifs_unicode.h
+++ b/fs/smb/client/cifs_unicode.h
@@ -21,21 +21,7 @@
#include <asm/byteorder.h>
#include <linux/types.h>
#include <linux/nls.h>
-
-#define UNIUPR_NOLOWER /* Example to not expand lower case tables */
-
-/*
- * Windows maps these to the user defined 16 bit Unicode range since they are
- * reserved symbols (along with \ and /), otherwise illegal to store
- * in filenames in NTFS
- */
-#define UNI_ASTERISK (__u16) ('*' + 0xF000)
-#define UNI_QUESTION (__u16) ('?' + 0xF000)
-#define UNI_COLON (__u16) (':' + 0xF000)
-#define UNI_GRTRTHAN (__u16) ('>' + 0xF000)
-#define UNI_LESSTHAN (__u16) ('<' + 0xF000)
-#define UNI_PIPE (__u16) ('|' + 0xF000)
-#define UNI_SLASH (__u16) ('\\' + 0xF000)
+#include "../../nls/nls_ucs2_utils.h"
/*
* Macs use an older "SFM" mapping of the symbols above. Fortunately it does
@@ -68,27 +54,6 @@
#define SFM_MAP_UNI_RSVD 1
#define SFU_MAP_UNI_RSVD 2
-/* Just define what we want from uniupr.h. We don't want to define the tables
- * in each source file.
- */
-#ifndef UNICASERANGE_DEFINED
-struct UniCaseRange {
- wchar_t start;
- wchar_t end;
- signed char *table;
-};
-#endif /* UNICASERANGE_DEFINED */
-
-#ifndef UNIUPR_NOUPPER
-extern signed char CifsUniUpperTable[512];
-extern const struct UniCaseRange CifsUniUpperRange[];
-#endif /* UNIUPR_NOUPPER */
-
-#ifndef UNIUPR_NOLOWER
-extern signed char CifsUniLowerTable[512];
-extern const struct UniCaseRange CifsUniLowerRange[];
-#endif /* UNIUPR_NOLOWER */
-
#ifdef __KERNEL__
int cifs_from_utf16(char *to, const __le16 *from, int tolen, int fromlen,
const struct nls_table *cp, int map_type);
@@ -108,297 +73,4 @@ extern __le16 *cifs_strndup_to_utf16(const char *src, const int maxlen,
wchar_t cifs_toupper(wchar_t in);
-/*
- * UniStrcat: Concatenate the second string to the first
- *
- * Returns:
- * Address of the first string
- */
-static inline __le16 *
-UniStrcat(__le16 *ucs1, const __le16 *ucs2)
-{
- __le16 *anchor = ucs1; /* save a pointer to start of ucs1 */
-
- while (*ucs1++) ; /* To end of first string */
- ucs1--; /* Return to the null */
- while ((*ucs1++ = *ucs2++)) ; /* copy string 2 over */
- return anchor;
-}
-
-/*
- * UniStrchr: Find a character in a string
- *
- * Returns:
- * Address of first occurrence of character in string
- * or NULL if the character is not in the string
- */
-static inline wchar_t *
-UniStrchr(const wchar_t *ucs, wchar_t uc)
-{
- while ((*ucs != uc) && *ucs)
- ucs++;
-
- if (*ucs == uc)
- return (wchar_t *) ucs;
- return NULL;
-}
-
-/*
- * UniStrcmp: Compare two strings
- *
- * Returns:
- * < 0: First string is less than second
- * = 0: Strings are equal
- * > 0: First string is greater than second
- */
-static inline int
-UniStrcmp(const wchar_t *ucs1, const wchar_t *ucs2)
-{
- while ((*ucs1 == *ucs2) && *ucs1) {
- ucs1++;
- ucs2++;
- }
- return (int) *ucs1 - (int) *ucs2;
-}
-
-/*
- * UniStrcpy: Copy a string
- */
-static inline wchar_t *
-UniStrcpy(wchar_t *ucs1, const wchar_t *ucs2)
-{
- wchar_t *anchor = ucs1; /* save the start of result string */
-
- while ((*ucs1++ = *ucs2++)) ;
- return anchor;
-}
-
-/*
- * UniStrlen: Return the length of a string (in 16 bit Unicode chars not bytes)
- */
-static inline size_t
-UniStrlen(const wchar_t *ucs1)
-{
- int i = 0;
-
- while (*ucs1++)
- i++;
- return i;
-}
-
-/*
- * UniStrnlen: Return the length (in 16 bit Unicode chars not bytes) of a
- * string (length limited)
- */
-static inline size_t
-UniStrnlen(const wchar_t *ucs1, int maxlen)
-{
- int i = 0;
-
- while (*ucs1++) {
- i++;
- if (i >= maxlen)
- break;
- }
- return i;
-}
-
-/*
- * UniStrncat: Concatenate length limited string
- */
-static inline wchar_t *
-UniStrncat(wchar_t *ucs1, const wchar_t *ucs2, size_t n)
-{
- wchar_t *anchor = ucs1; /* save pointer to string 1 */
-
- while (*ucs1++) ;
- ucs1--; /* point to null terminator of s1 */
- while (n-- && (*ucs1 = *ucs2)) { /* copy s2 after s1 */
- ucs1++;
- ucs2++;
- }
- *ucs1 = 0; /* Null terminate the result */
- return (anchor);
-}
-
-/*
- * UniStrncmp: Compare length limited string
- */
-static inline int
-UniStrncmp(const wchar_t *ucs1, const wchar_t *ucs2, size_t n)
-{
- if (!n)
- return 0; /* Null strings are equal */
- while ((*ucs1 == *ucs2) && *ucs1 && --n) {
- ucs1++;
- ucs2++;
- }
- return (int) *ucs1 - (int) *ucs2;
-}
-
-/*
- * UniStrncmp_le: Compare length limited string - native to little-endian
- */
-static inline int
-UniStrncmp_le(const wchar_t *ucs1, const wchar_t *ucs2, size_t n)
-{
- if (!n)
- return 0; /* Null strings are equal */
- while ((*ucs1 == __le16_to_cpu(*ucs2)) && *ucs1 && --n) {
- ucs1++;
- ucs2++;
- }
- return (int) *ucs1 - (int) __le16_to_cpu(*ucs2);
-}
-
-/*
- * UniStrncpy: Copy length limited string with pad
- */
-static inline wchar_t *
-UniStrncpy(wchar_t *ucs1, const wchar_t *ucs2, size_t n)
-{
- wchar_t *anchor = ucs1;
-
- while (n-- && *ucs2) /* Copy the strings */
- *ucs1++ = *ucs2++;
-
- n++;
- while (n--) /* Pad with nulls */
- *ucs1++ = 0;
- return anchor;
-}
-
-/*
- * UniStrncpy_le: Copy length limited string with pad to little-endian
- */
-static inline wchar_t *
-UniStrncpy_le(wchar_t *ucs1, const wchar_t *ucs2, size_t n)
-{
- wchar_t *anchor = ucs1;
-
- while (n-- && *ucs2) /* Copy the strings */
- *ucs1++ = __le16_to_cpu(*ucs2++);
-
- n++;
- while (n--) /* Pad with nulls */
- *ucs1++ = 0;
- return anchor;
-}
-
-/*
- * UniStrstr: Find a string in a string
- *
- * Returns:
- * Address of first match found
- * NULL if no matching string is found
- */
-static inline wchar_t *
-UniStrstr(const wchar_t *ucs1, const wchar_t *ucs2)
-{
- const wchar_t *anchor1 = ucs1;
- const wchar_t *anchor2 = ucs2;
-
- while (*ucs1) {
- if (*ucs1 == *ucs2) {
- /* Partial match found */
- ucs1++;
- ucs2++;
- } else {
- if (!*ucs2) /* Match found */
- return (wchar_t *) anchor1;
- ucs1 = ++anchor1; /* No match */
- ucs2 = anchor2;
- }
- }
-
- if (!*ucs2) /* Both end together */
- return (wchar_t *) anchor1; /* Match found */
- return NULL; /* No match */
-}
-
-#ifndef UNIUPR_NOUPPER
-/*
- * UniToupper: Convert a unicode character to upper case
- */
-static inline wchar_t
-UniToupper(register wchar_t uc)
-{
- register const struct UniCaseRange *rp;
-
- if (uc < sizeof(CifsUniUpperTable)) {
- /* Latin characters */
- return uc + CifsUniUpperTable[uc]; /* Use base tables */
- } else {
- rp = CifsUniUpperRange; /* Use range tables */
- while (rp->start) {
- if (uc < rp->start) /* Before start of range */
- return uc; /* Uppercase = input */
- if (uc <= rp->end) /* In range */
- return uc + rp->table[uc - rp->start];
- rp++; /* Try next range */
- }
- }
- return uc; /* Past last range */
-}
-
-/*
- * UniStrupr: Upper case a unicode string
- */
-static inline __le16 *
-UniStrupr(register __le16 *upin)
-{
- register __le16 *up;
-
- up = upin;
- while (*up) { /* For all characters */
- *up = cpu_to_le16(UniToupper(le16_to_cpu(*up)));
- up++;
- }
- return upin; /* Return input pointer */
-}
-#endif /* UNIUPR_NOUPPER */
-
-#ifndef UNIUPR_NOLOWER
-/*
- * UniTolower: Convert a unicode character to lower case
- */
-static inline wchar_t
-UniTolower(register wchar_t uc)
-{
- register const struct UniCaseRange *rp;
-
- if (uc < sizeof(CifsUniLowerTable)) {
- /* Latin characters */
- return uc + CifsUniLowerTable[uc]; /* Use base tables */
- } else {
- rp = CifsUniLowerRange; /* Use range tables */
- while (rp->start) {
- if (uc < rp->start) /* Before start of range */
- return uc; /* Uppercase = input */
- if (uc <= rp->end) /* In range */
- return uc + rp->table[uc - rp->start];
- rp++; /* Try next range */
- }
- }
- return uc; /* Past last range */
-}
-
-/*
- * UniStrlwr: Lower case a unicode string
- */
-static inline wchar_t *
-UniStrlwr(register wchar_t *upin)
-{
- register wchar_t *up;
-
- up = upin;
- while (*up) { /* For all characters */
- *up = UniTolower(*up);
- up++;
- }
- return upin; /* Return input pointer */
-}
-
-#endif
-
#endif /* _CIFS_UNICODE_H */
diff --git a/fs/smb/client/cifs_uniupr.h b/fs/smb/client/cifs_uniupr.h
deleted file mode 100644
index 7b272fcdf0d3..000000000000
--- a/fs/smb/client/cifs_uniupr.h
+++ /dev/null
@@ -1,239 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * Copyright (c) International Business Machines Corp., 2000,2002
- *
- * uniupr.h - Unicode compressed case ranges
-*/
-
-#ifndef UNIUPR_NOUPPER
-/*
- * Latin upper case
- */
-signed char CifsUniUpperTable[512] = {
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 000-00f */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 010-01f */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 020-02f */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 030-03f */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 040-04f */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 050-05f */
- 0, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, /* 060-06f */
- -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, 0, 0, 0, 0, 0, /* 070-07f */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 080-08f */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 090-09f */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0a0-0af */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0b0-0bf */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0c0-0cf */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0d0-0df */
- -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, /* 0e0-0ef */
- -32, -32, -32, -32, -32, -32, -32, 0, -32, -32, -32, -32, -32, -32, -32, 121, /* 0f0-0ff */
- 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 100-10f */
- 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 110-11f */
- 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 120-12f */
- 0, 0, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0, /* 130-13f */
- -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, /* 140-14f */
- 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 150-15f */
- 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 160-16f */
- 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0, /* 170-17f */
- 0, 0, 0, -1, 0, -1, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0, /* 180-18f */
- 0, 0, -1, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, /* 190-19f */
- 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, 0, 0, 0, -1, 0, 0, /* 1a0-1af */
- -1, 0, 0, 0, -1, 0, -1, 0, 0, -1, 0, 0, 0, -1, 0, 0, /* 1b0-1bf */
- 0, 0, 0, 0, 0, -1, -2, 0, -1, -2, 0, -1, -2, 0, -1, 0, /* 1c0-1cf */
- -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, -79, 0, -1, /* 1d0-1df */
- 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e0-1ef */
- 0, 0, -1, -2, 0, -1, 0, 0, 0, -1, 0, -1, 0, -1, 0, -1, /* 1f0-1ff */
-};
-
-/* Upper case range - Greek */
-static signed char UniCaseRangeU03a0[47] = {
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -38, -37, -37, -37, /* 3a0-3af */
- 0, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, /* 3b0-3bf */
- -32, -32, -31, -32, -32, -32, -32, -32, -32, -32, -32, -32, -64,
- -63, -63,
-};
-
-/* Upper case range - Cyrillic */
-static signed char UniCaseRangeU0430[48] = {
- -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, /* 430-43f */
- -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, /* 440-44f */
- 0, -80, -80, -80, -80, -80, -80, -80, -80, -80, -80, -80, -80, 0, -80, -80, /* 450-45f */
-};
-
-/* Upper case range - Extended cyrillic */
-static signed char UniCaseRangeU0490[61] = {
- 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 490-49f */
- 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 4a0-4af */
- 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 4b0-4bf */
- 0, 0, -1, 0, -1, 0, 0, 0, -1, 0, 0, 0, -1,
-};
-
-/* Upper case range - Extended latin and greek */
-static signed char UniCaseRangeU1e00[509] = {
- 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e00-1e0f */
- 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e10-1e1f */
- 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e20-1e2f */
- 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e30-1e3f */
- 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e40-1e4f */
- 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e50-1e5f */
- 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e60-1e6f */
- 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e70-1e7f */
- 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e80-1e8f */
- 0, -1, 0, -1, 0, -1, 0, 0, 0, 0, 0, -59, 0, -1, 0, -1, /* 1e90-1e9f */
- 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1ea0-1eaf */
- 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1eb0-1ebf */
- 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1ec0-1ecf */
- 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1ed0-1edf */
- 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1ee0-1eef */
- 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, 0, 0, 0, 0, /* 1ef0-1eff */
- 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f00-1f0f */
- 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f10-1f1f */
- 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f20-1f2f */
- 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f30-1f3f */
- 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f40-1f4f */
- 0, 8, 0, 8, 0, 8, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f50-1f5f */
- 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f60-1f6f */
- 74, 74, 86, 86, 86, 86, 100, 100, 0, 0, 112, 112, 126, 126, 0, 0, /* 1f70-1f7f */
- 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f80-1f8f */
- 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f90-1f9f */
- 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1fa0-1faf */
- 8, 8, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1fb0-1fbf */
- 0, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1fc0-1fcf */
- 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1fd0-1fdf */
- 8, 8, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1fe0-1fef */
- 0, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-};
-
-/* Upper case range - Wide latin */
-static signed char UniCaseRangeUff40[27] = {
- 0, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, /* ff40-ff4f */
- -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32,
-};
-
-/*
- * Upper Case Range
- */
-const struct UniCaseRange CifsUniUpperRange[] = {
- {0x03a0, 0x03ce, UniCaseRangeU03a0},
- {0x0430, 0x045f, UniCaseRangeU0430},
- {0x0490, 0x04cc, UniCaseRangeU0490},
- {0x1e00, 0x1ffc, UniCaseRangeU1e00},
- {0xff40, 0xff5a, UniCaseRangeUff40},
- {0}
-};
-#endif
-
-#ifndef UNIUPR_NOLOWER
-/*
- * Latin lower case
- */
-signed char CifsUniLowerTable[512] = {
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 000-00f */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 010-01f */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 020-02f */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 030-03f */
- 0, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, /* 040-04f */
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 0, 0, 0, 0, 0, /* 050-05f */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 060-06f */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 070-07f */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 080-08f */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 090-09f */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0a0-0af */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0b0-0bf */
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, /* 0c0-0cf */
- 32, 32, 32, 32, 32, 32, 32, 0, 32, 32, 32, 32, 32, 32, 32, 0, /* 0d0-0df */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0e0-0ef */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0f0-0ff */
- 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 100-10f */
- 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 110-11f */
- 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 120-12f */
- 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, /* 130-13f */
- 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, /* 140-14f */
- 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 150-15f */
- 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 160-16f */
- 1, 0, 1, 0, 1, 0, 1, 0, -121, 1, 0, 1, 0, 1, 0, 0, /* 170-17f */
- 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 79, 0, /* 180-18f */
- 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, /* 190-19f */
- 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, /* 1a0-1af */
- 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, /* 1b0-1bf */
- 0, 0, 0, 0, 2, 1, 0, 2, 1, 0, 2, 1, 0, 1, 0, 1, /* 1c0-1cf */
- 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, /* 1d0-1df */
- 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e0-1ef */
- 0, 2, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1f0-1ff */
-};
-
-/* Lower case range - Greek */
-static signed char UniCaseRangeL0380[44] = {
- 0, 0, 0, 0, 0, 0, 38, 0, 37, 37, 37, 0, 64, 0, 63, 63, /* 380-38f */
- 0, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, /* 390-39f */
- 32, 32, 0, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-};
-
-/* Lower case range - Cyrillic */
-static signed char UniCaseRangeL0400[48] = {
- 0, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 0, 80, 80, /* 400-40f */
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, /* 410-41f */
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, /* 420-42f */
-};
-
-/* Lower case range - Extended cyrillic */
-static signed char UniCaseRangeL0490[60] = {
- 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 490-49f */
- 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 4a0-4af */
- 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 4b0-4bf */
- 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
-};
-
-/* Lower case range - Extended latin and greek */
-static signed char UniCaseRangeL1e00[504] = {
- 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e00-1e0f */
- 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e10-1e1f */
- 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e20-1e2f */
- 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e30-1e3f */
- 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e40-1e4f */
- 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e50-1e5f */
- 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e60-1e6f */
- 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e70-1e7f */
- 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e80-1e8f */
- 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, /* 1e90-1e9f */
- 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1ea0-1eaf */
- 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1eb0-1ebf */
- 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1ec0-1ecf */
- 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1ed0-1edf */
- 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1ee0-1eef */
- 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, /* 1ef0-1eff */
- 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, -8, -8, /* 1f00-1f0f */
- 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, 0, 0, /* 1f10-1f1f */
- 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, -8, -8, /* 1f20-1f2f */
- 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, -8, -8, /* 1f30-1f3f */
- 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, 0, 0, /* 1f40-1f4f */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, -8, 0, -8, 0, -8, 0, -8, /* 1f50-1f5f */
- 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, -8, -8, /* 1f60-1f6f */
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f70-1f7f */
- 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, -8, -8, /* 1f80-1f8f */
- 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, -8, -8, /* 1f90-1f9f */
- 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, -8, -8, /* 1fa0-1faf */
- 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -74, -74, -9, 0, 0, 0, /* 1fb0-1fbf */
- 0, 0, 0, 0, 0, 0, 0, 0, -86, -86, -86, -86, -9, 0, 0, 0, /* 1fc0-1fcf */
- 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -100, -100, 0, 0, 0, 0, /* 1fd0-1fdf */
- 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -112, -112, -7, 0, 0, 0, /* 1fe0-1fef */
- 0, 0, 0, 0, 0, 0, 0, 0,
-};
-
-/* Lower case range - Wide latin */
-static signed char UniCaseRangeLff20[27] = {
- 0, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, /* ff20-ff2f */
- 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
-};
-
-/*
- * Lower Case Range
- */
-const struct UniCaseRange CifsUniLowerRange[] = {
- {0x0380, 0x03ab, UniCaseRangeL0380},
- {0x0400, 0x042f, UniCaseRangeL0400},
- {0x0490, 0x04cb, UniCaseRangeL0490},
- {0x1e00, 0x1ff7, UniCaseRangeL1e00},
- {0xff20, 0xff3a, UniCaseRangeLff20},
- {0}
-};
-#endif
diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c
index a4d8b0ea1c8c..22869cda1356 100644
--- a/fs/smb/client/cifsfs.c
+++ b/fs/smb/client/cifsfs.c
@@ -117,6 +117,10 @@ module_param(cifs_max_pending, uint, 0444);
MODULE_PARM_DESC(cifs_max_pending, "Simultaneous requests to server for "
"CIFS/SMB1 dialect (N/A for SMB3) "
"Default: 32767 Range: 2 to 32767.");
+unsigned int dir_cache_timeout = 30;
+module_param(dir_cache_timeout, uint, 0644);
+MODULE_PARM_DESC(dir_cache_timeout, "Number of seconds to cache directory contents for which we have a lease. Default: 30 "
+ "Range: 1 to 65000 seconds, 0 to disable caching dir contents");
#ifdef CONFIG_CIFS_STATS2
unsigned int slow_rsp_threshold = 1;
module_param(slow_rsp_threshold, uint, 0644);
@@ -695,6 +699,8 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
seq_printf(s, ",snapshot=%llu", tcon->snapshot_time);
if (tcon->handle_timeout)
seq_printf(s, ",handletimeout=%u", tcon->handle_timeout);
+ if (tcon->max_cached_dirs != MAX_CACHED_FIDS)
+ seq_printf(s, ",max_cached_dirs=%u", tcon->max_cached_dirs);
/*
* Display file and directory attribute timeout in seconds.
@@ -1077,7 +1083,7 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int whence)
}
static int
-cifs_setlease(struct file *file, long arg, struct file_lock **lease, void **priv)
+cifs_setlease(struct file *file, int arg, struct file_lock **lease, void **priv)
{
/*
* Note that this is called by vfs setlease with i_lock held to
@@ -1679,6 +1685,12 @@ init_cifs(void)
CIFS_MAX_REQ);
}
+ /* Limit max to about 18 hours, and setting to zero disables directory entry caching */
+ if (dir_cache_timeout > 65000) {
+ dir_cache_timeout = 65000;
+ cifs_dbg(VFS, "dir_cache_timeout set to max of 65000 seconds\n");
+ }
+
cifsiod_wq = alloc_workqueue("cifsiod", WQ_FREEZABLE|WQ_MEM_RECLAIM, 0);
if (!cifsiod_wq) {
rc = -ENOMEM;
@@ -1805,7 +1817,7 @@ exit_cifs(void)
cifs_dbg(NOISY, "exit_smb3\n");
unregister_filesystem(&cifs_fs_type);
unregister_filesystem(&smb3_fs_type);
- cifs_dfs_release_automount_timer();
+ cifs_release_automount_timer();
exit_cifs_idmap();
#ifdef CONFIG_CIFS_SWN_UPCALL
cifs_genl_exit();
diff --git a/fs/smb/client/cifsfs.h b/fs/smb/client/cifsfs.h
index 15c8cc4b6680..41daebd220ff 100644
--- a/fs/smb/client/cifsfs.h
+++ b/fs/smb/client/cifsfs.h
@@ -81,7 +81,7 @@ extern int cifs_fiemap(struct inode *, struct fiemap_extent_info *, u64 start,
extern const struct inode_operations cifs_file_inode_ops;
extern const struct inode_operations cifs_symlink_inode_ops;
-extern const struct inode_operations cifs_dfs_referral_inode_operations;
+extern const struct inode_operations cifs_namespace_inode_operations;
/* Functions related to files and directories */
@@ -118,14 +118,7 @@ extern void cifs_pages_write_redirty(struct inode *inode, loff_t start, unsigned
extern const struct dentry_operations cifs_dentry_ops;
extern const struct dentry_operations cifs_ci_dentry_ops;
-#ifdef CONFIG_CIFS_DFS_UPCALL
-extern struct vfsmount *cifs_dfs_d_automount(struct path *path);
-#else
-static inline struct vfsmount *cifs_dfs_d_automount(struct path *path)
-{
- return ERR_PTR(-EREMOTE);
-}
-#endif
+extern struct vfsmount *cifs_d_automount(struct path *path);
/* Functions related to symlinks */
extern const char *cifs_get_link(struct dentry *, struct inode *,
@@ -159,6 +152,6 @@ extern const struct export_operations cifs_export_ops;
#endif /* CONFIG_CIFS_NFSD_EXPORT */
/* when changing internal version - update following two lines at same time */
-#define SMB3_PRODUCT_BUILD 44
-#define CIFS_VERSION "2.44"
+#define SMB3_PRODUCT_BUILD 45
+#define CIFS_VERSION "2.45"
#endif /* _CIFSFS_H */
diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h
index 657dee4b2c8c..02082621d8e0 100644
--- a/fs/smb/client/cifsglob.h
+++ b/fs/smb/client/cifsglob.h
@@ -186,6 +186,12 @@ struct cifs_cred {
};
struct cifs_open_info_data {
+ bool adjust_tz;
+ union {
+ bool reparse_point;
+ bool symlink;
+ };
+ __u32 reparse_tag;
char *symlink_target;
union {
struct smb2_file_all_info fi;
@@ -193,6 +199,10 @@ struct cifs_open_info_data {
};
};
+#define cifs_open_data_reparse(d) \
+ ((d)->reparse_point || \
+ (le32_to_cpu((d)->fi.Attributes) & ATTR_REPARSE))
+
static inline void cifs_free_open_info(struct cifs_open_info_data *data)
{
kfree(data->symlink_target);
@@ -318,16 +328,21 @@ struct smb_version_operations {
int (*is_path_accessible)(const unsigned int, struct cifs_tcon *,
struct cifs_sb_info *, const char *);
/* query path data from the server */
- int (*query_path_info)(const unsigned int xid, struct cifs_tcon *tcon,
- struct cifs_sb_info *cifs_sb, const char *full_path,
- struct cifs_open_info_data *data, bool *adjust_tz, bool *reparse);
+ int (*query_path_info)(const unsigned int xid,
+ struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb,
+ const char *full_path,
+ struct cifs_open_info_data *data);
/* query file data from the server */
int (*query_file_info)(const unsigned int xid, struct cifs_tcon *tcon,
struct cifsFileInfo *cfile, struct cifs_open_info_data *data);
- /* query reparse tag from srv to determine which type of special file */
- int (*query_reparse_tag)(const unsigned int xid, struct cifs_tcon *tcon,
- struct cifs_sb_info *cifs_sb, const char *path,
- __u32 *reparse_tag);
+ /* query reparse point to determine which type of special file */
+ int (*query_reparse_point)(const unsigned int xid,
+ struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb,
+ const char *full_path,
+ u32 *tag, struct kvec *rsp,
+ int *rsp_buftype);
/* get server index number */
int (*get_srv_inum)(const unsigned int xid, struct cifs_tcon *tcon,
struct cifs_sb_info *cifs_sb, const char *full_path, u64 *uniqueid,
@@ -376,9 +391,12 @@ struct smb_version_operations {
const char *, const char *,
struct cifs_sb_info *);
/* query symlink target */
- int (*query_symlink)(const unsigned int, struct cifs_tcon *,
- struct cifs_sb_info *, const char *,
- char **, bool);
+ int (*query_symlink)(const unsigned int xid,
+ struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb,
+ const char *full_path,
+ char **target_path,
+ struct kvec *rsp_iov);
/* open a file for non-posix mounts */
int (*open)(const unsigned int xid, struct cifs_open_parms *oparms, __u32 *oplock,
void *buf);
@@ -727,8 +745,9 @@ struct TCP_Server_Info {
* primary_server holds the ref-counted
* pointer to primary channel connection for the session.
*/
-#define CIFS_SERVER_IS_CHAN(server) (!!(server)->primary_server)
+#define SERVER_IS_CHAN(server) (!!(server)->primary_server)
struct TCP_Server_Info *primary_server;
+ __u16 channel_sequence_num; /* incremented on primary channel on each chan reconnect */
#ifdef CONFIG_CIFS_SWN_UPCALL
bool use_swn_dstaddr;
@@ -1076,7 +1095,7 @@ cap_unix(struct cifs_ses *ses)
* inode with new info
*/
-#define CIFS_FATTR_DFS_REFERRAL 0x1
+#define CIFS_FATTR_JUNCTION 0x1
#define CIFS_FATTR_DELETE_PENDING 0x2
#define CIFS_FATTR_NEED_REVAL 0x4
#define CIFS_FATTR_INO_COLLISION 0x8
@@ -1191,6 +1210,7 @@ struct cifs_tcon {
__u32 max_chunks;
__u32 max_bytes_chunk;
__u32 max_bytes_copy;
+ __u32 max_cached_dirs;
#ifdef CONFIG_CIFS_FSCACHE
u64 resource_id; /* server resource id */
struct fscache_volume *fscache; /* cookie for share */
@@ -1721,11 +1741,23 @@ struct cifs_mount_ctx {
struct list_head dfs_ses_list;
};
+static inline void __free_dfs_info_param(struct dfs_info3_param *param)
+{
+ kfree(param->path_name);
+ kfree(param->node_name);
+}
+
static inline void free_dfs_info_param(struct dfs_info3_param *param)
{
+ if (param)
+ __free_dfs_info_param(param);
+}
+
+static inline void zfree_dfs_info_param(struct dfs_info3_param *param)
+{
if (param) {
- kfree(param->path_name);
- kfree(param->node_name);
+ __free_dfs_info_param(param);
+ memset(param, 0, sizeof(*param));
}
}
@@ -1775,6 +1807,7 @@ static inline bool is_retryable_error(int error)
#define MID_RETRY_NEEDED 8 /* session closed while this request out */
#define MID_RESPONSE_MALFORMED 0x10
#define MID_SHUTDOWN 0x20
+#define MID_RESPONSE_READY 0x40 /* ready for other process handle the rsp */
/* Flags */
#define MID_WAIT_CANCELLED 1 /* Cancelled while waiting for response */
@@ -1911,7 +1944,7 @@ require use of the stronger protocol */
* cifsInodeInfo->lock_sem cifsInodeInfo->llist cifs_init_once
* ->can_cache_brlcks
* cifsInodeInfo->deferred_lock cifsInodeInfo->deferred_closes cifsInodeInfo_alloc
- * cached_fid->fid_mutex cifs_tcon->crfid tconInfoAlloc
+ * cached_fid->fid_mutex cifs_tcon->crfid tcon_info_alloc
* cifsFileInfo->fh_mutex cifsFileInfo cifs_new_fileinfo
* cifsFileInfo->file_info_lock cifsFileInfo->count cifs_new_fileinfo
* ->invalidHandle initiate_cifs_search
@@ -1985,6 +2018,7 @@ extern unsigned int CIFSMaxBufSize; /* max size not including hdr */
extern unsigned int cifs_min_rcv; /* min size of big ntwrk buf pool */
extern unsigned int cifs_min_small; /* min size of small buf pool */
extern unsigned int cifs_max_pending; /* MAX requests at once to server*/
+extern unsigned int dir_cache_timeout; /* max time for directory lease caching of dir */
extern bool disable_legacy_dialects; /* forbid vers=1.0 and vers=2.0 mounts */
extern atomic_t mid_count;
@@ -2184,4 +2218,17 @@ static inline void cifs_sg_set_buf(struct sg_table *sgtable,
}
}
+struct smb2_compound_vars {
+ struct cifs_open_parms oparms;
+ struct kvec rsp_iov[3];
+ struct smb_rqst rqst[3];
+ struct kvec open_iov[SMB2_CREATE_IOV_SIZE];
+ struct kvec qi_iov;
+ struct kvec io_iov[SMB2_IOCTL_IOV_SIZE];
+ struct kvec si_iov[SMB2_SET_INFO_IOV_SIZE];
+ struct kvec close_iov;
+ struct smb2_file_rename_info rename_info;
+ struct smb2_file_link_info link_info;
+};
+
#endif /* _CIFS_GLOB_H */
diff --git a/fs/smb/client/cifsproto.h b/fs/smb/client/cifsproto.h
index 1d71d658e167..0c37eefa18a5 100644
--- a/fs/smb/client/cifsproto.h
+++ b/fs/smb/client/cifsproto.h
@@ -207,6 +207,9 @@ extern struct inode *cifs_iget(struct super_block *sb,
int cifs_get_inode_info(struct inode **inode, const char *full_path,
struct cifs_open_info_data *data, struct super_block *sb, int xid,
const struct cifs_fid *fid);
+bool cifs_reparse_point_to_fattr(struct cifs_sb_info *cifs_sb,
+ struct cifs_fattr *fattr,
+ u32 tag);
extern int smb311_posix_get_inode_info(struct inode **pinode, const char *search_path,
struct super_block *sb, unsigned int xid);
extern int cifs_get_inode_info_unix(struct inode **pinode,
@@ -295,11 +298,7 @@ extern void cifs_put_tcp_session(struct TCP_Server_Info *server,
int from_reconnect);
extern void cifs_put_tcon(struct cifs_tcon *tcon);
-#if IS_ENABLED(CONFIG_CIFS_DFS_UPCALL)
-extern void cifs_dfs_release_automount_timer(void);
-#else /* ! IS_ENABLED(CONFIG_CIFS_DFS_UPCALL) */
-#define cifs_dfs_release_automount_timer() do { } while (0)
-#endif /* ! IS_ENABLED(CONFIG_CIFS_DFS_UPCALL) */
+extern void cifs_release_automount_timer(void);
void cifs_proc_init(void);
void cifs_proc_clean(void);
@@ -513,7 +512,7 @@ extern int CIFSSMBLogoff(const unsigned int xid, struct cifs_ses *ses);
extern struct cifs_ses *sesInfoAlloc(void);
extern void sesInfoFree(struct cifs_ses *);
-extern struct cifs_tcon *tconInfoAlloc(void);
+extern struct cifs_tcon *tcon_info_alloc(bool dir_leases_enabled);
extern void tconInfoFree(struct cifs_tcon *);
extern int cifs_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server,
diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c
index 238538dde4e3..7b923e36501b 100644
--- a/fs/smb/client/connect.c
+++ b/fs/smb/client/connect.c
@@ -154,7 +154,7 @@ cifs_signal_cifsd_for_reconnect(struct TCP_Server_Info *server,
int i;
/* If server is a channel, select the primary channel */
- pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server;
+ pserver = SERVER_IS_CHAN(server) ? server->primary_server : server;
spin_lock(&pserver->srv_lock);
if (!all_channels) {
@@ -202,7 +202,7 @@ cifs_mark_tcp_ses_conns_for_reconnect(struct TCP_Server_Info *server,
cifs_dbg(FYI, "%s: marking necessary sessions and tcons for reconnect\n", __func__);
/* If server is a channel, select the primary channel */
- pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server;
+ pserver = SERVER_IS_CHAN(server) ? server->primary_server : server;
spin_lock(&cifs_tcp_ses_lock);
@@ -453,10 +453,10 @@ static int reconnect_target_unlocked(struct TCP_Server_Info *server, struct dfs_
static int reconnect_dfs_server(struct TCP_Server_Info *server)
{
- int rc = 0;
- struct dfs_cache_tgt_list tl = DFS_CACHE_TGT_LIST_INIT(tl);
struct dfs_cache_tgt_iterator *target_hint = NULL;
+ DFS_CACHE_TGT_LIST(tl);
int num_targets = 0;
+ int rc = 0;
/*
* Determine the number of dfs targets the referral path in @cifs_sb resolves to.
@@ -911,8 +911,8 @@ cifs_enable_signing(struct TCP_Server_Info *server, bool mnt_sign_required)
return 0;
}
-
-static void clean_demultiplex_info(struct TCP_Server_Info *server)
+static noinline_for_stack void
+clean_demultiplex_info(struct TCP_Server_Info *server)
{
int length;
@@ -1551,7 +1551,7 @@ cifs_find_tcp_session(struct smb3_fs_context *ctx)
* Skip ses channels since they're only handled in lower layers
* (e.g. cifs_send_recv).
*/
- if (CIFS_SERVER_IS_CHAN(server) ||
+ if (SERVER_IS_CHAN(server) ||
!match_server(server, ctx, false)) {
spin_unlock(&server->srv_lock);
continue;
@@ -1587,7 +1587,7 @@ cifs_put_tcp_session(struct TCP_Server_Info *server, int from_reconnect)
spin_unlock(&cifs_tcp_ses_lock);
/* For secondary channels, we pick up ref-count on the primary server */
- if (CIFS_SERVER_IS_CHAN(server))
+ if (SERVER_IS_CHAN(server))
cifs_put_tcp_session(server->primary_server, from_reconnect);
cancel_delayed_work_sync(&server->echo);
@@ -1686,6 +1686,7 @@ cifs_get_tcp_session(struct smb3_fs_context *ctx,
ctx->target_rfc1001_name, RFC1001_NAME_LEN_WITH_NULL);
tcp_ses->session_estab = false;
tcp_ses->sequence_number = 0;
+ tcp_ses->channel_sequence_num = 0; /* only tracked for primary channel */
tcp_ses->reconnect_instance = 1;
tcp_ses->lstrp = jiffies;
tcp_ses->compress_algorithm = cpu_to_le16(ctx->compression);
@@ -1792,7 +1793,7 @@ out_err_crypto_release:
out_err:
if (tcp_ses) {
- if (CIFS_SERVER_IS_CHAN(tcp_ses))
+ if (SERVER_IS_CHAN(tcp_ses))
cifs_put_tcp_session(tcp_ses->primary_server, false);
kfree(tcp_ses->hostname);
kfree(tcp_ses->leaf_fullpath);
@@ -1881,7 +1882,8 @@ cifs_setup_ipc(struct cifs_ses *ses, struct smb3_fs_context *ctx)
}
}
- tcon = tconInfoAlloc();
+ /* no need to setup directory caching on IPC share, so pass in false */
+ tcon = tcon_info_alloc(false);
if (tcon == NULL)
return -ENOMEM;
@@ -2472,8 +2474,9 @@ cifs_put_tcon(struct cifs_tcon *tcon)
static struct cifs_tcon *
cifs_get_tcon(struct cifs_ses *ses, struct smb3_fs_context *ctx)
{
- int rc, xid;
struct cifs_tcon *tcon;
+ bool nohandlecache;
+ int rc, xid;
tcon = cifs_find_tcon(ses, ctx);
if (tcon) {
@@ -2491,11 +2494,17 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb3_fs_context *ctx)
goto out_fail;
}
- tcon = tconInfoAlloc();
+ if (ses->server->dialect >= SMB20_PROT_ID &&
+ (ses->server->capabilities & SMB2_GLOBAL_CAP_DIRECTORY_LEASING))
+ nohandlecache = ctx->nohandlecache;
+ else
+ nohandlecache = true;
+ tcon = tcon_info_alloc(!nohandlecache);
if (tcon == NULL) {
rc = -ENOMEM;
goto out_fail;
}
+ tcon->nohandlecache = nohandlecache;
if (ctx->snapshot_time) {
if (ses->server->vals->protocol_id == 0) {
@@ -2656,10 +2665,7 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb3_fs_context *ctx)
tcon->retry = ctx->retry;
tcon->nocase = ctx->nocase;
tcon->broken_sparse_sup = ctx->no_sparse;
- if (ses->server->capabilities & SMB2_GLOBAL_CAP_DIRECTORY_LEASING)
- tcon->nohandlecache = ctx->nohandlecache;
- else
- tcon->nohandlecache = true;
+ tcon->max_cached_dirs = ctx->max_cached_dirs;
tcon->nodelete = ctx->nodelete;
tcon->local_lease = ctx->local_lease;
INIT_LIST_HEAD(&tcon->pending_opens);
@@ -2889,9 +2895,9 @@ bind_socket(struct TCP_Server_Info *server)
if (server->srcaddr.ss_family != AF_UNSPEC) {
/* Bind to the specified local IP address */
struct socket *socket = server->ssocket;
- rc = socket->ops->bind(socket,
- (struct sockaddr *) &server->srcaddr,
- sizeof(server->srcaddr));
+ rc = kernel_bind(socket,
+ (struct sockaddr *) &server->srcaddr,
+ sizeof(server->srcaddr));
if (rc < 0) {
struct sockaddr_in *saddr4;
struct sockaddr_in6 *saddr6;
@@ -3040,8 +3046,8 @@ generic_ip_connect(struct TCP_Server_Info *server)
socket->sk->sk_sndbuf,
socket->sk->sk_rcvbuf, socket->sk->sk_rcvtimeo);
- rc = socket->ops->connect(socket, saddr, slen,
- server->noblockcnt ? O_NONBLOCK : 0);
+ rc = kernel_connect(socket, saddr, slen,
+ server->noblockcnt ? O_NONBLOCK : 0);
/*
* When mounting SMB root file systems, we do not want to block in
* connect. Otherwise bail out and then let cifs_reconnect() perform
@@ -3813,7 +3819,7 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses,
struct nls_table *nls_info)
{
int rc = -ENOSYS;
- struct TCP_Server_Info *pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server;
+ struct TCP_Server_Info *pserver = SERVER_IS_CHAN(server) ? server->primary_server : server;
struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)&pserver->dstaddr;
struct sockaddr_in *addr = (struct sockaddr_in *)&pserver->dstaddr;
bool is_binding = false;
diff --git a/fs/smb/client/dfs.c b/fs/smb/client/dfs.c
index ee772c3d9f00..81b84151450d 100644
--- a/fs/smb/client/dfs.c
+++ b/fs/smb/client/dfs.c
@@ -3,7 +3,6 @@
* Copyright (c) 2022 Paulo Alcantara <palcantara@suse.de>
*/
-#include <linux/namei.h>
#include "cifsproto.h"
#include "cifs_debug.h"
#include "dns_resolve.h"
@@ -96,51 +95,134 @@ static int add_root_smb_session(struct cifs_mount_ctx *mnt_ctx)
return 0;
}
-static int get_dfs_conn(struct cifs_mount_ctx *mnt_ctx, const char *ref_path, const char *full_path,
- const struct dfs_cache_tgt_iterator *tit)
+static inline int parse_dfs_target(struct smb3_fs_context *ctx,
+ struct dfs_ref_walk *rw,
+ struct dfs_info3_param *tgt)
+{
+ int rc;
+ const char *fpath = ref_walk_fpath(rw) + 1;
+
+ rc = ref_walk_get_tgt(rw, tgt);
+ if (!rc)
+ rc = dfs_parse_target_referral(fpath, tgt, ctx);
+ return rc;
+}
+
+static int set_ref_paths(struct cifs_mount_ctx *mnt_ctx,
+ struct dfs_info3_param *tgt,
+ struct dfs_ref_walk *rw)
{
struct smb3_fs_context *ctx = mnt_ctx->fs_ctx;
- struct dfs_info3_param ref = {};
- bool is_refsrv;
- int rc, rc2;
+ struct cifs_sb_info *cifs_sb = mnt_ctx->cifs_sb;
+ char *ref_path, *full_path;
+ int rc;
- rc = dfs_cache_get_tgt_referral(ref_path + 1, tit, &ref);
- if (rc)
+ full_path = smb3_fs_context_fullpath(ctx, CIFS_DIR_SEP(cifs_sb));
+ if (IS_ERR(full_path))
+ return PTR_ERR(full_path);
+
+ if (!tgt || (tgt->server_type == DFS_TYPE_LINK &&
+ DFS_INTERLINK(tgt->flags)))
+ ref_path = dfs_get_path(cifs_sb, ctx->UNC);
+ else
+ ref_path = dfs_get_path(cifs_sb, full_path);
+ if (IS_ERR(ref_path)) {
+ rc = PTR_ERR(ref_path);
+ kfree(full_path);
return rc;
+ }
+ ref_walk_path(rw) = ref_path;
+ ref_walk_fpath(rw) = full_path;
+ return 0;
+}
- rc = dfs_parse_target_referral(full_path + 1, &ref, ctx);
- if (rc)
- goto out;
+static int __dfs_referral_walk(struct cifs_mount_ctx *mnt_ctx,
+ struct dfs_ref_walk *rw)
+{
+ struct smb3_fs_context *ctx = mnt_ctx->fs_ctx;
+ struct dfs_info3_param tgt = {};
+ bool is_refsrv;
+ int rc = -ENOENT;
- cifs_mount_put_conns(mnt_ctx);
- rc = get_session(mnt_ctx, ref_path);
- if (rc)
- goto out;
+again:
+ do {
+ if (ref_walk_empty(rw)) {
+ rc = dfs_get_referral(mnt_ctx, ref_walk_path(rw) + 1,
+ NULL, ref_walk_tl(rw));
+ if (rc) {
+ rc = cifs_mount_get_tcon(mnt_ctx);
+ if (!rc)
+ rc = cifs_is_path_remote(mnt_ctx);
+ continue;
+ }
+ if (!ref_walk_num_tgts(rw)) {
+ rc = -ENOENT;
+ continue;
+ }
+ }
- is_refsrv = !!(ref.flags & DFSREF_REFERRAL_SERVER);
+ while (ref_walk_next_tgt(rw)) {
+ rc = parse_dfs_target(ctx, rw, &tgt);
+ if (rc)
+ continue;
- rc = -EREMOTE;
- if (ref.flags & DFSREF_STORAGE_SERVER) {
- rc = cifs_mount_get_tcon(mnt_ctx);
- if (rc)
- goto out;
+ cifs_mount_put_conns(mnt_ctx);
+ rc = get_session(mnt_ctx, ref_walk_path(rw));
+ if (rc)
+ continue;
- /* some servers may not advertise referral capability under ref.flags */
- is_refsrv |= is_tcon_dfs(mnt_ctx->tcon);
+ is_refsrv = tgt.server_type == DFS_TYPE_ROOT ||
+ DFS_INTERLINK(tgt.flags);
+ ref_walk_set_tgt_hint(rw);
- rc = cifs_is_path_remote(mnt_ctx);
- }
+ if (tgt.flags & DFSREF_STORAGE_SERVER) {
+ rc = cifs_mount_get_tcon(mnt_ctx);
+ if (!rc)
+ rc = cifs_is_path_remote(mnt_ctx);
+ if (!rc)
+ break;
+ if (rc != -EREMOTE)
+ continue;
+ }
- dfs_cache_noreq_update_tgthint(ref_path + 1, tit);
+ if (is_refsrv) {
+ rc = add_root_smb_session(mnt_ctx);
+ if (rc)
+ goto out;
+ }
- if (rc == -EREMOTE && is_refsrv) {
- rc2 = add_root_smb_session(mnt_ctx);
- if (rc2)
- rc = rc2;
- }
+ rc = ref_walk_advance(rw);
+ if (!rc) {
+ rc = set_ref_paths(mnt_ctx, &tgt, rw);
+ if (!rc) {
+ rc = -EREMOTE;
+ goto again;
+ }
+ }
+ if (rc != -ELOOP)
+ goto out;
+ }
+ } while (rc && ref_walk_descend(rw));
out:
- free_dfs_info_param(&ref);
+ free_dfs_info_param(&tgt);
+ return rc;
+}
+
+static int dfs_referral_walk(struct cifs_mount_ctx *mnt_ctx)
+{
+ struct dfs_ref_walk *rw;
+ int rc;
+
+ rw = ref_walk_alloc();
+ if (IS_ERR(rw))
+ return PTR_ERR(rw);
+
+ ref_walk_init(rw);
+ rc = set_ref_paths(mnt_ctx, NULL, rw);
+ if (!rc)
+ rc = __dfs_referral_walk(mnt_ctx, rw);
+ ref_walk_free(rw);
return rc;
}
@@ -148,105 +230,48 @@ static int __dfs_mount_share(struct cifs_mount_ctx *mnt_ctx)
{
struct cifs_sb_info *cifs_sb = mnt_ctx->cifs_sb;
struct smb3_fs_context *ctx = mnt_ctx->fs_ctx;
- char *ref_path = NULL, *full_path = NULL;
- struct dfs_cache_tgt_iterator *tit;
struct cifs_tcon *tcon;
- char *origin_fullpath = NULL;
- char sep = CIFS_DIR_SEP(cifs_sb);
- int num_links = 0;
+ char *origin_fullpath;
int rc;
- ref_path = dfs_get_path(cifs_sb, ctx->UNC);
- if (IS_ERR(ref_path))
- return PTR_ERR(ref_path);
+ origin_fullpath = dfs_get_path(cifs_sb, ctx->source);
+ if (IS_ERR(origin_fullpath))
+ return PTR_ERR(origin_fullpath);
- full_path = smb3_fs_context_fullpath(ctx, sep);
- if (IS_ERR(full_path)) {
- rc = PTR_ERR(full_path);
- full_path = NULL;
+ rc = dfs_referral_walk(mnt_ctx);
+ if (rc)
goto out;
- }
- origin_fullpath = kstrdup(full_path, GFP_KERNEL);
- if (!origin_fullpath) {
- rc = -ENOMEM;
- goto out;
+ tcon = mnt_ctx->tcon;
+ spin_lock(&tcon->tc_lock);
+ if (!tcon->origin_fullpath) {
+ tcon->origin_fullpath = origin_fullpath;
+ origin_fullpath = NULL;
}
+ spin_unlock(&tcon->tc_lock);
- do {
- struct dfs_cache_tgt_list tl = DFS_CACHE_TGT_LIST_INIT(tl);
-
- rc = dfs_get_referral(mnt_ctx, ref_path + 1, NULL, &tl);
- if (rc) {
- rc = cifs_mount_get_tcon(mnt_ctx);
- if (!rc)
- rc = cifs_is_path_remote(mnt_ctx);
- break;
- }
-
- tit = dfs_cache_get_tgt_iterator(&tl);
- if (!tit) {
- cifs_dbg(VFS, "%s: dfs referral (%s) with no targets\n", __func__,
- ref_path + 1);
- rc = -ENOENT;
- dfs_cache_free_tgts(&tl);
- break;
- }
-
- do {
- rc = get_dfs_conn(mnt_ctx, ref_path, full_path, tit);
- if (!rc)
- break;
- if (rc == -EREMOTE) {
- if (++num_links > MAX_NESTED_LINKS) {
- rc = -ELOOP;
- break;
- }
- kfree(ref_path);
- kfree(full_path);
- ref_path = full_path = NULL;
-
- full_path = smb3_fs_context_fullpath(ctx, sep);
- if (IS_ERR(full_path)) {
- rc = PTR_ERR(full_path);
- full_path = NULL;
- } else {
- ref_path = dfs_get_path(cifs_sb, full_path);
- if (IS_ERR(ref_path)) {
- rc = PTR_ERR(ref_path);
- ref_path = NULL;
- }
- }
- break;
- }
- } while ((tit = dfs_cache_get_next_tgt(&tl, tit)));
- dfs_cache_free_tgts(&tl);
- } while (rc == -EREMOTE);
-
- if (!rc) {
- tcon = mnt_ctx->tcon;
-
- spin_lock(&tcon->tc_lock);
- if (!tcon->origin_fullpath) {
- tcon->origin_fullpath = origin_fullpath;
- origin_fullpath = NULL;
- }
- spin_unlock(&tcon->tc_lock);
-
- if (list_empty(&tcon->dfs_ses_list)) {
- list_replace_init(&mnt_ctx->dfs_ses_list,
- &tcon->dfs_ses_list);
- queue_delayed_work(dfscache_wq, &tcon->dfs_cache_work,
- dfs_cache_get_ttl() * HZ);
- } else {
- dfs_put_root_smb_sessions(&mnt_ctx->dfs_ses_list);
- }
+ if (list_empty(&tcon->dfs_ses_list)) {
+ list_replace_init(&mnt_ctx->dfs_ses_list, &tcon->dfs_ses_list);
+ queue_delayed_work(dfscache_wq, &tcon->dfs_cache_work,
+ dfs_cache_get_ttl() * HZ);
+ } else {
+ dfs_put_root_smb_sessions(&mnt_ctx->dfs_ses_list);
}
out:
kfree(origin_fullpath);
- kfree(ref_path);
- kfree(full_path);
+ return rc;
+}
+
+/* Resolve UNC hostname in @ctx->source and set ip addr in @ctx->dstaddr */
+static int update_fs_context_dstaddr(struct smb3_fs_context *ctx)
+{
+ struct sockaddr *addr = (struct sockaddr *)&ctx->dstaddr;
+ int rc;
+
+ rc = dns_resolve_server_name_to_ip(ctx->source, addr, NULL);
+ if (!rc)
+ cifs_set_port(addr, ctx->port);
return rc;
}
@@ -256,6 +281,10 @@ int dfs_mount_share(struct cifs_mount_ctx *mnt_ctx, bool *isdfs)
bool nodfs = ctx->nodfs;
int rc;
+ rc = update_fs_context_dstaddr(ctx);
+ if (rc)
+ return rc;
+
*isdfs = false;
rc = get_session(mnt_ctx, NULL);
if (rc)
@@ -426,7 +455,7 @@ static int __tree_connect_dfs_target(const unsigned int xid, struct cifs_tcon *t
/* Try to tree connect to all dfs targets */
for (; tit; tit = dfs_cache_get_next_tgt(tl, tit)) {
const char *target = dfs_cache_get_tgt_name(tit);
- struct dfs_cache_tgt_list ntl = DFS_CACHE_TGT_LIST_INIT(ntl);
+ DFS_CACHE_TGT_LIST(ntl);
kfree(share);
kfree(prefix);
@@ -520,7 +549,7 @@ int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon, const stru
int rc;
struct TCP_Server_Info *server = tcon->ses->server;
const struct smb_version_operations *ops = server->ops;
- struct dfs_cache_tgt_list tl = DFS_CACHE_TGT_LIST_INIT(tl);
+ DFS_CACHE_TGT_LIST(tl);
struct cifs_sb_info *cifs_sb = NULL;
struct super_block *sb = NULL;
struct dfs_info3_param ref = {0};
diff --git a/fs/smb/client/dfs.h b/fs/smb/client/dfs.h
index 98e9d2aca6a7..875ab7ae57fc 100644
--- a/fs/smb/client/dfs.h
+++ b/fs/smb/client/dfs.h
@@ -9,6 +9,110 @@
#include "cifsglob.h"
#include "fs_context.h"
#include "cifs_unicode.h"
+#include <linux/namei.h>
+
+#define DFS_INTERLINK(v) \
+ (((v) & DFSREF_REFERRAL_SERVER) && !((v) & DFSREF_STORAGE_SERVER))
+
+struct dfs_ref {
+ char *path;
+ char *full_path;
+ struct dfs_cache_tgt_list tl;
+ struct dfs_cache_tgt_iterator *tit;
+};
+
+struct dfs_ref_walk {
+ struct dfs_ref *ref;
+ struct dfs_ref refs[MAX_NESTED_LINKS];
+};
+
+#define ref_walk_start(w) ((w)->refs)
+#define ref_walk_end(w) (&(w)->refs[ARRAY_SIZE((w)->refs) - 1])
+#define ref_walk_cur(w) ((w)->ref)
+#define ref_walk_descend(w) (--ref_walk_cur(w) >= ref_walk_start(w))
+
+#define ref_walk_tit(w) (ref_walk_cur(w)->tit)
+#define ref_walk_empty(w) (!ref_walk_tit(w))
+#define ref_walk_path(w) (ref_walk_cur(w)->path)
+#define ref_walk_fpath(w) (ref_walk_cur(w)->full_path)
+#define ref_walk_tl(w) (&ref_walk_cur(w)->tl)
+
+static inline struct dfs_ref_walk *ref_walk_alloc(void)
+{
+ struct dfs_ref_walk *rw;
+
+ rw = kmalloc(sizeof(*rw), GFP_KERNEL);
+ if (!rw)
+ return ERR_PTR(-ENOMEM);
+ return rw;
+}
+
+static inline void ref_walk_init(struct dfs_ref_walk *rw)
+{
+ memset(rw, 0, sizeof(*rw));
+ ref_walk_cur(rw) = ref_walk_start(rw);
+}
+
+static inline void __ref_walk_free(struct dfs_ref *ref)
+{
+ kfree(ref->path);
+ kfree(ref->full_path);
+ dfs_cache_free_tgts(&ref->tl);
+ memset(ref, 0, sizeof(*ref));
+}
+
+static inline void ref_walk_free(struct dfs_ref_walk *rw)
+{
+ struct dfs_ref *ref = ref_walk_start(rw);
+
+ for (; ref <= ref_walk_end(rw); ref++)
+ __ref_walk_free(ref);
+ kfree(rw);
+}
+
+static inline int ref_walk_advance(struct dfs_ref_walk *rw)
+{
+ struct dfs_ref *ref = ref_walk_cur(rw) + 1;
+
+ if (ref > ref_walk_end(rw))
+ return -ELOOP;
+ __ref_walk_free(ref);
+ ref_walk_cur(rw) = ref;
+ return 0;
+}
+
+static inline struct dfs_cache_tgt_iterator *
+ref_walk_next_tgt(struct dfs_ref_walk *rw)
+{
+ struct dfs_cache_tgt_iterator *tit;
+ struct dfs_ref *ref = ref_walk_cur(rw);
+
+ if (!ref->tit)
+ tit = dfs_cache_get_tgt_iterator(&ref->tl);
+ else
+ tit = dfs_cache_get_next_tgt(&ref->tl, ref->tit);
+ ref->tit = tit;
+ return tit;
+}
+
+static inline int ref_walk_get_tgt(struct dfs_ref_walk *rw,
+ struct dfs_info3_param *tgt)
+{
+ zfree_dfs_info_param(tgt);
+ return dfs_cache_get_tgt_referral(ref_walk_path(rw) + 1,
+ ref_walk_tit(rw), tgt);
+}
+
+static inline int ref_walk_num_tgts(struct dfs_ref_walk *rw)
+{
+ return dfs_cache_get_nr_tgts(ref_walk_tl(rw));
+}
+
+static inline void ref_walk_set_tgt_hint(struct dfs_ref_walk *rw)
+{
+ dfs_cache_noreq_update_tgthint(ref_walk_path(rw) + 1,
+ ref_walk_tit(rw));
+}
struct dfs_root_ses {
struct list_head list;
@@ -34,43 +138,6 @@ static inline int dfs_get_referral(struct cifs_mount_ctx *mnt_ctx, const char *p
cifs_remap(cifs_sb), path, ref, tl);
}
-/* Return DFS full path out of a dentry set for automount */
-static inline char *dfs_get_automount_devname(struct dentry *dentry, void *page)
-{
- struct cifs_sb_info *cifs_sb = CIFS_SB(dentry->d_sb);
- struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
- size_t len;
- char *s;
-
- spin_lock(&tcon->tc_lock);
- if (unlikely(!tcon->origin_fullpath)) {
- spin_unlock(&tcon->tc_lock);
- return ERR_PTR(-EREMOTE);
- }
- spin_unlock(&tcon->tc_lock);
-
- s = dentry_path_raw(dentry, page, PATH_MAX);
- if (IS_ERR(s))
- return s;
- /* for root, we want "" */
- if (!s[1])
- s++;
-
- spin_lock(&tcon->tc_lock);
- len = strlen(tcon->origin_fullpath);
- if (s < (char *)page + len) {
- spin_unlock(&tcon->tc_lock);
- return ERR_PTR(-ENAMETOOLONG);
- }
-
- s -= len;
- memcpy(s, tcon->origin_fullpath, len);
- spin_unlock(&tcon->tc_lock);
- convert_delimiter(s, '/');
-
- return s;
-}
-
static inline void dfs_put_root_smb_sessions(struct list_head *head)
{
struct dfs_root_ses *root, *tmp;
diff --git a/fs/smb/client/dfs_cache.c b/fs/smb/client/dfs_cache.c
index 33adf43a01f1..508d831fabe3 100644
--- a/fs/smb/client/dfs_cache.c
+++ b/fs/smb/client/dfs_cache.c
@@ -29,8 +29,6 @@
#define CACHE_MIN_TTL 120 /* 2 minutes */
#define CACHE_DEFAULT_TTL 300 /* 5 minutes */
-#define IS_DFS_INTERLINK(v) (((v) & DFSREF_REFERRAL_SERVER) && !((v) & DFSREF_STORAGE_SERVER))
-
struct cache_dfs_tgt {
char *name;
int path_consumed;
@@ -174,7 +172,7 @@ static int dfscache_proc_show(struct seq_file *m, void *v)
"cache entry: path=%s,type=%s,ttl=%d,etime=%ld,hdr_flags=0x%x,ref_flags=0x%x,interlink=%s,path_consumed=%d,expired=%s\n",
ce->path, ce->srvtype == DFS_TYPE_ROOT ? "root" : "link",
ce->ttl, ce->etime.tv_nsec, ce->hdr_flags, ce->ref_flags,
- IS_DFS_INTERLINK(ce->hdr_flags) ? "yes" : "no",
+ DFS_INTERLINK(ce->hdr_flags) ? "yes" : "no",
ce->path_consumed, cache_entry_expired(ce) ? "yes" : "no");
list_for_each_entry(t, &ce->tlist, list) {
@@ -243,7 +241,7 @@ static inline void dump_ce(const struct cache_entry *ce)
ce->srvtype == DFS_TYPE_ROOT ? "root" : "link", ce->ttl,
ce->etime.tv_nsec,
ce->hdr_flags, ce->ref_flags,
- IS_DFS_INTERLINK(ce->hdr_flags) ? "yes" : "no",
+ DFS_INTERLINK(ce->hdr_flags) ? "yes" : "no",
ce->path_consumed,
cache_entry_expired(ce) ? "yes" : "no");
dump_tgts(ce);
@@ -1177,9 +1175,9 @@ static bool is_ses_good(struct cifs_ses *ses)
/* Refresh dfs referral of tcon and mark it for reconnect if needed */
static int __refresh_tcon(const char *path, struct cifs_ses *ses, bool force_refresh)
{
- struct dfs_cache_tgt_list old_tl = DFS_CACHE_TGT_LIST_INIT(old_tl);
- struct dfs_cache_tgt_list new_tl = DFS_CACHE_TGT_LIST_INIT(new_tl);
struct TCP_Server_Info *server = ses->server;
+ DFS_CACHE_TGT_LIST(old_tl);
+ DFS_CACHE_TGT_LIST(new_tl);
bool needs_refresh = false;
struct cache_entry *ce;
unsigned int xid;
diff --git a/fs/smb/client/dfs_cache.h b/fs/smb/client/dfs_cache.h
index c6d89cd6d4fd..18a08a2ca93b 100644
--- a/fs/smb/client/dfs_cache.h
+++ b/fs/smb/client/dfs_cache.h
@@ -16,7 +16,11 @@
extern struct workqueue_struct *dfscache_wq;
extern atomic_t dfs_cache_ttl;
-#define DFS_CACHE_TGT_LIST_INIT(var) { .tl_numtgts = 0, .tl_list = LIST_HEAD_INIT((var).tl_list), }
+#define DFS_CACHE_TGT_LIST_INIT(var) \
+ { .tl_numtgts = 0, .tl_list = LIST_HEAD_INIT((var).tl_list), }
+
+#define DFS_CACHE_TGT_LIST(var) \
+ struct dfs_cache_tgt_list var = DFS_CACHE_TGT_LIST_INIT(var)
struct dfs_cache_tgt_list {
int tl_numtgts;
@@ -51,8 +55,8 @@ static inline struct dfs_cache_tgt_iterator *
dfs_cache_get_next_tgt(struct dfs_cache_tgt_list *tl,
struct dfs_cache_tgt_iterator *it)
{
- if (!tl || list_empty(&tl->tl_list) || !it ||
- list_is_last(&it->it_list, &tl->tl_list))
+ if (!tl || !tl->tl_numtgts || list_empty(&tl->tl_list) ||
+ !it || list_is_last(&it->it_list, &tl->tl_list))
return NULL;
return list_next_entry(it, it_list);
}
@@ -71,7 +75,7 @@ static inline void dfs_cache_free_tgts(struct dfs_cache_tgt_list *tl)
{
struct dfs_cache_tgt_iterator *it, *nit;
- if (!tl || list_empty(&tl->tl_list))
+ if (!tl || !tl->tl_numtgts || list_empty(&tl->tl_list))
return;
list_for_each_entry_safe(it, nit, &tl->tl_list, it_list) {
list_del(&it->it_list);
diff --git a/fs/smb/client/dir.c b/fs/smb/client/dir.c
index 30b1e1bfd204..580a27a3a7e6 100644
--- a/fs/smb/client/dir.c
+++ b/fs/smb/client/dir.c
@@ -797,7 +797,7 @@ cifs_d_revalidate(struct dentry *direntry, unsigned int flags)
const struct dentry_operations cifs_dentry_ops = {
.d_revalidate = cifs_d_revalidate,
- .d_automount = cifs_dfs_d_automount,
+ .d_automount = cifs_d_automount,
/* d_delete: cifs_d_delete, */ /* not needed except for debugging */
};
@@ -872,5 +872,5 @@ const struct dentry_operations cifs_ci_dentry_ops = {
.d_revalidate = cifs_d_revalidate,
.d_hash = cifs_ci_hash,
.d_compare = cifs_ci_compare,
- .d_automount = cifs_dfs_d_automount,
+ .d_automount = cifs_d_automount,
};
diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c
index 6bc44f79d2e9..2108b3b40ce9 100644
--- a/fs/smb/client/file.c
+++ b/fs/smb/client/file.c
@@ -1085,7 +1085,7 @@ int cifs_close(struct inode *inode, struct file *file)
!test_bit(CIFS_INO_CLOSE_ON_LOCK, &cinode->flags) &&
dclose) {
if (test_and_clear_bit(CIFS_INO_MODIFIED_ATTR, &cinode->flags)) {
- inode->i_ctime = inode->i_mtime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
}
spin_lock(&cinode->deferred_lock);
cifs_add_deferred_close(cfile, dclose);
@@ -2596,7 +2596,7 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
write_data, to - from, &offset);
cifsFileInfo_put(open_file);
/* Does mm or vfs already set times? */
- inode->i_atime = inode->i_mtime = current_time(inode);
+ inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
if ((bytes_written > 0) && (offset))
rc = 0;
else if (bytes_written < 0)
diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c
index 67e16c2ac90e..a3493da12ad1 100644
--- a/fs/smb/client/fs_context.c
+++ b/fs/smb/client/fs_context.c
@@ -150,6 +150,7 @@ const struct fs_parameter_spec smb3_fs_parameters[] = {
fsparam_u32("closetimeo", Opt_closetimeo),
fsparam_u32("echo_interval", Opt_echo_interval),
fsparam_u32("max_credits", Opt_max_credits),
+ fsparam_u32("max_cached_dirs", Opt_max_cached_dirs),
fsparam_u32("handletimeout", Opt_handletimeout),
fsparam_u64("snapshot", Opt_snapshot),
fsparam_u32("max_channels", Opt_max_channels),
@@ -1165,6 +1166,14 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
if (result.uint_32 > 1)
ctx->multichannel = true;
break;
+ case Opt_max_cached_dirs:
+ if (result.uint_32 < 1) {
+ cifs_errorf(fc, "%s: Invalid max_cached_dirs, needs to be 1 or more\n",
+ __func__);
+ goto cifs_parse_mount_err;
+ }
+ ctx->max_cached_dirs = result.uint_32;
+ break;
case Opt_handletimeout:
ctx->handle_timeout = result.uint_32;
if (ctx->handle_timeout > SMB3_MAX_HANDLE_TIMEOUT) {
@@ -1532,6 +1541,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
cifs_parse_mount_err:
kfree_sensitive(ctx->password);
+ ctx->password = NULL;
return -EINVAL;
}
@@ -1592,7 +1602,7 @@ int smb3_init_fs_context(struct fs_context *fc)
ctx->acregmax = CIFS_DEF_ACTIMEO;
ctx->acdirmax = CIFS_DEF_ACTIMEO;
ctx->closetimeo = SMB3_DEF_DCLOSETIMEO;
-
+ ctx->max_cached_dirs = MAX_CACHED_FIDS;
/* Most clients set timeout to 0, allows server to use its default */
ctx->handle_timeout = 0; /* See MS-SMB2 spec section 2.2.14.2.12 */
diff --git a/fs/smb/client/fs_context.h b/fs/smb/client/fs_context.h
index f4eaf8558902..9d8d34af0211 100644
--- a/fs/smb/client/fs_context.h
+++ b/fs/smb/client/fs_context.h
@@ -128,6 +128,7 @@ enum cifs_param {
Opt_closetimeo,
Opt_echo_interval,
Opt_max_credits,
+ Opt_max_cached_dirs,
Opt_snapshot,
Opt_max_channels,
Opt_handletimeout,
@@ -261,6 +262,7 @@ struct smb3_fs_context {
__u32 handle_timeout; /* persistent and durable handle timeout in ms */
unsigned int max_credits; /* smb3 max_credits 10 < credits < 60000 */
unsigned int max_channels;
+ unsigned int max_cached_dirs;
__u16 compression; /* compression algorithm 0xFFFF default 0=disabled */
bool rootfs:1; /* if it's a SMB root file system */
bool witness:1; /* use witness protocol */
@@ -287,7 +289,7 @@ extern void smb3_update_mnt_flags(struct cifs_sb_info *cifs_sb);
*/
#define SMB3_MAX_DCLOSETIMEO (1 << 30)
#define SMB3_DEF_DCLOSETIMEO (1 * HZ) /* even 1 sec enough to help eg open/write/close/open/read */
-
+#define MAX_CACHED_FIDS 16
extern char *cifs_sanitize_prepath(char *prepath, gfp_t gfp);
#endif
diff --git a/fs/smb/client/fscache.c b/fs/smb/client/fscache.c
index 8f6909d633da..e5cad149f5a2 100644
--- a/fs/smb/client/fscache.c
+++ b/fs/smb/client/fscache.c
@@ -48,7 +48,7 @@ int cifs_fscache_get_super_cookie(struct cifs_tcon *tcon)
sharename = extract_sharename(tcon->tree_name);
if (IS_ERR(sharename)) {
cifs_dbg(FYI, "%s: couldn't extract sharename\n", __func__);
- return -EINVAL;
+ return PTR_ERR(sharename);
}
slen = strlen(sharename);
@@ -108,6 +108,8 @@ void cifs_fscache_get_inode_cookie(struct inode *inode)
&cifsi->uniqueid, sizeof(cifsi->uniqueid),
&cd, sizeof(cd),
i_size_read(&cifsi->netfs.inode));
+ if (cifsi->netfs.cache)
+ mapping_set_release_always(inode->i_mapping);
}
void cifs_fscache_unuse_inode_cookie(struct inode *inode, bool update)
diff --git a/fs/smb/client/fscache.h b/fs/smb/client/fscache.h
index 173999610997..84f3b09367d2 100644
--- a/fs/smb/client/fscache.h
+++ b/fs/smb/client/fscache.h
@@ -50,12 +50,13 @@ void cifs_fscache_fill_coherency(struct inode *inode,
struct cifs_fscache_inode_coherency_data *cd)
{
struct cifsInodeInfo *cifsi = CIFS_I(inode);
+ struct timespec64 ctime = inode_get_ctime(inode);
memset(cd, 0, sizeof(*cd));
cd->last_write_time_sec = cpu_to_le64(cifsi->netfs.inode.i_mtime.tv_sec);
cd->last_write_time_nsec = cpu_to_le32(cifsi->netfs.inode.i_mtime.tv_nsec);
- cd->last_change_time_sec = cpu_to_le64(cifsi->netfs.inode.i_ctime.tv_sec);
- cd->last_change_time_nsec = cpu_to_le32(cifsi->netfs.inode.i_ctime.tv_nsec);
+ cd->last_change_time_sec = cpu_to_le64(ctime.tv_sec);
+ cd->last_change_time_nsec = cpu_to_le32(ctime.tv_nsec);
}
diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c
index c3eeae07e139..d7c302442c1e 100644
--- a/fs/smb/client/inode.c
+++ b/fs/smb/client/inode.c
@@ -58,13 +58,9 @@ static void cifs_set_ops(struct inode *inode)
inode->i_data.a_ops = &cifs_addr_ops;
break;
case S_IFDIR:
-#ifdef CONFIG_CIFS_DFS_UPCALL
if (IS_AUTOMOUNT(inode)) {
- inode->i_op = &cifs_dfs_referral_inode_operations;
+ inode->i_op = &cifs_namespace_inode_operations;
} else {
-#else /* NO DFS support, treat as a directory */
- {
-#endif
inode->i_op = &cifs_dir_inode_ops;
inode->i_fop = &cifs_dir_ops;
}
@@ -172,7 +168,7 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
else
inode->i_atime = fattr->cf_atime;
inode->i_mtime = fattr->cf_mtime;
- inode->i_ctime = fattr->cf_ctime;
+ inode_set_ctime_to_ts(inode, fattr->cf_ctime);
inode->i_rdev = fattr->cf_rdev;
cifs_nlink_fattr_to_inode(inode, fattr);
inode->i_uid = fattr->cf_uid;
@@ -218,7 +214,7 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
}
spin_unlock(&inode->i_lock);
- if (fattr->cf_flags & CIFS_FATTR_DFS_REFERRAL)
+ if (fattr->cf_flags & CIFS_FATTR_JUNCTION)
inode->i_flags |= S_AUTOMOUNT;
if (inode->i_state & I_NEW)
cifs_set_ops(inode);
@@ -327,14 +323,14 @@ cifs_unix_basic_to_fattr(struct cifs_fattr *fattr, FILE_UNIX_BASIC_INFO *info,
*
* Needed to setup cifs_fattr data for the directory which is the
* junction to the new submount (ie to setup the fake directory
- * which represents a DFS referral).
+ * which represents a DFS referral or reparse mount point).
*/
-static void
-cifs_create_dfs_fattr(struct cifs_fattr *fattr, struct super_block *sb)
+static void cifs_create_junction_fattr(struct cifs_fattr *fattr,
+ struct super_block *sb)
{
struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
- cifs_dbg(FYI, "creating fake fattr for DFS referral\n");
+ cifs_dbg(FYI, "%s: creating fake fattr\n", __func__);
memset(fattr, 0, sizeof(*fattr));
fattr->cf_mode = S_IFDIR | S_IXUGO | S_IRWXU;
@@ -343,7 +339,33 @@ cifs_create_dfs_fattr(struct cifs_fattr *fattr, struct super_block *sb)
ktime_get_coarse_real_ts64(&fattr->cf_mtime);
fattr->cf_atime = fattr->cf_ctime = fattr->cf_mtime;
fattr->cf_nlink = 2;
- fattr->cf_flags = CIFS_FATTR_DFS_REFERRAL;
+ fattr->cf_flags = CIFS_FATTR_JUNCTION;
+}
+
+/* Update inode with final fattr data */
+static int update_inode_info(struct super_block *sb,
+ struct cifs_fattr *fattr,
+ struct inode **inode)
+{
+ struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
+ int rc = 0;
+
+ if (!*inode) {
+ *inode = cifs_iget(sb, fattr);
+ if (!*inode)
+ rc = -ENOMEM;
+ return rc;
+ }
+ /* We already have inode, update it.
+ *
+ * If file type or uniqueid is different, return error.
+ */
+ if (unlikely((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) &&
+ CIFS_I(*inode)->uniqueid != fattr->cf_uniqueid)) {
+ CIFS_I(*inode)->time = 0; /* force reval */
+ return -ESTALE;
+ }
+ return cifs_fattr_to_inode(*inode, fattr);
}
#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
@@ -373,7 +395,7 @@ cifs_get_file_info_unix(struct file *filp)
if (!rc) {
cifs_unix_basic_to_fattr(&fattr, &find_data, cifs_sb);
} else if (rc == -EREMOTE) {
- cifs_create_dfs_fattr(&fattr, inode->i_sb);
+ cifs_create_junction_fattr(&fattr, inode->i_sb);
rc = 0;
} else
goto cifs_gfiunix_out;
@@ -385,17 +407,18 @@ cifs_gfiunix_out:
return rc;
}
-int cifs_get_inode_info_unix(struct inode **pinode,
- const unsigned char *full_path,
- struct super_block *sb, unsigned int xid)
+static int cifs_get_unix_fattr(const unsigned char *full_path,
+ struct super_block *sb,
+ struct cifs_fattr *fattr,
+ struct inode **pinode,
+ const unsigned int xid)
{
- int rc;
+ struct TCP_Server_Info *server;
+ struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
FILE_UNIX_BASIC_INFO find_data;
- struct cifs_fattr fattr;
struct cifs_tcon *tcon;
- struct TCP_Server_Info *server;
struct tcon_link *tlink;
- struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
+ int rc, tmprc;
cifs_dbg(FYI, "Getting info on %s\n", full_path);
@@ -412,59 +435,61 @@ int cifs_get_inode_info_unix(struct inode **pinode,
cifs_put_tlink(tlink);
if (!rc) {
- cifs_unix_basic_to_fattr(&fattr, &find_data, cifs_sb);
+ cifs_unix_basic_to_fattr(fattr, &find_data, cifs_sb);
} else if (rc == -EREMOTE) {
- cifs_create_dfs_fattr(&fattr, sb);
+ cifs_create_junction_fattr(fattr, sb);
rc = 0;
} else {
return rc;
}
+ if (!*pinode)
+ cifs_fill_uniqueid(sb, fattr);
+
/* check for Minshall+French symlinks */
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) {
- int tmprc = check_mf_symlink(xid, tcon, cifs_sb, &fattr,
- full_path);
- if (tmprc)
- cifs_dbg(FYI, "check_mf_symlink: %d\n", tmprc);
+ tmprc = check_mf_symlink(xid, tcon, cifs_sb, fattr, full_path);
+ cifs_dbg(FYI, "check_mf_symlink: %d\n", tmprc);
}
- if (S_ISLNK(fattr.cf_mode) && !fattr.cf_symlink_target) {
+ if (S_ISLNK(fattr->cf_mode) && !fattr->cf_symlink_target) {
if (!server->ops->query_symlink)
return -EOPNOTSUPP;
- rc = server->ops->query_symlink(xid, tcon, cifs_sb, full_path,
- &fattr.cf_symlink_target, false);
- if (rc) {
- cifs_dbg(FYI, "%s: query_symlink: %d\n", __func__, rc);
- goto cgiiu_exit;
- }
+ rc = server->ops->query_symlink(xid, tcon,
+ cifs_sb, full_path,
+ &fattr->cf_symlink_target,
+ NULL);
+ cifs_dbg(FYI, "%s: query_symlink: %d\n", __func__, rc);
}
+ return rc;
+}
- if (*pinode == NULL) {
- /* get new inode */
- cifs_fill_uniqueid(sb, &fattr);
- *pinode = cifs_iget(sb, &fattr);
- if (!*pinode)
- rc = -ENOMEM;
- } else {
- /* we already have inode, update it */
-
- /* if uniqueid is different, return error */
- if (unlikely(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM &&
- CIFS_I(*pinode)->uniqueid != fattr.cf_uniqueid)) {
- CIFS_I(*pinode)->time = 0; /* force reval */
- rc = -ESTALE;
- goto cgiiu_exit;
- }
+int cifs_get_inode_info_unix(struct inode **pinode,
+ const unsigned char *full_path,
+ struct super_block *sb, unsigned int xid)
+{
+ struct cifs_fattr fattr = {};
+ int rc;
- /* if filetype is different, return error */
- rc = cifs_fattr_to_inode(*pinode, &fattr);
- }
+ rc = cifs_get_unix_fattr(full_path, sb, &fattr, pinode, xid);
+ if (rc)
+ goto out;
-cgiiu_exit:
+ rc = update_inode_info(sb, &fattr, pinode);
+out:
kfree(fattr.cf_symlink_target);
return rc;
}
#else
+static inline int cifs_get_unix_fattr(const unsigned char *full_path,
+ struct super_block *sb,
+ struct cifs_fattr *fattr,
+ struct inode **pinode,
+ const unsigned int xid)
+{
+ return -EOPNOTSUPP;
+}
+
int cifs_get_inode_info_unix(struct inode **pinode,
const unsigned char *full_path,
struct super_block *sb, unsigned int xid)
@@ -632,10 +657,11 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path,
}
/* Fill a cifs_fattr struct with info from POSIX info struct */
-static void smb311_posix_info_to_fattr(struct cifs_fattr *fattr, struct cifs_open_info_data *data,
+static void smb311_posix_info_to_fattr(struct cifs_fattr *fattr,
+ struct cifs_open_info_data *data,
struct cifs_sid *owner,
struct cifs_sid *group,
- struct super_block *sb, bool adjust_tz, bool symlink)
+ struct super_block *sb)
{
struct smb311_posix_qinfo *info = &data->posix_fi;
struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
@@ -655,7 +681,7 @@ static void smb311_posix_info_to_fattr(struct cifs_fattr *fattr, struct cifs_ope
fattr->cf_ctime = cifs_NTtimeToUnix(info->ChangeTime);
fattr->cf_mtime = cifs_NTtimeToUnix(info->LastWriteTime);
- if (adjust_tz) {
+ if (data->adjust_tz) {
fattr->cf_ctime.tv_sec += tcon->ses->server->timeAdj;
fattr->cf_mtime.tv_sec += tcon->ses->server->timeAdj;
}
@@ -669,7 +695,7 @@ static void smb311_posix_info_to_fattr(struct cifs_fattr *fattr, struct cifs_ope
/* The srv fs device id is overridden on network mount so setting rdev isn't needed here */
/* fattr->cf_rdev = le32_to_cpu(info->DeviceId); */
- if (symlink) {
+ if (data->symlink) {
fattr->cf_mode |= S_IFLNK;
fattr->cf_dtype = DT_LNK;
fattr->cf_symlink_target = data->symlink_target;
@@ -690,9 +716,46 @@ static void smb311_posix_info_to_fattr(struct cifs_fattr *fattr, struct cifs_ope
fattr->cf_mode, fattr->cf_uniqueid, fattr->cf_nlink);
}
-static void cifs_open_info_to_fattr(struct cifs_fattr *fattr, struct cifs_open_info_data *data,
- struct super_block *sb, bool adjust_tz, bool symlink,
- u32 reparse_tag)
+bool cifs_reparse_point_to_fattr(struct cifs_sb_info *cifs_sb,
+ struct cifs_fattr *fattr,
+ u32 tag)
+{
+ switch (tag) {
+ case IO_REPARSE_TAG_LX_SYMLINK:
+ fattr->cf_mode |= S_IFLNK | cifs_sb->ctx->file_mode;
+ fattr->cf_dtype = DT_LNK;
+ break;
+ case IO_REPARSE_TAG_LX_FIFO:
+ fattr->cf_mode |= S_IFIFO | cifs_sb->ctx->file_mode;
+ fattr->cf_dtype = DT_FIFO;
+ break;
+ case IO_REPARSE_TAG_AF_UNIX:
+ fattr->cf_mode |= S_IFSOCK | cifs_sb->ctx->file_mode;
+ fattr->cf_dtype = DT_SOCK;
+ break;
+ case IO_REPARSE_TAG_LX_CHR:
+ fattr->cf_mode |= S_IFCHR | cifs_sb->ctx->file_mode;
+ fattr->cf_dtype = DT_CHR;
+ break;
+ case IO_REPARSE_TAG_LX_BLK:
+ fattr->cf_mode |= S_IFBLK | cifs_sb->ctx->file_mode;
+ fattr->cf_dtype = DT_BLK;
+ break;
+ case 0: /* SMB1 symlink */
+ case IO_REPARSE_TAG_SYMLINK:
+ case IO_REPARSE_TAG_NFS:
+ fattr->cf_mode = S_IFLNK;
+ fattr->cf_dtype = DT_LNK;
+ break;
+ default:
+ return false;
+ }
+ return true;
+}
+
+static void cifs_open_info_to_fattr(struct cifs_fattr *fattr,
+ struct cifs_open_info_data *data,
+ struct super_block *sb)
{
struct smb2_file_all_info *info = &data->fi;
struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
@@ -711,7 +774,7 @@ static void cifs_open_info_to_fattr(struct cifs_fattr *fattr, struct cifs_open_i
fattr->cf_ctime = cifs_NTtimeToUnix(info->ChangeTime);
fattr->cf_mtime = cifs_NTtimeToUnix(info->LastWriteTime);
- if (adjust_tz) {
+ if (data->adjust_tz) {
fattr->cf_ctime.tv_sec += tcon->ses->server->timeAdj;
fattr->cf_mtime.tv_sec += tcon->ses->server->timeAdj;
}
@@ -719,28 +782,13 @@ static void cifs_open_info_to_fattr(struct cifs_fattr *fattr, struct cifs_open_i
fattr->cf_eof = le64_to_cpu(info->EndOfFile);
fattr->cf_bytes = le64_to_cpu(info->AllocationSize);
fattr->cf_createtime = le64_to_cpu(info->CreationTime);
-
fattr->cf_nlink = le32_to_cpu(info->NumberOfLinks);
- if (reparse_tag == IO_REPARSE_TAG_LX_SYMLINK) {
- fattr->cf_mode |= S_IFLNK | cifs_sb->ctx->file_mode;
- fattr->cf_dtype = DT_LNK;
- } else if (reparse_tag == IO_REPARSE_TAG_LX_FIFO) {
- fattr->cf_mode |= S_IFIFO | cifs_sb->ctx->file_mode;
- fattr->cf_dtype = DT_FIFO;
- } else if (reparse_tag == IO_REPARSE_TAG_AF_UNIX) {
- fattr->cf_mode |= S_IFSOCK | cifs_sb->ctx->file_mode;
- fattr->cf_dtype = DT_SOCK;
- } else if (reparse_tag == IO_REPARSE_TAG_LX_CHR) {
- fattr->cf_mode |= S_IFCHR | cifs_sb->ctx->file_mode;
- fattr->cf_dtype = DT_CHR;
- } else if (reparse_tag == IO_REPARSE_TAG_LX_BLK) {
- fattr->cf_mode |= S_IFBLK | cifs_sb->ctx->file_mode;
- fattr->cf_dtype = DT_BLK;
- } else if (symlink || reparse_tag == IO_REPARSE_TAG_SYMLINK ||
- reparse_tag == IO_REPARSE_TAG_NFS) {
- fattr->cf_mode = S_IFLNK;
- fattr->cf_dtype = DT_LNK;
- } else if (fattr->cf_cifsattrs & ATTR_DIRECTORY) {
+
+ if (cifs_open_data_reparse(data) &&
+ cifs_reparse_point_to_fattr(cifs_sb, fattr, data->reparse_tag))
+ goto out_reparse;
+
+ if (fattr->cf_cifsattrs & ATTR_DIRECTORY) {
fattr->cf_mode = S_IFDIR | cifs_sb->ctx->dir_mode;
fattr->cf_dtype = DT_DIR;
/*
@@ -769,6 +817,7 @@ static void cifs_open_info_to_fattr(struct cifs_fattr *fattr, struct cifs_open_i
}
}
+out_reparse:
if (S_ISLNK(fattr->cf_mode)) {
fattr->cf_symlink_target = data->symlink_target;
data->symlink_target = NULL;
@@ -789,8 +838,6 @@ cifs_get_file_info(struct file *filp)
struct cifsFileInfo *cfile = filp->private_data;
struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
struct TCP_Server_Info *server = tcon->ses->server;
- bool symlink = false;
- u32 tag = 0;
if (!server->ops->query_file_info)
return -ENOSYS;
@@ -800,14 +847,15 @@ cifs_get_file_info(struct file *filp)
switch (rc) {
case 0:
/* TODO: add support to query reparse tag */
+ data.adjust_tz = false;
if (data.symlink_target) {
- symlink = true;
- tag = IO_REPARSE_TAG_SYMLINK;
+ data.symlink = true;
+ data.reparse_tag = IO_REPARSE_TAG_SYMLINK;
}
- cifs_open_info_to_fattr(&fattr, &data, inode->i_sb, false, symlink, tag);
+ cifs_open_info_to_fattr(&fattr, &data, inode->i_sb);
break;
case -EREMOTE:
- cifs_create_dfs_fattr(&fattr, inode->i_sb);
+ cifs_create_junction_fattr(&fattr, inode->i_sb);
rc = 0;
break;
case -EOPNOTSUPP:
@@ -960,22 +1008,66 @@ static inline bool is_inode_cache_good(struct inode *ino)
return ino && CIFS_CACHE_READ(CIFS_I(ino)) && CIFS_I(ino)->time != 0;
}
-int cifs_get_inode_info(struct inode **inode, const char *full_path,
- struct cifs_open_info_data *data, struct super_block *sb, int xid,
- const struct cifs_fid *fid)
+static int reparse_info_to_fattr(struct cifs_open_info_data *data,
+ struct super_block *sb,
+ const unsigned int xid,
+ struct cifs_tcon *tcon,
+ const char *full_path,
+ struct cifs_fattr *fattr)
+{
+ struct TCP_Server_Info *server = tcon->ses->server;
+ struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
+ struct kvec rsp_iov, *iov = NULL;
+ int rsp_buftype = CIFS_NO_BUFFER;
+ u32 tag = data->reparse_tag;
+ int rc = 0;
+
+ if (!tag && server->ops->query_reparse_point) {
+ rc = server->ops->query_reparse_point(xid, tcon, cifs_sb,
+ full_path, &tag,
+ &rsp_iov, &rsp_buftype);
+ if (!rc)
+ iov = &rsp_iov;
+ }
+ switch ((data->reparse_tag = tag)) {
+ case 0: /* SMB1 symlink */
+ iov = NULL;
+ fallthrough;
+ case IO_REPARSE_TAG_NFS:
+ case IO_REPARSE_TAG_SYMLINK:
+ if (!data->symlink_target && server->ops->query_symlink) {
+ rc = server->ops->query_symlink(xid, tcon,
+ cifs_sb, full_path,
+ &data->symlink_target,
+ iov);
+ }
+ break;
+ case IO_REPARSE_TAG_MOUNT_POINT:
+ cifs_create_junction_fattr(fattr, sb);
+ goto out;
+ }
+
+ cifs_open_info_to_fattr(fattr, data, sb);
+out:
+ free_rsp_buf(rsp_buftype, rsp_iov.iov_base);
+ return rc;
+}
+
+static int cifs_get_fattr(struct cifs_open_info_data *data,
+ struct super_block *sb, int xid,
+ const struct cifs_fid *fid,
+ struct cifs_fattr *fattr,
+ struct inode **inode,
+ const char *full_path)
{
+ struct cifs_open_info_data tmp_data = {};
struct cifs_tcon *tcon;
struct TCP_Server_Info *server;
struct tcon_link *tlink;
struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
- bool adjust_tz = false;
- struct cifs_fattr fattr = {0};
- bool is_reparse_point = false;
- struct cifs_open_info_data tmp_data = {};
void *smb1_backup_rsp_buf = NULL;
int rc = 0;
int tmprc = 0;
- __u32 reparse_tag = 0;
tlink = cifs_sb_tlink(cifs_sb);
if (IS_ERR(tlink))
@@ -988,12 +1080,8 @@ int cifs_get_inode_info(struct inode **inode, const char *full_path,
*/
if (!data) {
- if (is_inode_cache_good(*inode)) {
- cifs_dbg(FYI, "No need to revalidate cached inode sizes\n");
- goto out;
- }
- rc = server->ops->query_path_info(xid, tcon, cifs_sb, full_path, &tmp_data,
- &adjust_tz, &is_reparse_point);
+ rc = server->ops->query_path_info(xid, tcon, cifs_sb,
+ full_path, &tmp_data);
data = &tmp_data;
}
@@ -1008,28 +1096,16 @@ int cifs_get_inode_info(struct inode **inode, const char *full_path,
* since we have to check if its reparse tag matches a known
* special file type e.g. symlink or fifo or char etc.
*/
- if (is_reparse_point && data->symlink_target) {
- reparse_tag = IO_REPARSE_TAG_SYMLINK;
- } else if ((le32_to_cpu(data->fi.Attributes) & ATTR_REPARSE) &&
- server->ops->query_reparse_tag) {
- tmprc = server->ops->query_reparse_tag(xid, tcon, cifs_sb, full_path,
- &reparse_tag);
- if (tmprc)
- cifs_dbg(FYI, "%s: query_reparse_tag: rc = %d\n", __func__, tmprc);
- if (server->ops->query_symlink) {
- tmprc = server->ops->query_symlink(xid, tcon, cifs_sb, full_path,
- &data->symlink_target,
- is_reparse_point);
- if (tmprc)
- cifs_dbg(FYI, "%s: query_symlink: rc = %d\n", __func__,
- tmprc);
- }
+ if (cifs_open_data_reparse(data)) {
+ rc = reparse_info_to_fattr(data, sb, xid, tcon,
+ full_path, fattr);
+ } else {
+ cifs_open_info_to_fattr(fattr, data, sb);
}
- cifs_open_info_to_fattr(&fattr, data, sb, adjust_tz, is_reparse_point, reparse_tag);
break;
case -EREMOTE:
/* DFS link, no metadata available on this server */
- cifs_create_dfs_fattr(&fattr, sb);
+ cifs_create_junction_fattr(fattr, sb);
rc = 0;
break;
case -EACCES:
@@ -1059,8 +1135,8 @@ int cifs_get_inode_info(struct inode **inode, const char *full_path,
fdi = (FILE_DIRECTORY_INFO *)fi;
si = (SEARCH_ID_FULL_DIR_INFO *)fi;
- cifs_dir_info_to_fattr(&fattr, fdi, cifs_sb);
- fattr.cf_uniqueid = le64_to_cpu(si->UniqueId);
+ cifs_dir_info_to_fattr(fattr, fdi, cifs_sb);
+ fattr->cf_uniqueid = le64_to_cpu(si->UniqueId);
/* uniqueid set, skip get inum step */
goto handle_mnt_opt;
} else {
@@ -1077,10 +1153,10 @@ int cifs_get_inode_info(struct inode **inode, const char *full_path,
}
/*
- * 3. Get or update inode number (fattr.cf_uniqueid)
+ * 3. Get or update inode number (fattr->cf_uniqueid)
*/
- cifs_set_fattr_ino(xid, tcon, sb, inode, full_path, data, &fattr);
+ cifs_set_fattr_ino(xid, tcon, sb, inode, full_path, data, fattr);
/*
* 4. Tweak fattr based on mount options
@@ -1089,17 +1165,17 @@ int cifs_get_inode_info(struct inode **inode, const char *full_path,
handle_mnt_opt:
#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
/* query for SFU type info if supported and needed */
- if (fattr.cf_cifsattrs & ATTR_SYSTEM &&
- cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) {
- tmprc = cifs_sfu_type(&fattr, full_path, cifs_sb, xid);
+ if ((fattr->cf_cifsattrs & ATTR_SYSTEM) &&
+ (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)) {
+ tmprc = cifs_sfu_type(fattr, full_path, cifs_sb, xid);
if (tmprc)
cifs_dbg(FYI, "cifs_sfu_type failed: %d\n", tmprc);
}
/* fill in 0777 bits from ACL */
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MODE_FROM_SID) {
- rc = cifs_acl_to_fattr(cifs_sb, &fattr, *inode, true,
- full_path, fid);
+ rc = cifs_acl_to_fattr(cifs_sb, fattr, *inode,
+ true, full_path, fid);
if (rc == -EREMOTE)
rc = 0;
if (rc) {
@@ -1108,8 +1184,8 @@ handle_mnt_opt:
goto out;
}
} else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
- rc = cifs_acl_to_fattr(cifs_sb, &fattr, *inode, false,
- full_path, fid);
+ rc = cifs_acl_to_fattr(cifs_sb, fattr, *inode,
+ false, full_path, fid);
if (rc == -EREMOTE)
rc = 0;
if (rc) {
@@ -1121,60 +1197,57 @@ handle_mnt_opt:
/* fill in remaining high mode bits e.g. SUID, VTX */
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)
- cifs_sfu_mode(&fattr, full_path, cifs_sb, xid);
+ cifs_sfu_mode(fattr, full_path, cifs_sb, xid);
/* check for Minshall+French symlinks */
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) {
- tmprc = check_mf_symlink(xid, tcon, cifs_sb, &fattr,
- full_path);
- if (tmprc)
- cifs_dbg(FYI, "check_mf_symlink: %d\n", tmprc);
+ tmprc = check_mf_symlink(xid, tcon, cifs_sb, fattr, full_path);
+ cifs_dbg(FYI, "check_mf_symlink: %d\n", tmprc);
}
- /*
- * 5. Update inode with final fattr data
- */
-
- if (!*inode) {
- *inode = cifs_iget(sb, &fattr);
- if (!*inode)
- rc = -ENOMEM;
- } else {
- /* we already have inode, update it */
-
- /* if uniqueid is different, return error */
- if (unlikely(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM &&
- CIFS_I(*inode)->uniqueid != fattr.cf_uniqueid)) {
- CIFS_I(*inode)->time = 0; /* force reval */
- rc = -ESTALE;
- goto out;
- }
- /* if filetype is different, return error */
- rc = cifs_fattr_to_inode(*inode, &fattr);
- }
out:
cifs_buf_release(smb1_backup_rsp_buf);
cifs_put_tlink(tlink);
cifs_free_open_info(&tmp_data);
+ return rc;
+}
+
+int cifs_get_inode_info(struct inode **inode,
+ const char *full_path,
+ struct cifs_open_info_data *data,
+ struct super_block *sb, int xid,
+ const struct cifs_fid *fid)
+{
+ struct cifs_fattr fattr = {};
+ int rc;
+
+ if (is_inode_cache_good(*inode)) {
+ cifs_dbg(FYI, "No need to revalidate cached inode sizes\n");
+ return 0;
+ }
+
+ rc = cifs_get_fattr(data, sb, xid, fid, &fattr, inode, full_path);
+ if (rc)
+ goto out;
+
+ rc = update_inode_info(sb, &fattr, inode);
+out:
kfree(fattr.cf_symlink_target);
return rc;
}
-int
-smb311_posix_get_inode_info(struct inode **inode,
- const char *full_path,
- struct super_block *sb, unsigned int xid)
+static int smb311_posix_get_fattr(struct cifs_fattr *fattr,
+ const char *full_path,
+ struct super_block *sb,
+ const unsigned int xid)
{
+ struct cifs_open_info_data data = {};
+ struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
struct cifs_tcon *tcon;
struct tcon_link *tlink;
- struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
- bool adjust_tz = false;
- struct cifs_fattr fattr = {0};
- bool symlink = false;
- struct cifs_open_info_data data = {};
struct cifs_sid owner, group;
- int rc = 0;
- int tmprc = 0;
+ int tmprc;
+ int rc;
tlink = cifs_sb_tlink(cifs_sb);
if (IS_ERR(tlink))
@@ -1185,14 +1258,9 @@ smb311_posix_get_inode_info(struct inode **inode,
* 1. Fetch file metadata
*/
- if (is_inode_cache_good(*inode)) {
- cifs_dbg(FYI, "No need to revalidate cached inode sizes\n");
- goto out;
- }
-
- rc = smb311_posix_query_path_info(xid, tcon, cifs_sb, full_path, &data,
- &owner, &group, &adjust_tz,
- &symlink);
+ rc = smb311_posix_query_path_info(xid, tcon, cifs_sb,
+ full_path, &data,
+ &owner, &group);
/*
* 2. Convert it to internal cifs metadata (fattr)
@@ -1200,12 +1268,11 @@ smb311_posix_get_inode_info(struct inode **inode,
switch (rc) {
case 0:
- smb311_posix_info_to_fattr(&fattr, &data, &owner, &group,
- sb, adjust_tz, symlink);
+ smb311_posix_info_to_fattr(fattr, &data, &owner, &group, sb);
break;
case -EREMOTE:
/* DFS link, no metadata available on this server */
- cifs_create_dfs_fattr(&fattr, sb);
+ cifs_create_junction_fattr(fattr, sb);
rc = 0;
break;
case -EACCES:
@@ -1221,49 +1288,42 @@ smb311_posix_get_inode_info(struct inode **inode,
goto out;
}
-
/*
* 3. Tweak fattr based on mount options
*/
-
/* check for Minshall+French symlinks */
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) {
- tmprc = check_mf_symlink(xid, tcon, cifs_sb, &fattr,
- full_path);
- if (tmprc)
- cifs_dbg(FYI, "check_mf_symlink: %d\n", tmprc);
+ tmprc = check_mf_symlink(xid, tcon, cifs_sb, fattr, full_path);
+ cifs_dbg(FYI, "check_mf_symlink: %d\n", tmprc);
}
- /*
- * 4. Update inode with final fattr data
- */
-
- if (!*inode) {
- *inode = cifs_iget(sb, &fattr);
- if (!*inode)
- rc = -ENOMEM;
- } else {
- /* we already have inode, update it */
+out:
+ cifs_put_tlink(tlink);
+ cifs_free_open_info(&data);
+ return rc;
+}
- /* if uniqueid is different, return error */
- if (unlikely(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM &&
- CIFS_I(*inode)->uniqueid != fattr.cf_uniqueid)) {
- CIFS_I(*inode)->time = 0; /* force reval */
- rc = -ESTALE;
- goto out;
- }
+int smb311_posix_get_inode_info(struct inode **inode, const char *full_path,
+ struct super_block *sb, const unsigned int xid)
+{
+ struct cifs_fattr fattr = {};
+ int rc;
- /* if filetype is different, return error */
- rc = cifs_fattr_to_inode(*inode, &fattr);
+ if (is_inode_cache_good(*inode)) {
+ cifs_dbg(FYI, "No need to revalidate cached inode sizes\n");
+ return 0;
}
+
+ rc = smb311_posix_get_fattr(&fattr, full_path, sb, xid);
+ if (rc)
+ goto out;
+
+ rc = update_inode_info(sb, &fattr, inode);
out:
- cifs_put_tlink(tlink);
- cifs_free_open_info(&data);
kfree(fattr.cf_symlink_target);
return rc;
}
-
static const struct inode_operations cifs_ipc_inode_ops = {
.lookup = cifs_lookup,
};
@@ -1367,13 +1427,14 @@ retry_iget5_locked:
/* gets root inode */
struct inode *cifs_root_iget(struct super_block *sb)
{
- unsigned int xid;
struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
- struct inode *inode = NULL;
- long rc;
+ struct cifs_fattr fattr = {};
struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
+ struct inode *inode = NULL;
+ unsigned int xid;
char *path = NULL;
int len;
+ int rc;
if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH)
&& cifs_sb->prepath) {
@@ -1391,21 +1452,29 @@ struct inode *cifs_root_iget(struct super_block *sb)
xid = get_xid();
if (tcon->unix_ext) {
- rc = cifs_get_inode_info_unix(&inode, path, sb, xid);
+ rc = cifs_get_unix_fattr(path, sb, &fattr, &inode, xid);
/* some servers mistakenly claim POSIX support */
if (rc != -EOPNOTSUPP)
- goto iget_no_retry;
+ goto iget_root;
cifs_dbg(VFS, "server does not support POSIX extensions\n");
tcon->unix_ext = false;
}
convert_delimiter(path, CIFS_DIR_SEP(cifs_sb));
if (tcon->posix_extensions)
- rc = smb311_posix_get_inode_info(&inode, path, sb, xid);
+ rc = smb311_posix_get_fattr(&fattr, path, sb, xid);
else
- rc = cifs_get_inode_info(&inode, path, NULL, sb, xid, NULL);
+ rc = cifs_get_fattr(NULL, sb, xid, NULL, &fattr, &inode, path);
+
+iget_root:
+ if (!rc) {
+ if (fattr.cf_flags & CIFS_FATTR_JUNCTION) {
+ fattr.cf_flags &= ~CIFS_FATTR_JUNCTION;
+ cifs_autodisable_serverino(cifs_sb);
+ }
+ inode = cifs_iget(sb, &fattr);
+ }
-iget_no_retry:
if (!inode) {
inode = ERR_PTR(rc);
goto out;
@@ -1429,6 +1498,7 @@ iget_no_retry:
out:
kfree(path);
free_xid(xid);
+ kfree(fattr.cf_symlink_target);
return inode;
}
@@ -1744,9 +1814,9 @@ out_reval:
cifs_inode = CIFS_I(inode);
cifs_inode->time = 0; /* will force revalidate to get info
when needed */
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
}
- dir->i_ctime = dir->i_mtime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
cifs_inode = CIFS_I(dir);
CIFS_I(dir)->time = 0; /* force revalidate of dir as well */
unlink_out:
@@ -2060,8 +2130,8 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
*/
cifsInode->time = 0;
- d_inode(direntry)->i_ctime = inode->i_ctime = inode->i_mtime =
- current_time(inode);
+ inode_set_ctime_current(d_inode(direntry));
+ inode->i_mtime = inode_set_ctime_current(inode);
rmdir_exit:
free_dentry_path(page);
@@ -2267,8 +2337,8 @@ unlink_target:
/* force revalidate to go get info when needed */
CIFS_I(source_dir)->time = CIFS_I(target_dir)->time = 0;
- source_dir->i_ctime = source_dir->i_mtime = target_dir->i_ctime =
- target_dir->i_mtime = current_time(source_dir);
+ source_dir->i_mtime = target_dir->i_mtime = inode_set_ctime_to_ts(source_dir,
+ inode_set_ctime_current(target_dir));
cifs_rename_exit:
kfree(info_buf_source);
@@ -2540,7 +2610,7 @@ int cifs_getattr(struct mnt_idmap *idmap, const struct path *path,
return rc;
}
- generic_fillattr(&nop_mnt_idmap, inode, stat);
+ generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
stat->blksize = cifs_sb->ctx->bsize;
stat->ino = CIFS_I(inode)->uniqueid;
@@ -2610,7 +2680,7 @@ int cifs_fiemap(struct inode *inode, struct fiemap_extent_info *fei, u64 start,
}
cifsFileInfo_put(cfile);
- return -ENOTSUPP;
+ return -EOPNOTSUPP;
}
int cifs_truncate_page(struct address_space *mapping, loff_t from)
diff --git a/fs/smb/client/misc.c b/fs/smb/client/misc.c
index d7e85d9a2655..35b176457bbe 100644
--- a/fs/smb/client/misc.c
+++ b/fs/smb/client/misc.c
@@ -113,18 +113,22 @@ sesInfoFree(struct cifs_ses *buf_to_free)
}
struct cifs_tcon *
-tconInfoAlloc(void)
+tcon_info_alloc(bool dir_leases_enabled)
{
struct cifs_tcon *ret_buf;
ret_buf = kzalloc(sizeof(*ret_buf), GFP_KERNEL);
if (!ret_buf)
return NULL;
- ret_buf->cfids = init_cached_dirs();
- if (!ret_buf->cfids) {
- kfree(ret_buf);
- return NULL;
+
+ if (dir_leases_enabled == true) {
+ ret_buf->cfids = init_cached_dirs();
+ if (!ret_buf->cfids) {
+ kfree(ret_buf);
+ return NULL;
+ }
}
+ /* else ret_buf->cfids is already set to NULL above */
atomic_inc(&tconInfoAllocCount);
ret_buf->status = TID_NEW;
@@ -476,7 +480,7 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv)
return false;
/* If server is a channel, select the primary channel */
- pserver = CIFS_SERVER_IS_CHAN(srv) ? srv->primary_server : srv;
+ pserver = SERVER_IS_CHAN(srv) ? srv->primary_server : srv;
/* look up tcon based on tid & uid */
spin_lock(&cifs_tcp_ses_lock);
diff --git a/fs/smb/client/cifs_dfs_ref.c b/fs/smb/client/namespace.c
index b1c2499b1c3b..c8f5ed8a69f1 100644
--- a/fs/smb/client/cifs_dfs_ref.c
+++ b/fs/smb/client/namespace.c
@@ -1,12 +1,12 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * Contains the CIFS DFS referral mounting routines used for handling
- * traversal via DFS junction point
+ * Contains mounting routines used for handling traversal via SMB junctions.
*
* Copyright (c) 2007 Igor Mammedov
* Copyright (C) International Business Machines Corp., 2008
* Author(s): Igor Mammedov (niallain@gmail.com)
* Steve French (sfrench@us.ibm.com)
+ * Copyright (c) 2023 Paulo Alcantara <palcantara@suse.de>
*/
#include <linux/dcache.h>
@@ -19,32 +19,31 @@
#include "cifsglob.h"
#include "cifsproto.h"
#include "cifsfs.h"
-#include "dns_resolve.h"
#include "cifs_debug.h"
-#include "dfs.h"
#include "fs_context.h"
-static LIST_HEAD(cifs_dfs_automount_list);
+static LIST_HEAD(cifs_automount_list);
-static void cifs_dfs_expire_automounts(struct work_struct *work);
-static DECLARE_DELAYED_WORK(cifs_dfs_automount_task,
- cifs_dfs_expire_automounts);
-static int cifs_dfs_mountpoint_expiry_timeout = 500 * HZ;
+static void cifs_expire_automounts(struct work_struct *work);
+static DECLARE_DELAYED_WORK(cifs_automount_task,
+ cifs_expire_automounts);
+static int cifs_mountpoint_expiry_timeout = 500 * HZ;
-static void cifs_dfs_expire_automounts(struct work_struct *work)
+static void cifs_expire_automounts(struct work_struct *work)
{
- struct list_head *list = &cifs_dfs_automount_list;
+ struct list_head *list = &cifs_automount_list;
mark_mounts_for_expiry(list);
if (!list_empty(list))
- schedule_delayed_work(&cifs_dfs_automount_task,
- cifs_dfs_mountpoint_expiry_timeout);
+ schedule_delayed_work(&cifs_automount_task,
+ cifs_mountpoint_expiry_timeout);
}
-void cifs_dfs_release_automount_timer(void)
+void cifs_release_automount_timer(void)
{
- BUG_ON(!list_empty(&cifs_dfs_automount_list));
- cancel_delayed_work_sync(&cifs_dfs_automount_task);
+ if (WARN_ON(!list_empty(&cifs_automount_list)))
+ return;
+ cancel_delayed_work_sync(&cifs_automount_task);
}
/**
@@ -118,26 +117,53 @@ cifs_build_devname(char *nodename, const char *prepath)
return dev;
}
-static int set_dest_addr(struct smb3_fs_context *ctx)
+/* Return full path out of a dentry set for automount */
+static char *automount_fullpath(struct dentry *dentry, void *page)
{
- struct sockaddr *addr = (struct sockaddr *)&ctx->dstaddr;
- int rc;
+ struct cifs_sb_info *cifs_sb = CIFS_SB(dentry->d_sb);
+ struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
+ size_t len;
+ char *s;
+
+ spin_lock(&tcon->tc_lock);
+ if (!tcon->origin_fullpath) {
+ spin_unlock(&tcon->tc_lock);
+ return build_path_from_dentry_optional_prefix(dentry,
+ page,
+ true);
+ }
+ spin_unlock(&tcon->tc_lock);
+
+ s = dentry_path_raw(dentry, page, PATH_MAX);
+ if (IS_ERR(s))
+ return s;
+ /* for root, we want "" */
+ if (!s[1])
+ s++;
+
+ spin_lock(&tcon->tc_lock);
+ len = strlen(tcon->origin_fullpath);
+ if (s < (char *)page + len) {
+ spin_unlock(&tcon->tc_lock);
+ return ERR_PTR(-ENAMETOOLONG);
+ }
+
+ s -= len;
+ memcpy(s, tcon->origin_fullpath, len);
+ spin_unlock(&tcon->tc_lock);
+ convert_delimiter(s, '/');
- rc = dns_resolve_server_name_to_ip(ctx->source, addr, NULL);
- if (!rc)
- cifs_set_port(addr, ctx->port);
- return rc;
+ return s;
}
/*
* Create a vfsmount that we can automount
*/
-static struct vfsmount *cifs_dfs_do_automount(struct path *path)
+static struct vfsmount *cifs_do_automount(struct path *path)
{
int rc;
struct dentry *mntpt = path->dentry;
struct fs_context *fc;
- struct cifs_sb_info *cifs_sb;
void *page = NULL;
struct smb3_fs_context *ctx, *cur_ctx;
struct smb3_fs_context tmp;
@@ -147,17 +173,7 @@ static struct vfsmount *cifs_dfs_do_automount(struct path *path)
if (IS_ROOT(mntpt))
return ERR_PTR(-ESTALE);
- /*
- * The MSDFS spec states that paths in DFS referral requests and
- * responses must be prefixed by a single '\' character instead of
- * the double backslashes usually used in the UNC. This function
- * gives us the latter, so we must adjust the result.
- */
- cifs_sb = CIFS_SB(mntpt->d_sb);
- if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_DFS)
- return ERR_PTR(-EREMOTE);
-
- cur_ctx = cifs_sb->ctx;
+ cur_ctx = CIFS_SB(mntpt->d_sb)->ctx;
fc = fs_context_for_submount(path->mnt->mnt_sb->s_type, mntpt);
if (IS_ERR(fc))
@@ -166,7 +182,7 @@ static struct vfsmount *cifs_dfs_do_automount(struct path *path)
ctx = smb3_fc2context(fc);
page = alloc_dentry_path();
- full_path = dfs_get_automount_devname(mntpt, page);
+ full_path = automount_fullpath(mntpt, page);
if (IS_ERR(full_path)) {
mnt = ERR_CAST(full_path);
goto out;
@@ -196,15 +212,10 @@ static struct vfsmount *cifs_dfs_do_automount(struct path *path)
ctx->source = NULL;
goto out;
}
- cifs_dbg(FYI, "%s: ctx: source=%s UNC=%s prepath=%s dstaddr=%pISpc\n",
- __func__, ctx->source, ctx->UNC, ctx->prepath, &ctx->dstaddr);
-
- rc = set_dest_addr(ctx);
- if (!rc)
- mnt = fc_mount(fc);
- else
- mnt = ERR_PTR(rc);
+ cifs_dbg(FYI, "%s: ctx: source=%s UNC=%s prepath=%s\n",
+ __func__, ctx->source, ctx->UNC, ctx->prepath);
+ mnt = fc_mount(fc);
out:
put_fs_context(fc);
free_dentry_path(page);
@@ -214,25 +225,25 @@ out:
/*
* Attempt to automount the referral
*/
-struct vfsmount *cifs_dfs_d_automount(struct path *path)
+struct vfsmount *cifs_d_automount(struct path *path)
{
struct vfsmount *newmnt;
cifs_dbg(FYI, "%s: %pd\n", __func__, path->dentry);
- newmnt = cifs_dfs_do_automount(path);
+ newmnt = cifs_do_automount(path);
if (IS_ERR(newmnt)) {
cifs_dbg(FYI, "leaving %s [automount failed]\n" , __func__);
return newmnt;
}
mntget(newmnt); /* prevent immediate expiration */
- mnt_set_expiry(newmnt, &cifs_dfs_automount_list);
- schedule_delayed_work(&cifs_dfs_automount_task,
- cifs_dfs_mountpoint_expiry_timeout);
+ mnt_set_expiry(newmnt, &cifs_automount_list);
+ schedule_delayed_work(&cifs_automount_task,
+ cifs_mountpoint_expiry_timeout);
cifs_dbg(FYI, "leaving %s [ok]\n" , __func__);
return newmnt;
}
-const struct inode_operations cifs_dfs_referral_inode_operations = {
+const struct inode_operations cifs_namespace_inode_operations = {
};
diff --git a/fs/smb/client/readdir.c b/fs/smb/client/readdir.c
index ef638086d734..47fc22de8d20 100644
--- a/fs/smb/client/readdir.c
+++ b/fs/smb/client/readdir.c
@@ -143,6 +143,7 @@ static bool reparse_file_needs_reval(const struct cifs_fattr *fattr)
case IO_REPARSE_TAG_DFSR:
case IO_REPARSE_TAG_SYMLINK:
case IO_REPARSE_TAG_NFS:
+ case IO_REPARSE_TAG_MOUNT_POINT:
case 0:
return true;
}
@@ -163,29 +164,19 @@ cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb)
* TODO: go through all documented reparse tags to see if we can
* reasonably map some of them to directories vs. files vs. symlinks
*/
+ if ((fattr->cf_cifsattrs & ATTR_REPARSE) &&
+ cifs_reparse_point_to_fattr(cifs_sb, fattr, fattr->cf_cifstag))
+ goto out_reparse;
+
if (fattr->cf_cifsattrs & ATTR_DIRECTORY) {
fattr->cf_mode = S_IFDIR | cifs_sb->ctx->dir_mode;
fattr->cf_dtype = DT_DIR;
- } else if (fattr->cf_cifstag == IO_REPARSE_TAG_LX_SYMLINK) {
- fattr->cf_mode |= S_IFLNK | cifs_sb->ctx->file_mode;
- fattr->cf_dtype = DT_LNK;
- } else if (fattr->cf_cifstag == IO_REPARSE_TAG_LX_FIFO) {
- fattr->cf_mode |= S_IFIFO | cifs_sb->ctx->file_mode;
- fattr->cf_dtype = DT_FIFO;
- } else if (fattr->cf_cifstag == IO_REPARSE_TAG_AF_UNIX) {
- fattr->cf_mode |= S_IFSOCK | cifs_sb->ctx->file_mode;
- fattr->cf_dtype = DT_SOCK;
- } else if (fattr->cf_cifstag == IO_REPARSE_TAG_LX_CHR) {
- fattr->cf_mode |= S_IFCHR | cifs_sb->ctx->file_mode;
- fattr->cf_dtype = DT_CHR;
- } else if (fattr->cf_cifstag == IO_REPARSE_TAG_LX_BLK) {
- fattr->cf_mode |= S_IFBLK | cifs_sb->ctx->file_mode;
- fattr->cf_dtype = DT_BLK;
- } else { /* TODO: should we mark some other reparse points (like DFSR) as directories? */
+ } else {
fattr->cf_mode = S_IFREG | cifs_sb->ctx->file_mode;
fattr->cf_dtype = DT_REG;
}
+out_reparse:
/*
* We need to revalidate it further to make a decision about whether it
* is a symbolic link, DFS referral or a reparse point with a direct
diff --git a/fs/smb/client/sess.c b/fs/smb/client/sess.c
index c57ca2050b73..79f26c560edf 100644
--- a/fs/smb/client/sess.c
+++ b/fs/smb/client/sess.c
@@ -323,12 +323,12 @@ cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server)
ses->chans[chan_index].iface = iface;
/* No iface is found. if secondary chan, drop connection */
- if (!iface && CIFS_SERVER_IS_CHAN(server))
+ if (!iface && SERVER_IS_CHAN(server))
ses->chans[chan_index].server = NULL;
spin_unlock(&ses->chan_lock);
- if (!iface && CIFS_SERVER_IS_CHAN(server))
+ if (!iface && SERVER_IS_CHAN(server))
cifs_put_tcp_session(server, false);
return rc;
@@ -360,11 +360,11 @@ cifs_ses_add_channel(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses,
{
struct TCP_Server_Info *chan_server;
struct cifs_chan *chan;
- struct smb3_fs_context ctx = {NULL};
+ struct smb3_fs_context *ctx;
static const char unc_fmt[] = "\\%s\\foo";
- char unc[sizeof(unc_fmt)+SERVER_NAME_LEN_WITH_NULL] = {0};
struct sockaddr_in *ipv4 = (struct sockaddr_in *)&iface->sockaddr;
struct sockaddr_in6 *ipv6 = (struct sockaddr_in6 *)&iface->sockaddr;
+ size_t len;
int rc;
unsigned int xid = get_xid();
@@ -388,54 +388,64 @@ cifs_ses_add_channel(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses,
* the session and server without caring about memory
* management.
*/
+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+ if (!ctx) {
+ rc = -ENOMEM;
+ goto out_free_xid;
+ }
/* Always make new connection for now (TODO?) */
- ctx.nosharesock = true;
+ ctx->nosharesock = true;
/* Auth */
- ctx.domainauto = ses->domainAuto;
- ctx.domainname = ses->domainName;
+ ctx->domainauto = ses->domainAuto;
+ ctx->domainname = ses->domainName;
/* no hostname for extra channels */
- ctx.server_hostname = "";
+ ctx->server_hostname = "";
- ctx.username = ses->user_name;
- ctx.password = ses->password;
- ctx.sectype = ses->sectype;
- ctx.sign = ses->sign;
+ ctx->username = ses->user_name;
+ ctx->password = ses->password;
+ ctx->sectype = ses->sectype;
+ ctx->sign = ses->sign;
/* UNC and paths */
/* XXX: Use ses->server->hostname? */
- sprintf(unc, unc_fmt, ses->ip_addr);
- ctx.UNC = unc;
- ctx.prepath = "";
+ len = sizeof(unc_fmt) + SERVER_NAME_LEN_WITH_NULL;
+ ctx->UNC = kzalloc(len, GFP_KERNEL);
+ if (!ctx->UNC) {
+ rc = -ENOMEM;
+ goto out_free_ctx;
+ }
+ scnprintf(ctx->UNC, len, unc_fmt, ses->ip_addr);
+ ctx->prepath = "";
/* Reuse same version as master connection */
- ctx.vals = ses->server->vals;
- ctx.ops = ses->server->ops;
+ ctx->vals = ses->server->vals;
+ ctx->ops = ses->server->ops;
- ctx.noblocksnd = ses->server->noblocksnd;
- ctx.noautotune = ses->server->noautotune;
- ctx.sockopt_tcp_nodelay = ses->server->tcp_nodelay;
- ctx.echo_interval = ses->server->echo_interval / HZ;
- ctx.max_credits = ses->server->max_credits;
+ ctx->noblocksnd = ses->server->noblocksnd;
+ ctx->noautotune = ses->server->noautotune;
+ ctx->sockopt_tcp_nodelay = ses->server->tcp_nodelay;
+ ctx->echo_interval = ses->server->echo_interval / HZ;
+ ctx->max_credits = ses->server->max_credits;
/*
* This will be used for encoding/decoding user/domain/pw
* during sess setup auth.
*/
- ctx.local_nls = cifs_sb->local_nls;
+ ctx->local_nls = cifs_sb->local_nls;
/* Use RDMA if possible */
- ctx.rdma = iface->rdma_capable;
- memcpy(&ctx.dstaddr, &iface->sockaddr, sizeof(struct sockaddr_storage));
+ ctx->rdma = iface->rdma_capable;
+ memcpy(&ctx->dstaddr, &iface->sockaddr, sizeof(ctx->dstaddr));
/* reuse master con client guid */
- memcpy(&ctx.client_guid, ses->server->client_guid,
- SMB2_CLIENT_GUID_SIZE);
- ctx.use_client_guid = true;
+ memcpy(&ctx->client_guid, ses->server->client_guid,
+ sizeof(ctx->client_guid));
+ ctx->use_client_guid = true;
- chan_server = cifs_get_tcp_session(&ctx, ses->server);
+ chan_server = cifs_get_tcp_session(ctx, ses->server);
spin_lock(&ses->chan_lock);
chan = &ses->chans[ses->chan_count];
@@ -497,6 +507,10 @@ out:
cifs_put_tcp_session(chan->server, 0);
}
+ kfree(ctx->UNC);
+out_free_ctx:
+ kfree(ctx);
+out_free_xid:
free_xid(xid);
return rc;
}
diff --git a/fs/smb/client/smb1ops.c b/fs/smb/client/smb1ops.c
index 7d1b3fc014d9..9bf8735cdd1e 100644
--- a/fs/smb/client/smb1ops.c
+++ b/fs/smb/client/smb1ops.c
@@ -542,14 +542,17 @@ cifs_is_path_accessible(const unsigned int xid, struct cifs_tcon *tcon,
return rc;
}
-static int cifs_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
- struct cifs_sb_info *cifs_sb, const char *full_path,
- struct cifs_open_info_data *data, bool *adjustTZ, bool *symlink)
+static int cifs_query_path_info(const unsigned int xid,
+ struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb,
+ const char *full_path,
+ struct cifs_open_info_data *data)
{
int rc;
FILE_ALL_INFO fi = {};
- *symlink = false;
+ data->symlink = false;
+ data->adjust_tz = false;
/* could do find first instead but this returns more info */
rc = CIFSSMBQPathInfo(xid, tcon, full_path, &fi, 0 /* not legacy */, cifs_sb->local_nls,
@@ -562,7 +565,7 @@ static int cifs_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
if ((rc == -EOPNOTSUPP) || (rc == -EINVAL)) {
rc = SMBQueryInformation(xid, tcon, full_path, &fi, cifs_sb->local_nls,
cifs_remap(cifs_sb));
- *adjustTZ = true;
+ data->adjust_tz = true;
}
if (!rc) {
@@ -589,7 +592,7 @@ static int cifs_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
/* Need to check if this is a symbolic link or not */
tmprc = CIFS_open(xid, &oparms, &oplock, NULL);
if (tmprc == -EOPNOTSUPP)
- *symlink = true;
+ data->symlink = true;
else if (tmprc == 0)
CIFSSMBClose(xid, tcon, fid.netfid);
}
@@ -969,13 +972,16 @@ cifs_unix_dfs_readlink(const unsigned int xid, struct cifs_tcon *tcon,
#endif
}
-static int
-cifs_query_symlink(const unsigned int xid, struct cifs_tcon *tcon,
- struct cifs_sb_info *cifs_sb, const char *full_path,
- char **target_path, bool is_reparse_point)
+static int cifs_query_symlink(const unsigned int xid,
+ struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb,
+ const char *full_path,
+ char **target_path,
+ struct kvec *rsp_iov)
{
int rc;
int oplock = 0;
+ bool is_reparse_point = !!rsp_iov;
struct cifs_fid fid;
struct cifs_open_parms oparms;
diff --git a/fs/smb/client/smb2inode.c b/fs/smb/client/smb2inode.c
index 8e696fbd72fa..0b89f7008ac0 100644
--- a/fs/smb/client/smb2inode.c
+++ b/fs/smb/client/smb2inode.c
@@ -35,34 +35,22 @@ free_set_inf_compound(struct smb_rqst *rqst)
SMB2_close_free(&rqst[2]);
}
-
-struct cop_vars {
- struct cifs_open_parms oparms;
- struct kvec rsp_iov[3];
- struct smb_rqst rqst[3];
- struct kvec open_iov[SMB2_CREATE_IOV_SIZE];
- struct kvec qi_iov[1];
- struct kvec si_iov[SMB2_SET_INFO_IOV_SIZE];
- struct kvec close_iov[1];
- struct smb2_file_rename_info rename_info;
- struct smb2_file_link_info link_info;
-};
-
/*
* note: If cfile is passed, the reference to it is dropped here.
* So make sure that you do not reuse cfile after return from this func.
*
- * If passing @err_iov and @err_buftype, ensure to make them both large enough (>= 3) to hold all
- * error responses. Caller is also responsible for freeing them up.
+ * If passing @out_iov and @out_buftype, ensure to make them both large enough
+ * (>= 3) to hold all compounded responses. Caller is also responsible for
+ * freeing them up with free_rsp_buf().
*/
static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
struct cifs_sb_info *cifs_sb, const char *full_path,
__u32 desired_access, __u32 create_disposition, __u32 create_options,
umode_t mode, void *ptr, int command, struct cifsFileInfo *cfile,
__u8 **extbuf, size_t *extbuflen,
- struct kvec *err_iov, int *err_buftype)
+ struct kvec *out_iov, int *out_buftype)
{
- struct cop_vars *vars = NULL;
+ struct smb2_compound_vars *vars = NULL;
struct kvec *rsp_iov;
struct smb_rqst *rqst;
int rc;
@@ -133,7 +121,7 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
/* Operation */
switch (command) {
case SMB2_OP_QUERY_INFO:
- rqst[num_rqst].rq_iov = &vars->qi_iov[0];
+ rqst[num_rqst].rq_iov = &vars->qi_iov;
rqst[num_rqst].rq_nvec = 1;
if (cfile)
@@ -167,7 +155,7 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
full_path);
break;
case SMB2_OP_POSIX_QUERY_INFO:
- rqst[num_rqst].rq_iov = &vars->qi_iov[0];
+ rqst[num_rqst].rq_iov = &vars->qi_iov;
rqst[num_rqst].rq_nvec = 1;
if (cfile)
@@ -375,7 +363,7 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
goto after_close;
/* Close */
flags |= CIFS_CP_CREATE_CLOSE_OP;
- rqst[num_rqst].rq_iov = &vars->close_iov[0];
+ rqst[num_rqst].rq_iov = &vars->close_iov;
rqst[num_rqst].rq_nvec = 1;
rc = SMB2_close_init(tcon, server,
&rqst[num_rqst], COMPOUND_FID,
@@ -529,9 +517,9 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
if (cfile)
cifsFileInfo_put(cfile);
- if (rc && err_iov && err_buftype) {
- memcpy(err_iov, rsp_iov, 3 * sizeof(*err_iov));
- memcpy(err_buftype, resp_buftype, 3 * sizeof(*err_buftype));
+ if (out_iov && out_buftype) {
+ memcpy(out_iov, rsp_iov, 3 * sizeof(*out_iov));
+ memcpy(out_buftype, resp_buftype, 3 * sizeof(*out_buftype));
} else {
free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base);
free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base);
@@ -541,20 +529,53 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
return rc;
}
-int smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
- struct cifs_sb_info *cifs_sb, const char *full_path,
- struct cifs_open_info_data *data, bool *adjust_tz, bool *reparse)
+static int parse_create_response(struct cifs_open_info_data *data,
+ struct cifs_sb_info *cifs_sb,
+ const struct kvec *iov)
+{
+ struct smb2_create_rsp *rsp = iov->iov_base;
+ bool reparse_point = false;
+ u32 tag = 0;
+ int rc = 0;
+
+ switch (rsp->hdr.Status) {
+ case STATUS_IO_REPARSE_TAG_NOT_HANDLED:
+ reparse_point = true;
+ break;
+ case STATUS_STOPPED_ON_SYMLINK:
+ rc = smb2_parse_symlink_response(cifs_sb, iov,
+ &data->symlink_target);
+ if (rc)
+ return rc;
+ tag = IO_REPARSE_TAG_SYMLINK;
+ reparse_point = true;
+ break;
+ case STATUS_SUCCESS:
+ reparse_point = !!(rsp->Flags & SMB2_CREATE_FLAG_REPARSEPOINT);
+ break;
+ }
+ data->reparse_point = reparse_point;
+ data->reparse_tag = tag;
+ return rc;
+}
+
+int smb2_query_path_info(const unsigned int xid,
+ struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb,
+ const char *full_path,
+ struct cifs_open_info_data *data)
{
__u32 create_options = 0;
struct cifsFileInfo *cfile;
struct cached_fid *cfid = NULL;
- struct kvec err_iov[3] = {};
- int err_buftype[3] = {};
+ struct smb2_hdr *hdr;
+ struct kvec out_iov[3] = {};
+ int out_buftype[3] = {};
bool islink;
int rc, rc2;
- *adjust_tz = false;
- *reparse = false;
+ data->adjust_tz = false;
+ data->reparse_point = false;
if (strcmp(full_path, ""))
rc = -ENOENT;
@@ -575,69 +596,73 @@ int smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
cifs_get_readable_path(tcon, full_path, &cfile);
rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES, FILE_OPEN,
create_options, ACL_NO_MODE, data, SMB2_OP_QUERY_INFO, cfile,
- NULL, NULL, err_iov, err_buftype);
- if (rc) {
- struct smb2_hdr *hdr = err_iov[0].iov_base;
-
- if (unlikely(!hdr || err_buftype[0] == CIFS_NO_BUFFER))
+ NULL, NULL, out_iov, out_buftype);
+ hdr = out_iov[0].iov_base;
+ /*
+ * If first iov is unset, then SMB session was dropped or we've got a
+ * cached open file (@cfile).
+ */
+ if (!hdr || out_buftype[0] == CIFS_NO_BUFFER)
+ goto out;
+
+ switch (rc) {
+ case 0:
+ case -EOPNOTSUPP:
+ rc = parse_create_response(data, cifs_sb, &out_iov[0]);
+ if (rc || !data->reparse_point)
goto out;
- if (rc == -EOPNOTSUPP && hdr->Command == SMB2_CREATE &&
- hdr->Status == STATUS_STOPPED_ON_SYMLINK) {
- rc = smb2_parse_symlink_response(cifs_sb, err_iov,
- &data->symlink_target);
- if (rc)
- goto out;
-
- *reparse = true;
- create_options |= OPEN_REPARSE_POINT;
-
- /* Failed on a symbolic link - query a reparse point info */
- cifs_get_readable_path(tcon, full_path, &cfile);
- rc = smb2_compound_op(xid, tcon, cifs_sb, full_path,
- FILE_READ_ATTRIBUTES, FILE_OPEN,
- create_options, ACL_NO_MODE, data,
- SMB2_OP_QUERY_INFO, cfile, NULL, NULL,
- NULL, NULL);
+
+ create_options |= OPEN_REPARSE_POINT;
+ /* Failed on a symbolic link - query a reparse point info */
+ cifs_get_readable_path(tcon, full_path, &cfile);
+ rc = smb2_compound_op(xid, tcon, cifs_sb, full_path,
+ FILE_READ_ATTRIBUTES, FILE_OPEN,
+ create_options, ACL_NO_MODE, data,
+ SMB2_OP_QUERY_INFO, cfile, NULL, NULL,
+ NULL, NULL);
+ break;
+ case -EREMOTE:
+ break;
+ default:
+ if (hdr->Status != STATUS_OBJECT_NAME_INVALID)
+ break;
+ rc2 = cifs_inval_name_dfs_link_error(xid, tcon, cifs_sb,
+ full_path, &islink);
+ if (rc2) {
+ rc = rc2;
goto out;
- } else if (rc != -EREMOTE && hdr->Status == STATUS_OBJECT_NAME_INVALID) {
- rc2 = cifs_inval_name_dfs_link_error(xid, tcon, cifs_sb,
- full_path, &islink);
- if (rc2) {
- rc = rc2;
- goto out;
- }
- if (islink)
- rc = -EREMOTE;
}
+ if (islink)
+ rc = -EREMOTE;
}
out:
- free_rsp_buf(err_buftype[0], err_iov[0].iov_base);
- free_rsp_buf(err_buftype[1], err_iov[1].iov_base);
- free_rsp_buf(err_buftype[2], err_iov[2].iov_base);
+ free_rsp_buf(out_buftype[0], out_iov[0].iov_base);
+ free_rsp_buf(out_buftype[1], out_iov[1].iov_base);
+ free_rsp_buf(out_buftype[2], out_iov[2].iov_base);
return rc;
}
-
-int smb311_posix_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
- struct cifs_sb_info *cifs_sb, const char *full_path,
+int smb311_posix_query_path_info(const unsigned int xid,
+ struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb,
+ const char *full_path,
struct cifs_open_info_data *data,
struct cifs_sid *owner,
- struct cifs_sid *group,
- bool *adjust_tz, bool *reparse)
+ struct cifs_sid *group)
{
int rc;
__u32 create_options = 0;
struct cifsFileInfo *cfile;
- struct kvec err_iov[3] = {};
- int err_buftype[3] = {};
+ struct kvec out_iov[3] = {};
+ int out_buftype[3] = {};
__u8 *sidsbuf = NULL;
__u8 *sidsbuf_end = NULL;
size_t sidsbuflen = 0;
size_t owner_len, group_len;
- *adjust_tz = false;
- *reparse = false;
+ data->adjust_tz = false;
+ data->reparse_point = false;
/*
* BB TODO: Add support for using the cached root handle.
@@ -649,27 +674,33 @@ int smb311_posix_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
cifs_get_readable_path(tcon, full_path, &cfile);
rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES, FILE_OPEN,
create_options, ACL_NO_MODE, data, SMB2_OP_POSIX_QUERY_INFO, cfile,
- &sidsbuf, &sidsbuflen, err_iov, err_buftype);
- if (rc == -EOPNOTSUPP) {
+ &sidsbuf, &sidsbuflen, out_iov, out_buftype);
+ /*
+ * If first iov is unset, then SMB session was dropped or we've got a
+ * cached open file (@cfile).
+ */
+ if (!out_iov[0].iov_base || out_buftype[0] == CIFS_NO_BUFFER)
+ goto out;
+
+ switch (rc) {
+ case 0:
+ case -EOPNOTSUPP:
/* BB TODO: When support for special files added to Samba re-verify this path */
- if (err_iov[0].iov_base && err_buftype[0] != CIFS_NO_BUFFER &&
- ((struct smb2_hdr *)err_iov[0].iov_base)->Command == SMB2_CREATE &&
- ((struct smb2_hdr *)err_iov[0].iov_base)->Status == STATUS_STOPPED_ON_SYMLINK) {
- rc = smb2_parse_symlink_response(cifs_sb, err_iov, &data->symlink_target);
- if (rc)
- goto out;
- }
- *reparse = true;
- create_options |= OPEN_REPARSE_POINT;
+ rc = parse_create_response(data, cifs_sb, &out_iov[0]);
+ if (rc || !data->reparse_point)
+ goto out;
+ create_options |= OPEN_REPARSE_POINT;
/* Failed on a symbolic link - query a reparse point info */
cifs_get_readable_path(tcon, full_path, &cfile);
rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES,
FILE_OPEN, create_options, ACL_NO_MODE, data,
SMB2_OP_POSIX_QUERY_INFO, cfile,
&sidsbuf, &sidsbuflen, NULL, NULL);
+ break;
}
+out:
if (rc == 0) {
sidsbuf_end = sidsbuf + sidsbuflen;
@@ -689,11 +720,10 @@ int smb311_posix_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
memcpy(group, sidsbuf + owner_len, group_len);
}
-out:
kfree(sidsbuf);
- free_rsp_buf(err_buftype[0], err_iov[0].iov_base);
- free_rsp_buf(err_buftype[1], err_iov[1].iov_base);
- free_rsp_buf(err_buftype[2], err_iov[2].iov_base);
+ free_rsp_buf(out_buftype[0], out_iov[0].iov_base);
+ free_rsp_buf(out_buftype[1], out_iov[1].iov_base);
+ free_rsp_buf(out_buftype[2], out_iov[2].iov_base);
return rc;
}
diff --git a/fs/smb/client/smb2maperror.c b/fs/smb/client/smb2maperror.c
index 194799ddd382..1a90dd78b238 100644
--- a/fs/smb/client/smb2maperror.c
+++ b/fs/smb/client/smb2maperror.c
@@ -877,8 +877,6 @@ static const struct status_to_posix_error smb2_error_map_table[] = {
"STATUS_IO_REPARSE_TAG_MISMATCH"},
{STATUS_IO_REPARSE_DATA_INVALID, -EIO,
"STATUS_IO_REPARSE_DATA_INVALID"},
- {STATUS_IO_REPARSE_TAG_NOT_HANDLED, -EIO,
- "STATUS_IO_REPARSE_TAG_NOT_HANDLED"},
{STATUS_REPARSE_POINT_NOT_RESOLVED, -EIO,
"STATUS_REPARSE_POINT_NOT_RESOLVED"},
{STATUS_DIRECTORY_IS_A_REPARSE_POINT, -EIO,
diff --git a/fs/smb/client/smb2misc.c b/fs/smb/client/smb2misc.c
index 3935a60db5c3..25f7cd6f23d6 100644
--- a/fs/smb/client/smb2misc.c
+++ b/fs/smb/client/smb2misc.c
@@ -145,7 +145,7 @@ smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *server)
__u64 mid;
/* If server is a channel, select the primary channel */
- pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server;
+ pserver = SERVER_IS_CHAN(server) ? server->primary_server : server;
/*
* Add function to do table lookup of StructureSize by command
@@ -623,7 +623,7 @@ smb2_is_valid_lease_break(char *buffer, struct TCP_Server_Info *server)
cifs_dbg(FYI, "Checking for lease break\n");
/* If server is a channel, select the primary channel */
- pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server;
+ pserver = SERVER_IS_CHAN(server) ? server->primary_server : server;
/* look up tcon based on tid & uid */
spin_lock(&cifs_tcp_ses_lock);
@@ -698,7 +698,7 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server)
cifs_dbg(FYI, "oplock level 0x%x\n", rsp->OplockLevel);
/* If server is a channel, select the primary channel */
- pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server;
+ pserver = SERVER_IS_CHAN(server) ? server->primary_server : server;
/* look up tcon based on tid & uid */
spin_lock(&cifs_tcp_ses_lock);
diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c
index 0f62bc373ad0..9aeecee6b91b 100644
--- a/fs/smb/client/smb2ops.c
+++ b/fs/smb/client/smb2ops.c
@@ -172,8 +172,17 @@ smb2_set_credits(struct TCP_Server_Info *server, const int val)
spin_lock(&server->req_lock);
server->credits = val;
- if (val == 1)
+ if (val == 1) {
server->reconnect_instance++;
+ /*
+ * ChannelSequence updated for all channels in primary channel so that consistent
+ * across SMB3 requests sent on any channel. See MS-SMB2 3.2.4.1 and 3.2.7.1
+ */
+ if (SERVER_IS_CHAN(server))
+ server->primary_server->channel_sequence_num++;
+ else
+ server->channel_sequence_num++;
+ }
scredits = server->credits;
in_flight = server->in_flight;
spin_unlock(&server->req_lock);
@@ -288,7 +297,7 @@ smb2_adjust_credits(struct TCP_Server_Info *server,
cifs_server_dbg(VFS, "request has less credits (%d) than required (%d)",
credits->value, new_val);
- return -ENOTSUPP;
+ return -EOPNOTSUPP;
}
spin_lock(&server->req_lock);
@@ -1075,31 +1084,28 @@ smb2_query_eas(const unsigned int xid, struct cifs_tcon *tcon,
return rc;
}
-
static int
smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon,
const char *path, const char *ea_name, const void *ea_value,
const __u16 ea_value_len, const struct nls_table *nls_codepage,
struct cifs_sb_info *cifs_sb)
{
+ struct smb2_compound_vars *vars;
struct cifs_ses *ses = tcon->ses;
struct TCP_Server_Info *server = cifs_pick_channel(ses);
+ struct smb_rqst *rqst;
+ struct kvec *rsp_iov;
__le16 *utf16_path = NULL;
int ea_name_len = strlen(ea_name);
int flags = CIFS_CP_CREATE_CLOSE_OP;
int len;
- struct smb_rqst rqst[3];
int resp_buftype[3];
- struct kvec rsp_iov[3];
- struct kvec open_iov[SMB2_CREATE_IOV_SIZE];
struct cifs_open_parms oparms;
__u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
struct cifs_fid fid;
- struct kvec si_iov[SMB2_SET_INFO_IOV_SIZE];
unsigned int size[1];
void *data[1];
struct smb2_file_full_ea_info *ea = NULL;
- struct kvec close_iov[1];
struct smb2_query_info_rsp *rsp;
int rc, used_len = 0;
@@ -1113,9 +1119,14 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon,
if (!utf16_path)
return -ENOMEM;
- memset(rqst, 0, sizeof(rqst));
resp_buftype[0] = resp_buftype[1] = resp_buftype[2] = CIFS_NO_BUFFER;
- memset(rsp_iov, 0, sizeof(rsp_iov));
+ vars = kzalloc(sizeof(*vars), GFP_KERNEL);
+ if (!vars) {
+ rc = -ENOMEM;
+ goto out_free_path;
+ }
+ rqst = vars->rqst;
+ rsp_iov = vars->rsp_iov;
if (ses->server->ops->query_all_EAs) {
if (!ea_value) {
@@ -1150,7 +1161,7 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon,
/* Use a fudge factor of 256 bytes in case we collide
* with a different set_EAs command.
*/
- if(CIFSMaxBufSize - MAX_SMB2_CREATE_RESPONSE_SIZE -
+ if (CIFSMaxBufSize - MAX_SMB2_CREATE_RESPONSE_SIZE -
MAX_SMB2_CLOSE_RESPONSE_SIZE - 256 <
used_len + ea_name_len + ea_value_len + 1) {
rc = -ENOSPC;
@@ -1160,8 +1171,7 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon,
}
/* Open */
- memset(&open_iov, 0, sizeof(open_iov));
- rqst[0].rq_iov = open_iov;
+ rqst[0].rq_iov = vars->open_iov;
rqst[0].rq_nvec = SMB2_CREATE_IOV_SIZE;
oparms = (struct cifs_open_parms) {
@@ -1181,8 +1191,7 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon,
/* Set Info */
- memset(&si_iov, 0, sizeof(si_iov));
- rqst[1].rq_iov = si_iov;
+ rqst[1].rq_iov = vars->si_iov;
rqst[1].rq_nvec = 1;
len = sizeof(*ea) + ea_name_len + ea_value_len + 1;
@@ -1210,10 +1219,8 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon,
smb2_set_next_command(tcon, &rqst[1]);
smb2_set_related(&rqst[1]);
-
/* Close */
- memset(&close_iov, 0, sizeof(close_iov));
- rqst[2].rq_iov = close_iov;
+ rqst[2].rq_iov = &vars->close_iov;
rqst[2].rq_nvec = 1;
rc = SMB2_close_init(tcon, server,
&rqst[2], COMPOUND_FID, COMPOUND_FID, false);
@@ -1228,13 +1235,15 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon,
sea_exit:
kfree(ea);
- kfree(utf16_path);
SMB2_open_free(&rqst[0]);
SMB2_set_info_free(&rqst[1]);
SMB2_close_free(&rqst[2]);
free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base);
free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base);
free_rsp_buf(resp_buftype[2], rsp_iov[2].iov_base);
+ kfree(vars);
+out_free_path:
+ kfree(utf16_path);
return rc;
}
#endif
@@ -1396,7 +1405,8 @@ smb2_close_getattr(const unsigned int xid, struct cifs_tcon *tcon,
if (file_inf.LastWriteTime)
inode->i_mtime = cifs_NTtimeToUnix(file_inf.LastWriteTime);
if (file_inf.ChangeTime)
- inode->i_ctime = cifs_NTtimeToUnix(file_inf.ChangeTime);
+ inode_set_ctime_to_ts(inode,
+ cifs_NTtimeToUnix(file_inf.ChangeTime));
if (file_inf.LastAccessTime)
inode->i_atime = cifs_NTtimeToUnix(file_inf.LastAccessTime);
@@ -1445,16 +1455,6 @@ req_res_key_exit:
return rc;
}
-struct iqi_vars {
- struct smb_rqst rqst[3];
- struct kvec rsp_iov[3];
- struct kvec open_iov[SMB2_CREATE_IOV_SIZE];
- struct kvec qi_iov[1];
- struct kvec io_iov[SMB2_IOCTL_IOV_SIZE];
- struct kvec si_iov[SMB2_SET_INFO_IOV_SIZE];
- struct kvec close_iov[1];
-};
-
static int
smb2_ioctl_query_info(const unsigned int xid,
struct cifs_tcon *tcon,
@@ -1462,7 +1462,7 @@ smb2_ioctl_query_info(const unsigned int xid,
__le16 *path, int is_dir,
unsigned long p)
{
- struct iqi_vars *vars;
+ struct smb2_compound_vars *vars;
struct smb_rqst *rqst;
struct kvec *rsp_iov;
struct cifs_ses *ses = tcon->ses;
@@ -1580,7 +1580,7 @@ smb2_ioctl_query_info(const unsigned int xid,
rc = -EINVAL;
goto free_open_req;
}
- rqst[1].rq_iov = &vars->si_iov[0];
+ rqst[1].rq_iov = vars->si_iov;
rqst[1].rq_nvec = 1;
/* MS-FSCC 2.4.13 FileEndOfFileInformation */
@@ -1592,7 +1592,7 @@ smb2_ioctl_query_info(const unsigned int xid,
SMB2_O_INFO_FILE, 0, data, size);
free_req1_func = SMB2_set_info_free;
} else if (qi.flags == PASSTHRU_QUERY_INFO) {
- rqst[1].rq_iov = &vars->qi_iov[0];
+ rqst[1].rq_iov = &vars->qi_iov;
rqst[1].rq_nvec = 1;
rc = SMB2_query_info_init(tcon, server,
@@ -1614,7 +1614,7 @@ smb2_ioctl_query_info(const unsigned int xid,
smb2_set_related(&rqst[1]);
/* Close */
- rqst[2].rq_iov = &vars->close_iov[0];
+ rqst[2].rq_iov = &vars->close_iov;
rqst[2].rq_nvec = 1;
rc = SMB2_close_init(tcon, server,
@@ -2407,7 +2407,7 @@ smb2_is_network_name_deleted(char *buf, struct TCP_Server_Info *server)
return false;
/* If server is a channel, select the primary channel */
- pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server;
+ pserver = SERVER_IS_CHAN(server) ? server->primary_server : server;
spin_lock(&cifs_tcp_ses_lock);
list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) {
@@ -2523,15 +2523,13 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon,
struct kvec *rsp, int *buftype,
struct cifs_sb_info *cifs_sb)
{
+ struct smb2_compound_vars *vars;
struct cifs_ses *ses = tcon->ses;
struct TCP_Server_Info *server = cifs_pick_channel(ses);
int flags = CIFS_CP_CREATE_CLOSE_OP;
- struct smb_rqst rqst[3];
+ struct smb_rqst *rqst;
int resp_buftype[3];
- struct kvec rsp_iov[3];
- struct kvec open_iov[SMB2_CREATE_IOV_SIZE];
- struct kvec qi_iov[1];
- struct kvec close_iov[1];
+ struct kvec *rsp_iov;
u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
struct cifs_open_parms oparms;
struct cifs_fid fid;
@@ -2548,9 +2546,14 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon,
if (smb3_encryption_required(tcon))
flags |= CIFS_TRANSFORM_REQ;
- memset(rqst, 0, sizeof(rqst));
resp_buftype[0] = resp_buftype[1] = resp_buftype[2] = CIFS_NO_BUFFER;
- memset(rsp_iov, 0, sizeof(rsp_iov));
+ vars = kzalloc(sizeof(*vars), GFP_KERNEL);
+ if (!vars) {
+ rc = -ENOMEM;
+ goto out_free_path;
+ }
+ rqst = vars->rqst;
+ rsp_iov = vars->rsp_iov;
/*
* We can only call this for things we know are directories.
@@ -2559,8 +2562,7 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon,
open_cached_dir(xid, tcon, path, cifs_sb, false,
&cfid); /* cfid null if open dir failed */
- memset(&open_iov, 0, sizeof(open_iov));
- rqst[0].rq_iov = open_iov;
+ rqst[0].rq_iov = vars->open_iov;
rqst[0].rq_nvec = SMB2_CREATE_IOV_SIZE;
oparms = (struct cifs_open_parms) {
@@ -2578,8 +2580,7 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon,
goto qic_exit;
smb2_set_next_command(tcon, &rqst[0]);
- memset(&qi_iov, 0, sizeof(qi_iov));
- rqst[1].rq_iov = qi_iov;
+ rqst[1].rq_iov = &vars->qi_iov;
rqst[1].rq_nvec = 1;
if (cfid) {
@@ -2606,8 +2607,7 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon,
smb2_set_related(&rqst[1]);
}
- memset(&close_iov, 0, sizeof(close_iov));
- rqst[2].rq_iov = close_iov;
+ rqst[2].rq_iov = &vars->close_iov;
rqst[2].rq_nvec = 1;
rc = SMB2_close_init(tcon, server,
@@ -2638,7 +2638,6 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon,
*buftype = resp_buftype[1];
qic_exit:
- kfree(utf16_path);
SMB2_open_free(&rqst[0]);
SMB2_query_info_free(&rqst[1]);
SMB2_close_free(&rqst[2]);
@@ -2646,6 +2645,9 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon,
free_rsp_buf(resp_buftype[2], rsp_iov[2].iov_base);
if (cfid)
close_cached_dir(cfid);
+ kfree(vars);
+out_free_path:
+ kfree(utf16_path);
return rc;
}
@@ -2681,6 +2683,7 @@ smb2_queryfs(const unsigned int xid, struct cifs_tcon *tcon,
smb2_copy_fs_info_to_kstatfs(info, buf);
qfs_exit:
+ trace_smb3_qfs_done(xid, tcon->tid, tcon->ses->Suid, tcon->tree_name, rc);
free_rsp_buf(buftype, rsp_iov.iov_base);
return rc;
}
@@ -2948,154 +2951,32 @@ parse_reparse_point(struct reparse_data_buffer *buf,
}
}
-static int
-smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon,
- struct cifs_sb_info *cifs_sb, const char *full_path,
- char **target_path, bool is_reparse_point)
+static int smb2_query_symlink(const unsigned int xid,
+ struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb,
+ const char *full_path,
+ char **target_path,
+ struct kvec *rsp_iov)
{
- int rc;
- __le16 *utf16_path = NULL;
- __u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
- struct cifs_open_parms oparms;
- struct cifs_fid fid;
- struct kvec err_iov = {NULL, 0};
- struct TCP_Server_Info *server = cifs_pick_channel(tcon->ses);
- int flags = CIFS_CP_CREATE_CLOSE_OP;
- struct smb_rqst rqst[3];
- int resp_buftype[3];
- struct kvec rsp_iov[3];
- struct kvec open_iov[SMB2_CREATE_IOV_SIZE];
- struct kvec io_iov[SMB2_IOCTL_IOV_SIZE];
- struct kvec close_iov[1];
- struct smb2_create_rsp *create_rsp;
- struct smb2_ioctl_rsp *ioctl_rsp;
- struct reparse_data_buffer *reparse_buf;
- int create_options = is_reparse_point ? OPEN_REPARSE_POINT : 0;
- u32 plen;
+ struct reparse_data_buffer *buf;
+ struct smb2_ioctl_rsp *io = rsp_iov->iov_base;
+ u32 plen = le32_to_cpu(io->OutputCount);
cifs_dbg(FYI, "%s: path: %s\n", __func__, full_path);
- *target_path = NULL;
-
- if (smb3_encryption_required(tcon))
- flags |= CIFS_TRANSFORM_REQ;
-
- memset(rqst, 0, sizeof(rqst));
- resp_buftype[0] = resp_buftype[1] = resp_buftype[2] = CIFS_NO_BUFFER;
- memset(rsp_iov, 0, sizeof(rsp_iov));
-
- utf16_path = cifs_convert_path_to_utf16(full_path, cifs_sb);
- if (!utf16_path)
- return -ENOMEM;
-
- /* Open */
- memset(&open_iov, 0, sizeof(open_iov));
- rqst[0].rq_iov = open_iov;
- rqst[0].rq_nvec = SMB2_CREATE_IOV_SIZE;
-
- oparms = (struct cifs_open_parms) {
- .tcon = tcon,
- .path = full_path,
- .desired_access = FILE_READ_ATTRIBUTES,
- .disposition = FILE_OPEN,
- .create_options = cifs_create_options(cifs_sb, create_options),
- .fid = &fid,
- };
-
- rc = SMB2_open_init(tcon, server,
- &rqst[0], &oplock, &oparms, utf16_path);
- if (rc)
- goto querty_exit;
- smb2_set_next_command(tcon, &rqst[0]);
-
-
- /* IOCTL */
- memset(&io_iov, 0, sizeof(io_iov));
- rqst[1].rq_iov = io_iov;
- rqst[1].rq_nvec = SMB2_IOCTL_IOV_SIZE;
-
- rc = SMB2_ioctl_init(tcon, server,
- &rqst[1], fid.persistent_fid,
- fid.volatile_fid, FSCTL_GET_REPARSE_POINT, NULL, 0,
- CIFSMaxBufSize -
- MAX_SMB2_CREATE_RESPONSE_SIZE -
- MAX_SMB2_CLOSE_RESPONSE_SIZE);
- if (rc)
- goto querty_exit;
-
- smb2_set_next_command(tcon, &rqst[1]);
- smb2_set_related(&rqst[1]);
-
-
- /* Close */
- memset(&close_iov, 0, sizeof(close_iov));
- rqst[2].rq_iov = close_iov;
- rqst[2].rq_nvec = 1;
-
- rc = SMB2_close_init(tcon, server,
- &rqst[2], COMPOUND_FID, COMPOUND_FID, false);
- if (rc)
- goto querty_exit;
-
- smb2_set_related(&rqst[2]);
-
- rc = compound_send_recv(xid, tcon->ses, server,
- flags, 3, rqst,
- resp_buftype, rsp_iov);
-
- create_rsp = rsp_iov[0].iov_base;
- if (create_rsp && create_rsp->hdr.Status)
- err_iov = rsp_iov[0];
- ioctl_rsp = rsp_iov[1].iov_base;
-
- /*
- * Open was successful and we got an ioctl response.
- */
- if ((rc == 0) && (is_reparse_point)) {
- /* See MS-FSCC 2.3.23 */
-
- reparse_buf = (struct reparse_data_buffer *)
- ((char *)ioctl_rsp +
- le32_to_cpu(ioctl_rsp->OutputOffset));
- plen = le32_to_cpu(ioctl_rsp->OutputCount);
-
- if (plen + le32_to_cpu(ioctl_rsp->OutputOffset) >
- rsp_iov[1].iov_len) {
- cifs_tcon_dbg(VFS, "srv returned invalid ioctl len: %d\n",
- plen);
- rc = -EIO;
- goto querty_exit;
- }
-
- rc = parse_reparse_point(reparse_buf, plen, target_path,
- cifs_sb);
- goto querty_exit;
- }
-
- if (!rc || !err_iov.iov_base) {
- rc = -ENOENT;
- goto querty_exit;
- }
-
- rc = smb2_parse_symlink_response(cifs_sb, &err_iov, target_path);
-
- querty_exit:
- cifs_dbg(FYI, "query symlink rc %d\n", rc);
- kfree(utf16_path);
- SMB2_open_free(&rqst[0]);
- SMB2_ioctl_free(&rqst[1]);
- SMB2_close_free(&rqst[2]);
- free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base);
- free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base);
- free_rsp_buf(resp_buftype[2], rsp_iov[2].iov_base);
- return rc;
+ buf = (struct reparse_data_buffer *)((u8 *)io +
+ le32_to_cpu(io->OutputOffset));
+ return parse_reparse_point(buf, plen, target_path, cifs_sb);
}
-int
-smb2_query_reparse_tag(const unsigned int xid, struct cifs_tcon *tcon,
- struct cifs_sb_info *cifs_sb, const char *full_path,
- __u32 *tag)
+static int smb2_query_reparse_point(const unsigned int xid,
+ struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb,
+ const char *full_path,
+ u32 *tag, struct kvec *rsp,
+ int *rsp_buftype)
{
+ struct smb2_compound_vars *vars;
int rc;
__le16 *utf16_path = NULL;
__u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
@@ -3103,12 +2984,9 @@ smb2_query_reparse_tag(const unsigned int xid, struct cifs_tcon *tcon,
struct cifs_fid fid;
struct TCP_Server_Info *server = cifs_pick_channel(tcon->ses);
int flags = CIFS_CP_CREATE_CLOSE_OP;
- struct smb_rqst rqst[3];
+ struct smb_rqst *rqst;
int resp_buftype[3];
- struct kvec rsp_iov[3];
- struct kvec open_iov[SMB2_CREATE_IOV_SIZE];
- struct kvec io_iov[SMB2_IOCTL_IOV_SIZE];
- struct kvec close_iov[1];
+ struct kvec *rsp_iov;
struct smb2_ioctl_rsp *ioctl_rsp;
struct reparse_data_buffer *reparse_buf;
u32 plen;
@@ -3118,20 +2996,24 @@ smb2_query_reparse_tag(const unsigned int xid, struct cifs_tcon *tcon,
if (smb3_encryption_required(tcon))
flags |= CIFS_TRANSFORM_REQ;
- memset(rqst, 0, sizeof(rqst));
- resp_buftype[0] = resp_buftype[1] = resp_buftype[2] = CIFS_NO_BUFFER;
- memset(rsp_iov, 0, sizeof(rsp_iov));
-
utf16_path = cifs_convert_path_to_utf16(full_path, cifs_sb);
if (!utf16_path)
return -ENOMEM;
+ resp_buftype[0] = resp_buftype[1] = resp_buftype[2] = CIFS_NO_BUFFER;
+ vars = kzalloc(sizeof(*vars), GFP_KERNEL);
+ if (!vars) {
+ rc = -ENOMEM;
+ goto out_free_path;
+ }
+ rqst = vars->rqst;
+ rsp_iov = vars->rsp_iov;
+
/*
* setup smb2open - TODO add optimization to call cifs_get_readable_path
* to see if there is a handle already open that we can use
*/
- memset(&open_iov, 0, sizeof(open_iov));
- rqst[0].rq_iov = open_iov;
+ rqst[0].rq_iov = vars->open_iov;
rqst[0].rq_nvec = SMB2_CREATE_IOV_SIZE;
oparms = (struct cifs_open_parms) {
@@ -3151,8 +3033,7 @@ smb2_query_reparse_tag(const unsigned int xid, struct cifs_tcon *tcon,
/* IOCTL */
- memset(&io_iov, 0, sizeof(io_iov));
- rqst[1].rq_iov = io_iov;
+ rqst[1].rq_iov = vars->io_iov;
rqst[1].rq_nvec = SMB2_IOCTL_IOV_SIZE;
rc = SMB2_ioctl_init(tcon, server,
@@ -3167,10 +3048,8 @@ smb2_query_reparse_tag(const unsigned int xid, struct cifs_tcon *tcon,
smb2_set_next_command(tcon, &rqst[1]);
smb2_set_related(&rqst[1]);
-
/* Close */
- memset(&close_iov, 0, sizeof(close_iov));
- rqst[2].rq_iov = close_iov;
+ rqst[2].rq_iov = &vars->close_iov;
rqst[2].rq_nvec = 1;
rc = SMB2_close_init(tcon, server,
@@ -3205,16 +3084,21 @@ smb2_query_reparse_tag(const unsigned int xid, struct cifs_tcon *tcon,
goto query_rp_exit;
}
*tag = le32_to_cpu(reparse_buf->ReparseTag);
+ *rsp = rsp_iov[1];
+ *rsp_buftype = resp_buftype[1];
+ resp_buftype[1] = CIFS_NO_BUFFER;
}
query_rp_exit:
- kfree(utf16_path);
SMB2_open_free(&rqst[0]);
SMB2_ioctl_free(&rqst[1]);
SMB2_close_free(&rqst[2]);
free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base);
free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base);
free_rsp_buf(resp_buftype[2], rsp_iov[2].iov_base);
+ kfree(vars);
+out_free_path:
+ kfree(utf16_path);
return rc;
}
@@ -4400,7 +4284,7 @@ smb2_get_enc_key(struct TCP_Server_Info *server, __u64 ses_id, int enc, u8 *key)
u8 *ses_enc_key;
/* If server is a channel, select the primary channel */
- pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server;
+ pserver = SERVER_IS_CHAN(server) ? server->primary_server : server;
spin_lock(&cifs_tcp_ses_lock);
list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) {
@@ -4707,7 +4591,7 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid,
if (shdr->Command != SMB2_READ) {
cifs_server_dbg(VFS, "only big read responses are supported\n");
- return -ENOTSUPP;
+ return -EOPNOTSUPP;
}
if (server->ops->is_session_expired &&
@@ -5298,6 +5182,7 @@ struct smb_version_operations smb20_operations = {
.can_echo = smb2_can_echo,
.echo = SMB2_echo,
.query_path_info = smb2_query_path_info,
+ .query_reparse_point = smb2_query_reparse_point,
.get_srv_inum = smb2_get_srv_inum,
.query_file_info = smb2_query_file_info,
.set_path_size = smb2_set_path_size,
@@ -5399,6 +5284,7 @@ struct smb_version_operations smb21_operations = {
.can_echo = smb2_can_echo,
.echo = SMB2_echo,
.query_path_info = smb2_query_path_info,
+ .query_reparse_point = smb2_query_reparse_point,
.get_srv_inum = smb2_get_srv_inum,
.query_file_info = smb2_query_file_info,
.set_path_size = smb2_set_path_size,
@@ -5503,7 +5389,7 @@ struct smb_version_operations smb30_operations = {
.echo = SMB2_echo,
.query_path_info = smb2_query_path_info,
/* WSL tags introduced long after smb2.1, enable for SMB3, 3.11 only */
- .query_reparse_tag = smb2_query_reparse_tag,
+ .query_reparse_point = smb2_query_reparse_point,
.get_srv_inum = smb2_get_srv_inum,
.query_file_info = smb2_query_file_info,
.set_path_size = smb2_set_path_size,
@@ -5616,7 +5502,7 @@ struct smb_version_operations smb311_operations = {
.can_echo = smb2_can_echo,
.echo = SMB2_echo,
.query_path_info = smb2_query_path_info,
- .query_reparse_tag = smb2_query_reparse_tag,
+ .query_reparse_point = smb2_query_reparse_point,
.get_srv_inum = smb2_get_srv_inum,
.query_file_info = smb2_query_file_info,
.set_path_size = smb2_set_path_size,
diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c
index a457f07f820d..c75a80bb6d9e 100644
--- a/fs/smb/client/smb2pdu.c
+++ b/fs/smb/client/smb2pdu.c
@@ -88,10 +88,27 @@ smb2_hdr_assemble(struct smb2_hdr *shdr, __le16 smb2_cmd,
const struct cifs_tcon *tcon,
struct TCP_Server_Info *server)
{
+ struct smb3_hdr_req *smb3_hdr;
+
shdr->ProtocolId = SMB2_PROTO_NUMBER;
shdr->StructureSize = cpu_to_le16(64);
shdr->Command = smb2_cmd;
+
if (server) {
+ /* After reconnect SMB3 must set ChannelSequence on subsequent reqs */
+ if (server->dialect >= SMB30_PROT_ID) {
+ smb3_hdr = (struct smb3_hdr_req *)shdr;
+ /*
+ * if primary channel is not set yet, use default
+ * channel for chan sequence num
+ */
+ if (SERVER_IS_CHAN(server))
+ smb3_hdr->ChannelSequence =
+ cpu_to_le16(server->primary_server->channel_sequence_num);
+ else
+ smb3_hdr->ChannelSequence =
+ cpu_to_le16(server->channel_sequence_num);
+ }
spin_lock(&server->req_lock);
/* Request up to 10 credits but don't go over the limit. */
if (server->credits >= server->max_credits)
@@ -553,7 +570,7 @@ assemble_neg_contexts(struct smb2_negotiate_req *req,
* secondary channels don't have the hostname field populated
* use the hostname field in the primary channel instead
*/
- pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server;
+ pserver = SERVER_IS_CHAN(server) ? server->primary_server : server;
cifs_server_lock(pserver);
hostname = pserver->hostname;
if (hostname && (hostname[0] != 0)) {
@@ -831,7 +848,7 @@ add_posix_context(struct kvec *iov, unsigned int *num_iovec, umode_t mode)
iov[num].iov_base = create_posix_buf(mode);
if (mode == ACL_NO_MODE)
- cifs_dbg(FYI, "Invalid mode\n");
+ cifs_dbg(FYI, "%s: no mode\n", __func__);
if (iov[num].iov_base == NULL)
return -ENOMEM;
iov[num].iov_len = sizeof(struct create_posix);
@@ -2223,7 +2240,7 @@ create_durable_v2_buf(struct cifs_open_parms *oparms)
* (most servers default to 120 seconds) and most clients default to 0.
* This can be overridden at mount ("handletimeout=") if the user wants
* a different persistent (or resilient) handle timeout for all opens
- * opens on a particular SMB3 mount.
+ * on a particular SMB3 mount.
*/
buf->dcontext.Timeout = cpu_to_le32(oparms->tcon->handle_timeout);
buf->dcontext.Flags = cpu_to_le32(SMB2_DHANDLE_FLAG_PERSISTENT);
@@ -2368,7 +2385,7 @@ add_twarp_context(struct kvec *iov, unsigned int *num_iovec, __u64 timewarp)
return 0;
}
-/* See See http://technet.microsoft.com/en-us/library/hh509017(v=ws.10).aspx */
+/* See http://technet.microsoft.com/en-us/library/hh509017(v=ws.10).aspx */
static void setup_owner_group_sids(char *buf)
{
struct owner_group_sids *sids = (struct owner_group_sids *)buf;
@@ -2570,8 +2587,8 @@ alloc_path_with_tree_prefix(__le16 **out_path, int *out_size, int *out_len,
/* Do not append the separator if the path is empty */
if (path[0] != cpu_to_le16(0x0000)) {
- UniStrcat(*out_path, sep);
- UniStrcat(*out_path, path);
+ UniStrcat((wchar_t *)*out_path, (wchar_t *)sep);
+ UniStrcat((wchar_t *)*out_path, (wchar_t *)path);
}
unload_nls(cp);
@@ -3113,6 +3130,7 @@ void
SMB2_ioctl_free(struct smb_rqst *rqst)
{
int i;
+
if (rqst && rqst->rq_iov) {
cifs_small_buf_release(rqst->rq_iov[0].iov_base); /* request */
for (i = 1; i < rqst->rq_nvec; i++)
@@ -3785,7 +3803,7 @@ void smb2_reconnect_server(struct work_struct *work)
bool resched = false;
/* If server is a channel, select the primary channel */
- pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server;
+ pserver = SERVER_IS_CHAN(server) ? server->primary_server : server;
/* Prevent simultaneous reconnects that can corrupt tcon->rlist list */
mutex_lock(&pserver->reconnect_mutex);
@@ -3860,7 +3878,7 @@ void smb2_reconnect_server(struct work_struct *work)
goto done;
/* allocate a dummy tcon struct used for reconnect */
- tcon = tconInfoAlloc();
+ tcon = tcon_info_alloc(false);
if (!tcon) {
resched = true;
list_for_each_entry_safe(ses, ses2, &tmp_ses_list, rlist) {
diff --git a/fs/smb/client/smb2proto.h b/fs/smb/client/smb2proto.h
index d5d7ffb7711c..46eff9ec302a 100644
--- a/fs/smb/client/smb2proto.h
+++ b/fs/smb/client/smb2proto.h
@@ -56,9 +56,11 @@ extern int smb3_handle_read_data(struct TCP_Server_Info *server,
extern int smb2_query_reparse_tag(const unsigned int xid, struct cifs_tcon *tcon,
struct cifs_sb_info *cifs_sb, const char *path,
__u32 *reparse_tag);
-int smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
- struct cifs_sb_info *cifs_sb, const char *full_path,
- struct cifs_open_info_data *data, bool *adjust_tz, bool *reparse);
+int smb2_query_path_info(const unsigned int xid,
+ struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb,
+ const char *full_path,
+ struct cifs_open_info_data *data);
extern int smb2_set_path_size(const unsigned int xid, struct cifs_tcon *tcon,
const char *full_path, __u64 size,
struct cifs_sb_info *cifs_sb, bool set_alloc);
@@ -275,12 +277,13 @@ extern int smb2_query_info_compound(const unsigned int xid,
struct kvec *rsp, int *buftype,
struct cifs_sb_info *cifs_sb);
/* query path info from the server using SMB311 POSIX extensions*/
-int smb311_posix_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
- struct cifs_sb_info *cifs_sb, const char *full_path,
+int smb311_posix_query_path_info(const unsigned int xid,
+ struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb,
+ const char *full_path,
struct cifs_open_info_data *data,
struct cifs_sid *owner,
- struct cifs_sid *group,
- bool *adjust_tz, bool *reparse);
+ struct cifs_sid *group);
int posix_info_parse(const void *beg, const void *end,
struct smb2_posix_info_parsed *out);
int posix_info_sid_size(const void *beg, const void *end);
diff --git a/fs/smb/client/smb2transport.c b/fs/smb/client/smb2transport.c
index 7676091b3e77..23c50ed7d4b5 100644
--- a/fs/smb/client/smb2transport.c
+++ b/fs/smb/client/smb2transport.c
@@ -86,7 +86,7 @@ int smb2_get_sign_key(__u64 ses_id, struct TCP_Server_Info *server, u8 *key)
spin_lock(&cifs_tcp_ses_lock);
/* If server is a channel, select the primary channel */
- pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server;
+ pserver = SERVER_IS_CHAN(server) ? server->primary_server : server;
list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) {
if (ses->Suid == ses_id)
@@ -149,7 +149,7 @@ smb2_find_smb_ses_unlocked(struct TCP_Server_Info *server, __u64 ses_id)
struct cifs_ses *ses;
/* If server is a channel, select the primary channel */
- pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server;
+ pserver = SERVER_IS_CHAN(server) ? server->primary_server : server;
list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) {
if (ses->Suid != ses_id)
diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c
index 2a2aec8c6112..94df9eec3d8d 100644
--- a/fs/smb/client/smbdirect.c
+++ b/fs/smb/client/smbdirect.c
@@ -1401,10 +1401,13 @@ create_conn:
server->smbd_conn = smbd_get_connection(
server, (struct sockaddr *) &server->dstaddr);
- if (server->smbd_conn)
+ if (server->smbd_conn) {
cifs_dbg(VFS, "RDMA transport re-established\n");
-
- return server->smbd_conn ? 0 : -ENOENT;
+ trace_smb3_smbd_connect_done(server->hostname, server->conn_id, &server->dstaddr);
+ return 0;
+ }
+ trace_smb3_smbd_connect_err(server->hostname, server->conn_id, &server->dstaddr);
+ return -ENOENT;
}
static void destroy_caches_and_workqueue(struct smbd_connection *info)
diff --git a/fs/smb/client/trace.h b/fs/smb/client/trace.h
index e671bd16f00c..de199ec9f726 100644
--- a/fs/smb/client/trace.h
+++ b/fs/smb/client/trace.h
@@ -691,7 +691,7 @@ DEFINE_EVENT(smb3_tcon_class, smb3_##name, \
TP_ARGS(xid, tid, sesid, unc_name, rc))
DEFINE_SMB3_TCON_EVENT(tcon);
-
+DEFINE_SMB3_TCON_EVENT(qfs_done);
/*
* For smb2/smb3 open (including create and mkdir) calls
@@ -935,6 +935,8 @@ DEFINE_EVENT(smb3_connect_class, smb3_##name, \
TP_ARGS(hostname, conn_id, addr))
DEFINE_SMB3_CONNECT_EVENT(connect_done);
+DEFINE_SMB3_CONNECT_EVENT(smbd_connect_done);
+DEFINE_SMB3_CONNECT_EVENT(smbd_connect_err);
DECLARE_EVENT_CLASS(smb3_connect_err_class,
TP_PROTO(char *hostname, __u64 conn_id,
diff --git a/fs/smb/client/transport.c b/fs/smb/client/transport.c
index f280502a2aee..14710afdc2a3 100644
--- a/fs/smb/client/transport.c
+++ b/fs/smb/client/transport.c
@@ -18,7 +18,7 @@
#include <linux/bvec.h>
#include <linux/highmem.h>
#include <linux/uaccess.h>
-#include <asm/processor.h>
+#include <linux/processor.h>
#include <linux/mempool.h>
#include <linux/sched/signal.h>
#include <linux/task_io_accounting_ops.h>
@@ -35,6 +35,8 @@
void
cifs_wake_up_task(struct mid_q_entry *mid)
{
+ if (mid->mid_state == MID_RESPONSE_RECEIVED)
+ mid->mid_state = MID_RESPONSE_READY;
wake_up_process(mid->callback_data);
}
@@ -87,7 +89,8 @@ static void __release_mid(struct kref *refcount)
struct TCP_Server_Info *server = midEntry->server;
if (midEntry->resp_buf && (midEntry->mid_flags & MID_WAIT_CANCELLED) &&
- midEntry->mid_state == MID_RESPONSE_RECEIVED &&
+ (midEntry->mid_state == MID_RESPONSE_RECEIVED ||
+ midEntry->mid_state == MID_RESPONSE_READY) &&
server->ops->handle_cancelled_mid)
server->ops->handle_cancelled_mid(midEntry, server);
@@ -416,13 +419,19 @@ out:
return rc;
}
+struct send_req_vars {
+ struct smb2_transform_hdr tr_hdr;
+ struct smb_rqst rqst[MAX_COMPOUND];
+ struct kvec iov;
+};
+
static int
smb_send_rqst(struct TCP_Server_Info *server, int num_rqst,
struct smb_rqst *rqst, int flags)
{
- struct kvec iov;
- struct smb2_transform_hdr *tr_hdr;
- struct smb_rqst cur_rqst[MAX_COMPOUND];
+ struct send_req_vars *vars;
+ struct smb_rqst *cur_rqst;
+ struct kvec *iov;
int rc;
if (!(flags & CIFS_TRANSFORM_REQ))
@@ -436,16 +445,15 @@ smb_send_rqst(struct TCP_Server_Info *server, int num_rqst,
return -EIO;
}
- tr_hdr = kzalloc(sizeof(*tr_hdr), GFP_NOFS);
- if (!tr_hdr)
+ vars = kzalloc(sizeof(*vars), GFP_NOFS);
+ if (!vars)
return -ENOMEM;
+ cur_rqst = vars->rqst;
+ iov = &vars->iov;
- memset(&cur_rqst[0], 0, sizeof(cur_rqst));
- memset(&iov, 0, sizeof(iov));
-
- iov.iov_base = tr_hdr;
- iov.iov_len = sizeof(*tr_hdr);
- cur_rqst[0].rq_iov = &iov;
+ iov->iov_base = &vars->tr_hdr;
+ iov->iov_len = sizeof(vars->tr_hdr);
+ cur_rqst[0].rq_iov = iov;
cur_rqst[0].rq_nvec = 1;
rc = server->ops->init_transform_rq(server, num_rqst + 1,
@@ -456,7 +464,7 @@ smb_send_rqst(struct TCP_Server_Info *server, int num_rqst,
rc = __smb_send_rqst(server, num_rqst + 1, &cur_rqst[0]);
smb3_free_compound_rqst(num_rqst, &cur_rqst[1]);
out:
- kfree(tr_hdr);
+ kfree(vars);
return rc;
}
@@ -732,7 +740,8 @@ wait_for_response(struct TCP_Server_Info *server, struct mid_q_entry *midQ)
int error;
error = wait_event_state(server->response_q,
- midQ->mid_state != MID_REQUEST_SUBMITTED,
+ midQ->mid_state != MID_REQUEST_SUBMITTED &&
+ midQ->mid_state != MID_RESPONSE_RECEIVED,
(TASK_KILLABLE|TASK_FREEZABLE_UNSAFE));
if (error < 0)
return -ERESTARTSYS;
@@ -885,7 +894,7 @@ cifs_sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server)
spin_lock(&server->mid_lock);
switch (mid->mid_state) {
- case MID_RESPONSE_RECEIVED:
+ case MID_RESPONSE_READY:
spin_unlock(&server->mid_lock);
return rc;
case MID_RETRY_NEEDED:
@@ -984,6 +993,9 @@ cifs_compound_callback(struct mid_q_entry *mid)
credits.instance = server->reconnect_instance;
add_credits(server, &credits, mid->optype);
+
+ if (mid->mid_state == MID_RESPONSE_RECEIVED)
+ mid->mid_state = MID_RESPONSE_READY;
}
static void
@@ -1204,7 +1216,8 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
send_cancel(server, &rqst[i], midQ[i]);
spin_lock(&server->mid_lock);
midQ[i]->mid_flags |= MID_WAIT_CANCELLED;
- if (midQ[i]->mid_state == MID_REQUEST_SUBMITTED) {
+ if (midQ[i]->mid_state == MID_REQUEST_SUBMITTED ||
+ midQ[i]->mid_state == MID_RESPONSE_RECEIVED) {
midQ[i]->callback = cifs_cancelled_callback;
cancelled_mid[i] = true;
credits[i].value = 0;
@@ -1225,7 +1238,7 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
}
if (!midQ[i]->resp_buf ||
- midQ[i]->mid_state != MID_RESPONSE_RECEIVED) {
+ midQ[i]->mid_state != MID_RESPONSE_READY) {
rc = -EIO;
cifs_dbg(FYI, "Bad MID state?\n");
goto out;
@@ -1412,7 +1425,8 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,
if (rc != 0) {
send_cancel(server, &rqst, midQ);
spin_lock(&server->mid_lock);
- if (midQ->mid_state == MID_REQUEST_SUBMITTED) {
+ if (midQ->mid_state == MID_REQUEST_SUBMITTED ||
+ midQ->mid_state == MID_RESPONSE_RECEIVED) {
/* no longer considered to be "in-flight" */
midQ->callback = release_mid;
spin_unlock(&server->mid_lock);
@@ -1429,7 +1443,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,
}
if (!midQ->resp_buf || !out_buf ||
- midQ->mid_state != MID_RESPONSE_RECEIVED) {
+ midQ->mid_state != MID_RESPONSE_READY) {
rc = -EIO;
cifs_server_dbg(VFS, "Bad MID state?\n");
goto out;
@@ -1553,14 +1567,16 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon,
/* Wait for a reply - allow signals to interrupt. */
rc = wait_event_interruptible(server->response_q,
- (!(midQ->mid_state == MID_REQUEST_SUBMITTED)) ||
+ (!(midQ->mid_state == MID_REQUEST_SUBMITTED ||
+ midQ->mid_state == MID_RESPONSE_RECEIVED)) ||
((server->tcpStatus != CifsGood) &&
(server->tcpStatus != CifsNew)));
/* Were we interrupted by a signal ? */
spin_lock(&server->srv_lock);
if ((rc == -ERESTARTSYS) &&
- (midQ->mid_state == MID_REQUEST_SUBMITTED) &&
+ (midQ->mid_state == MID_REQUEST_SUBMITTED ||
+ midQ->mid_state == MID_RESPONSE_RECEIVED) &&
((server->tcpStatus == CifsGood) ||
(server->tcpStatus == CifsNew))) {
spin_unlock(&server->srv_lock);
@@ -1591,7 +1607,8 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon,
if (rc) {
send_cancel(server, &rqst, midQ);
spin_lock(&server->mid_lock);
- if (midQ->mid_state == MID_REQUEST_SUBMITTED) {
+ if (midQ->mid_state == MID_REQUEST_SUBMITTED ||
+ midQ->mid_state == MID_RESPONSE_RECEIVED) {
/* no longer considered to be "in-flight" */
midQ->callback = release_mid;
spin_unlock(&server->mid_lock);
@@ -1611,7 +1628,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon,
return rc;
/* rcvd frame is ok */
- if (out_buf == NULL || midQ->mid_state != MID_RESPONSE_RECEIVED) {
+ if (out_buf == NULL || midQ->mid_state != MID_RESPONSE_READY) {
rc = -EIO;
cifs_tcon_dbg(VFS, "Bad MID state?\n");
goto out;
diff --git a/fs/smb/common/smb2pdu.h b/fs/smb/common/smb2pdu.h
index bae590eec871..319fb9ffc6a0 100644
--- a/fs/smb/common/smb2pdu.h
+++ b/fs/smb/common/smb2pdu.h
@@ -153,6 +153,28 @@ struct smb2_hdr {
__u8 Signature[16];
} __packed;
+struct smb3_hdr_req {
+ __le32 ProtocolId; /* 0xFE 'S' 'M' 'B' */
+ __le16 StructureSize; /* 64 */
+ __le16 CreditCharge; /* MBZ */
+ __le16 ChannelSequence; /* See MS-SMB2 3.2.4.1 and 3.2.7.1 */
+ __le16 Reserved;
+ __le16 Command;
+ __le16 CreditRequest; /* CreditResponse */
+ __le32 Flags;
+ __le32 NextCommand;
+ __le64 MessageId;
+ union {
+ struct {
+ __le32 ProcessId;
+ __le32 TreeId;
+ } __packed SyncId;
+ __le64 AsyncId;
+ } __packed Id;
+ __le64 SessionId;
+ __u8 Signature[16];
+} __packed;
+
struct smb2_pdu {
struct smb2_hdr hdr;
__le16 StructureSize2; /* size of wct area (varies, request specific) */
@@ -384,7 +406,7 @@ struct smb2_tree_disconnect_rsp {
/* Capabilities flags */
#define SMB2_GLOBAL_CAP_DFS 0x00000001
#define SMB2_GLOBAL_CAP_LEASING 0x00000002 /* Resp only New to SMB2.1 */
-#define SMB2_GLOBAL_CAP_LARGE_MTU 0X00000004 /* Resp only New to SMB2.1 */
+#define SMB2_GLOBAL_CAP_LARGE_MTU 0x00000004 /* Resp only New to SMB2.1 */
#define SMB2_GLOBAL_CAP_MULTI_CHANNEL 0x00000008 /* New to SMB3 */
#define SMB2_GLOBAL_CAP_PERSISTENT_HANDLES 0x00000010 /* New to SMB3 */
#define SMB2_GLOBAL_CAP_DIRECTORY_LEASING 0x00000020 /* New to SMB3 */
diff --git a/fs/smb/server/Kconfig b/fs/smb/server/Kconfig
index 7055cb5d2880..cabe6a843c6a 100644
--- a/fs/smb/server/Kconfig
+++ b/fs/smb/server/Kconfig
@@ -1,10 +1,11 @@
config SMB_SERVER
- tristate "SMB3 server support (EXPERIMENTAL)"
+ tristate "SMB3 server support"
depends on INET
depends on MULTIUSER
depends on FILE_LOCKING
select NLS
select NLS_UTF8
+ select NLS_UCS2_UTILS
select CRYPTO
select CRYPTO_MD5
select CRYPTO_HMAC
diff --git a/fs/smb/server/asn1.c b/fs/smb/server/asn1.c
index cc6384f79675..4a4b2b03ff33 100644
--- a/fs/smb/server/asn1.c
+++ b/fs/smb/server/asn1.c
@@ -214,12 +214,10 @@ static int ksmbd_neg_token_alloc(void *context, size_t hdrlen,
{
struct ksmbd_conn *conn = context;
- conn->mechToken = kmalloc(vlen + 1, GFP_KERNEL);
+ conn->mechToken = kmemdup_nul(value, vlen, GFP_KERNEL);
if (!conn->mechToken)
return -ENOMEM;
- memcpy(conn->mechToken, value, vlen);
- conn->mechToken[vlen] = '\0';
return 0;
}
diff --git a/fs/smb/server/auth.c b/fs/smb/server/auth.c
index 5e5e120edcc2..229a6527870d 100644
--- a/fs/smb/server/auth.c
+++ b/fs/smb/server/auth.c
@@ -355,6 +355,9 @@ int ksmbd_decode_ntlmssp_auth_blob(struct authenticate_message *authblob,
if (blob_len < (u64)sess_key_off + sess_key_len)
return -EINVAL;
+ if (sess_key_len > CIFS_KEY_SIZE)
+ return -EINVAL;
+
ctx_arc4 = kmalloc(sizeof(*ctx_arc4), GFP_KERNEL);
if (!ctx_arc4)
return -ENOMEM;
@@ -1029,11 +1032,15 @@ static struct scatterlist *ksmbd_init_sg(struct kvec *iov, unsigned int nvec,
{
struct scatterlist *sg;
unsigned int assoc_data_len = sizeof(struct smb2_transform_hdr) - 20;
- int i, nr_entries[3] = {0}, total_entries = 0, sg_idx = 0;
+ int i, *nr_entries, total_entries = 0, sg_idx = 0;
if (!nvec)
return NULL;
+ nr_entries = kcalloc(nvec, sizeof(int), GFP_KERNEL);
+ if (!nr_entries)
+ return NULL;
+
for (i = 0; i < nvec - 1; i++) {
unsigned long kaddr = (unsigned long)iov[i + 1].iov_base;
@@ -1051,8 +1058,10 @@ static struct scatterlist *ksmbd_init_sg(struct kvec *iov, unsigned int nvec,
total_entries += 2;
sg = kmalloc_array(total_entries, sizeof(struct scatterlist), GFP_KERNEL);
- if (!sg)
+ if (!sg) {
+ kfree(nr_entries);
return NULL;
+ }
sg_init_table(sg, total_entries);
smb2_sg_set_buf(&sg[sg_idx++], iov[0].iov_base + 24, assoc_data_len);
@@ -1086,6 +1095,7 @@ static struct scatterlist *ksmbd_init_sg(struct kvec *iov, unsigned int nvec,
}
}
smb2_sg_set_buf(&sg[sg_idx], sign, SMB2_SIGNATURE_SIZE);
+ kfree(nr_entries);
return sg;
}
diff --git a/fs/smb/server/connection.c b/fs/smb/server/connection.c
index 2a717d158f02..4b38c3a285f6 100644
--- a/fs/smb/server/connection.c
+++ b/fs/smb/server/connection.c
@@ -84,6 +84,8 @@ struct ksmbd_conn *ksmbd_conn_alloc(void)
spin_lock_init(&conn->llist_lock);
INIT_LIST_HEAD(&conn->lock_list);
+ init_rwsem(&conn->session_lock);
+
down_write(&conn_list_lock);
list_add(&conn->conns_list, &conn_list);
up_write(&conn_list_lock);
@@ -123,28 +125,22 @@ void ksmbd_conn_enqueue_request(struct ksmbd_work *work)
}
}
-int ksmbd_conn_try_dequeue_request(struct ksmbd_work *work)
+void ksmbd_conn_try_dequeue_request(struct ksmbd_work *work)
{
struct ksmbd_conn *conn = work->conn;
- int ret = 1;
if (list_empty(&work->request_entry) &&
list_empty(&work->async_request_entry))
- return 0;
+ return;
- if (!work->multiRsp)
- atomic_dec(&conn->req_running);
- if (!work->multiRsp) {
- spin_lock(&conn->request_lock);
- list_del_init(&work->request_entry);
- spin_unlock(&conn->request_lock);
- if (work->asynchronous)
- release_async_work(work);
- ret = 0;
- }
+ atomic_dec(&conn->req_running);
+ spin_lock(&conn->request_lock);
+ list_del_init(&work->request_entry);
+ spin_unlock(&conn->request_lock);
+ if (work->asynchronous)
+ release_async_work(work);
wake_up_all(&conn->req_running_q);
- return ret;
}
void ksmbd_conn_lock(struct ksmbd_conn *conn)
@@ -193,41 +189,25 @@ void ksmbd_conn_wait_idle(struct ksmbd_conn *conn, u64 sess_id)
int ksmbd_conn_write(struct ksmbd_work *work)
{
struct ksmbd_conn *conn = work->conn;
- size_t len = 0;
int sent;
- struct kvec iov[3];
- int iov_idx = 0;
if (!work->response_buf) {
pr_err("NULL response header\n");
return -EINVAL;
}
- if (work->tr_buf) {
- iov[iov_idx] = (struct kvec) { work->tr_buf,
- sizeof(struct smb2_transform_hdr) + 4 };
- len += iov[iov_idx++].iov_len;
- }
+ if (work->send_no_response)
+ return 0;
- if (work->aux_payload_sz) {
- iov[iov_idx] = (struct kvec) { work->response_buf, work->resp_hdr_sz };
- len += iov[iov_idx++].iov_len;
- iov[iov_idx] = (struct kvec) { work->aux_payload_buf, work->aux_payload_sz };
- len += iov[iov_idx++].iov_len;
- } else {
- if (work->tr_buf)
- iov[iov_idx].iov_len = work->resp_hdr_sz;
- else
- iov[iov_idx].iov_len = get_rfc1002_len(work->response_buf) + 4;
- iov[iov_idx].iov_base = work->response_buf;
- len += iov[iov_idx++].iov_len;
- }
+ if (!work->iov_idx)
+ return -EINVAL;
ksmbd_conn_lock(conn);
- sent = conn->transport->ops->writev(conn->transport, &iov[0],
- iov_idx, len,
- work->need_invalidate_rkey,
- work->remote_key);
+ sent = conn->transport->ops->writev(conn->transport, work->iov,
+ work->iov_cnt,
+ get_rfc1002_len(work->iov[0].iov_base) + 4,
+ work->need_invalidate_rkey,
+ work->remote_key);
ksmbd_conn_unlock(conn);
if (sent < 0) {
diff --git a/fs/smb/server/connection.h b/fs/smb/server/connection.h
index ad8dfaa48ffb..3c005246a32e 100644
--- a/fs/smb/server/connection.h
+++ b/fs/smb/server/connection.h
@@ -50,6 +50,7 @@ struct ksmbd_conn {
struct nls_table *local_nls;
struct unicode_map *um;
struct list_head conns_list;
+ struct rw_semaphore session_lock;
/* smb session 1 per user */
struct xarray sessions;
unsigned long last_active;
@@ -158,7 +159,7 @@ int ksmbd_conn_rdma_write(struct ksmbd_conn *conn,
struct smb2_buffer_desc_v1 *desc,
unsigned int desc_len);
void ksmbd_conn_enqueue_request(struct ksmbd_work *work);
-int ksmbd_conn_try_dequeue_request(struct ksmbd_work *work);
+void ksmbd_conn_try_dequeue_request(struct ksmbd_work *work);
void ksmbd_conn_init_server_callbacks(struct ksmbd_conn_ops *ops);
int ksmbd_conn_handler_loop(void *p);
int ksmbd_conn_transport_init(void);
diff --git a/fs/smb/server/ksmbd_work.c b/fs/smb/server/ksmbd_work.c
index 14b9caebf7a4..51def3ca74c0 100644
--- a/fs/smb/server/ksmbd_work.c
+++ b/fs/smb/server/ksmbd_work.c
@@ -27,18 +27,35 @@ struct ksmbd_work *ksmbd_alloc_work_struct(void)
INIT_LIST_HEAD(&work->async_request_entry);
INIT_LIST_HEAD(&work->fp_entry);
INIT_LIST_HEAD(&work->interim_entry);
+ INIT_LIST_HEAD(&work->aux_read_list);
+ work->iov_alloc_cnt = 4;
+ work->iov = kcalloc(work->iov_alloc_cnt, sizeof(struct kvec),
+ GFP_KERNEL);
+ if (!work->iov) {
+ kmem_cache_free(work_cache, work);
+ work = NULL;
+ }
}
return work;
}
void ksmbd_free_work_struct(struct ksmbd_work *work)
{
+ struct aux_read *ar, *tmp;
+
WARN_ON(work->saved_cred != NULL);
kvfree(work->response_buf);
- kvfree(work->aux_payload_buf);
+
+ list_for_each_entry_safe(ar, tmp, &work->aux_read_list, entry) {
+ kvfree(ar->buf);
+ list_del(&ar->entry);
+ kfree(ar);
+ }
+
kfree(work->tr_buf);
kvfree(work->request_buf);
+ kfree(work->iov);
if (work->async_id)
ksmbd_release_id(&work->conn->async_ida, work->async_id);
kmem_cache_free(work_cache, work);
@@ -77,3 +94,77 @@ bool ksmbd_queue_work(struct ksmbd_work *work)
{
return queue_work(ksmbd_wq, &work->work);
}
+
+static int ksmbd_realloc_iov_pin(struct ksmbd_work *work, void *ib,
+ unsigned int ib_len)
+{
+
+ if (work->iov_alloc_cnt <= work->iov_cnt) {
+ struct kvec *new;
+
+ work->iov_alloc_cnt += 4;
+ new = krealloc(work->iov,
+ sizeof(struct kvec) * work->iov_alloc_cnt,
+ GFP_KERNEL | __GFP_ZERO);
+ if (!new)
+ return -ENOMEM;
+ work->iov = new;
+ }
+
+ work->iov[++work->iov_idx].iov_base = ib;
+ work->iov[work->iov_idx].iov_len = ib_len;
+ work->iov_cnt++;
+
+ return 0;
+}
+
+static int __ksmbd_iov_pin_rsp(struct ksmbd_work *work, void *ib, int len,
+ void *aux_buf, unsigned int aux_size)
+{
+ /* Plus rfc_length size on first iov */
+ if (!work->iov_idx) {
+ work->iov[work->iov_idx].iov_base = work->response_buf;
+ *(__be32 *)work->iov[0].iov_base = 0;
+ work->iov[work->iov_idx].iov_len = 4;
+ work->iov_cnt++;
+ }
+
+ ksmbd_realloc_iov_pin(work, ib, len);
+ inc_rfc1001_len(work->iov[0].iov_base, len);
+
+ if (aux_size) {
+ struct aux_read *ar;
+
+ ksmbd_realloc_iov_pin(work, aux_buf, aux_size);
+ inc_rfc1001_len(work->iov[0].iov_base, aux_size);
+
+ ar = kmalloc(sizeof(struct aux_read), GFP_KERNEL);
+ if (!ar)
+ return -ENOMEM;
+
+ ar->buf = aux_buf;
+ list_add(&ar->entry, &work->aux_read_list);
+ }
+
+ return 0;
+}
+
+int ksmbd_iov_pin_rsp(struct ksmbd_work *work, void *ib, int len)
+{
+ return __ksmbd_iov_pin_rsp(work, ib, len, NULL, 0);
+}
+
+int ksmbd_iov_pin_rsp_read(struct ksmbd_work *work, void *ib, int len,
+ void *aux_buf, unsigned int aux_size)
+{
+ return __ksmbd_iov_pin_rsp(work, ib, len, aux_buf, aux_size);
+}
+
+int allocate_interim_rsp_buf(struct ksmbd_work *work)
+{
+ work->response_buf = kzalloc(MAX_CIFS_SMALL_BUFFER_SIZE, GFP_KERNEL);
+ if (!work->response_buf)
+ return -ENOMEM;
+ work->response_sz = MAX_CIFS_SMALL_BUFFER_SIZE;
+ return 0;
+}
diff --git a/fs/smb/server/ksmbd_work.h b/fs/smb/server/ksmbd_work.h
index f8ae6144c0ae..8ca2c813246e 100644
--- a/fs/smb/server/ksmbd_work.h
+++ b/fs/smb/server/ksmbd_work.h
@@ -19,6 +19,11 @@ enum {
KSMBD_WORK_CLOSED,
};
+struct aux_read {
+ void *buf;
+ struct list_head entry;
+};
+
/* one of these for every pending CIFS request at the connection */
struct ksmbd_work {
/* Server corresponding to this mid */
@@ -31,13 +36,19 @@ struct ksmbd_work {
/* Response buffer */
void *response_buf;
- /* Read data buffer */
- void *aux_payload_buf;
+ struct list_head aux_read_list;
+
+ struct kvec *iov;
+ int iov_alloc_cnt;
+ int iov_cnt;
+ int iov_idx;
/* Next cmd hdr in compound req buf*/
int next_smb2_rcv_hdr_off;
/* Next cmd hdr in compound rsp buf*/
int next_smb2_rsp_hdr_off;
+ /* Current cmd hdr in compound rsp buf*/
+ int curr_smb2_rsp_hdr_off;
/*
* Current Local FID assigned compound response if SMB2 CREATE
@@ -53,16 +64,11 @@ struct ksmbd_work {
unsigned int credits_granted;
/* response smb header size */
- unsigned int resp_hdr_sz;
unsigned int response_sz;
- /* Read data count */
- unsigned int aux_payload_sz;
void *tr_buf;
unsigned char state;
- /* Multiple responses for one request e.g. SMB ECHO */
- bool multiRsp:1;
/* No response for cancelled request */
bool send_no_response:1;
/* Request is encrypted */
@@ -96,6 +102,15 @@ static inline void *ksmbd_resp_buf_next(struct ksmbd_work *work)
}
/**
+ * ksmbd_resp_buf_curr - Get current buffer on compound response.
+ * @work: smb work containing response buffer
+ */
+static inline void *ksmbd_resp_buf_curr(struct ksmbd_work *work)
+{
+ return work->response_buf + work->curr_smb2_rsp_hdr_off + 4;
+}
+
+/**
* ksmbd_req_buf_next - Get next buffer on compound request.
* @work: smb work containing response buffer
*/
@@ -113,5 +128,8 @@ int ksmbd_work_pool_init(void);
int ksmbd_workqueue_init(void);
void ksmbd_workqueue_destroy(void);
bool ksmbd_queue_work(struct ksmbd_work *work);
-
+int ksmbd_iov_pin_rsp_read(struct ksmbd_work *work, void *ib, int len,
+ void *aux_buf, unsigned int aux_size);
+int ksmbd_iov_pin_rsp(struct ksmbd_work *work, void *ib, int len);
+int allocate_interim_rsp_buf(struct ksmbd_work *work);
#endif /* __KSMBD_WORK_H__ */
diff --git a/fs/smb/server/mgmt/share_config.h b/fs/smb/server/mgmt/share_config.h
index 3fd338293942..5f591751b923 100644
--- a/fs/smb/server/mgmt/share_config.h
+++ b/fs/smb/server/mgmt/share_config.h
@@ -34,29 +34,22 @@ struct ksmbd_share_config {
#define KSMBD_SHARE_INVALID_UID ((__u16)-1)
#define KSMBD_SHARE_INVALID_GID ((__u16)-1)
-static inline int share_config_create_mode(struct ksmbd_share_config *share,
- umode_t posix_mode)
+static inline umode_t
+share_config_create_mode(struct ksmbd_share_config *share,
+ umode_t posix_mode)
{
- if (!share->force_create_mode) {
- if (!posix_mode)
- return share->create_mask;
- else
- return posix_mode & share->create_mask;
- }
- return share->force_create_mode & share->create_mask;
+ umode_t mode = (posix_mode ?: (umode_t)-1) & share->create_mask;
+
+ return mode | share->force_create_mode;
}
-static inline int share_config_directory_mode(struct ksmbd_share_config *share,
- umode_t posix_mode)
+static inline umode_t
+share_config_directory_mode(struct ksmbd_share_config *share,
+ umode_t posix_mode)
{
- if (!share->force_directory_mode) {
- if (!posix_mode)
- return share->directory_mask;
- else
- return posix_mode & share->directory_mask;
- }
+ umode_t mode = (posix_mode ?: (umode_t)-1) & share->directory_mask;
- return share->force_directory_mode & share->directory_mask;
+ return mode | share->force_directory_mode;
}
static inline int test_share_config_flag(struct ksmbd_share_config *share,
diff --git a/fs/smb/server/mgmt/tree_connect.c b/fs/smb/server/mgmt/tree_connect.c
index 408cddf2f094..d2c81a8a11dd 100644
--- a/fs/smb/server/mgmt/tree_connect.c
+++ b/fs/smb/server/mgmt/tree_connect.c
@@ -73,7 +73,10 @@ ksmbd_tree_conn_connect(struct ksmbd_conn *conn, struct ksmbd_session *sess,
tree_conn->user = sess->user;
tree_conn->share_conf = sc;
+ tree_conn->t_state = TREE_NEW;
status.tree_conn = tree_conn;
+ atomic_set(&tree_conn->refcount, 1);
+ init_waitqueue_head(&tree_conn->refcount_q);
ret = xa_err(xa_store(&sess->tree_conns, tree_conn->id, tree_conn,
GFP_KERNEL));
@@ -93,14 +96,33 @@ out_error:
return status;
}
+void ksmbd_tree_connect_put(struct ksmbd_tree_connect *tcon)
+{
+ /*
+ * Checking waitqueue to releasing tree connect on
+ * tree disconnect. waitqueue_active is safe because it
+ * uses atomic operation for condition.
+ */
+ if (!atomic_dec_return(&tcon->refcount) &&
+ waitqueue_active(&tcon->refcount_q))
+ wake_up(&tcon->refcount_q);
+}
+
int ksmbd_tree_conn_disconnect(struct ksmbd_session *sess,
struct ksmbd_tree_connect *tree_conn)
{
int ret;
+ write_lock(&sess->tree_conns_lock);
+ xa_erase(&sess->tree_conns, tree_conn->id);
+ write_unlock(&sess->tree_conns_lock);
+
+ if (!atomic_dec_and_test(&tree_conn->refcount))
+ wait_event(tree_conn->refcount_q,
+ atomic_read(&tree_conn->refcount) == 0);
+
ret = ksmbd_ipc_tree_disconnect_request(sess->id, tree_conn->id);
ksmbd_release_tree_conn_id(sess, tree_conn->id);
- xa_erase(&sess->tree_conns, tree_conn->id);
ksmbd_share_config_put(tree_conn->share_conf);
kfree(tree_conn);
return ret;
@@ -111,11 +133,15 @@ struct ksmbd_tree_connect *ksmbd_tree_conn_lookup(struct ksmbd_session *sess,
{
struct ksmbd_tree_connect *tcon;
+ read_lock(&sess->tree_conns_lock);
tcon = xa_load(&sess->tree_conns, id);
if (tcon) {
- if (test_bit(TREE_CONN_EXPIRE, &tcon->status))
+ if (tcon->t_state != TREE_CONNECTED)
+ tcon = NULL;
+ else if (!atomic_inc_not_zero(&tcon->refcount))
tcon = NULL;
}
+ read_unlock(&sess->tree_conns_lock);
return tcon;
}
@@ -129,8 +155,18 @@ int ksmbd_tree_conn_session_logoff(struct ksmbd_session *sess)
if (!sess)
return -EINVAL;
- xa_for_each(&sess->tree_conns, id, tc)
+ xa_for_each(&sess->tree_conns, id, tc) {
+ write_lock(&sess->tree_conns_lock);
+ if (tc->t_state == TREE_DISCONNECTED) {
+ write_unlock(&sess->tree_conns_lock);
+ ret = -ENOENT;
+ continue;
+ }
+ tc->t_state = TREE_DISCONNECTED;
+ write_unlock(&sess->tree_conns_lock);
+
ret |= ksmbd_tree_conn_disconnect(sess, tc);
+ }
xa_destroy(&sess->tree_conns);
return ret;
}
diff --git a/fs/smb/server/mgmt/tree_connect.h b/fs/smb/server/mgmt/tree_connect.h
index 562d647ad9fa..6377a70b811c 100644
--- a/fs/smb/server/mgmt/tree_connect.h
+++ b/fs/smb/server/mgmt/tree_connect.h
@@ -14,7 +14,11 @@ struct ksmbd_share_config;
struct ksmbd_user;
struct ksmbd_conn;
-#define TREE_CONN_EXPIRE 1
+enum {
+ TREE_NEW = 0,
+ TREE_CONNECTED,
+ TREE_DISCONNECTED
+};
struct ksmbd_tree_connect {
int id;
@@ -27,7 +31,9 @@ struct ksmbd_tree_connect {
int maximal_access;
bool posix_extensions;
- unsigned long status;
+ atomic_t refcount;
+ wait_queue_head_t refcount_q;
+ unsigned int t_state;
};
struct ksmbd_tree_conn_status {
@@ -46,6 +52,7 @@ struct ksmbd_session;
struct ksmbd_tree_conn_status
ksmbd_tree_conn_connect(struct ksmbd_conn *conn, struct ksmbd_session *sess,
const char *share_name);
+void ksmbd_tree_connect_put(struct ksmbd_tree_connect *tcon);
int ksmbd_tree_conn_disconnect(struct ksmbd_session *sess,
struct ksmbd_tree_connect *tree_conn);
diff --git a/fs/smb/server/mgmt/user_session.c b/fs/smb/server/mgmt/user_session.c
index 8a5dcab05614..15f68ee05089 100644
--- a/fs/smb/server/mgmt/user_session.c
+++ b/fs/smb/server/mgmt/user_session.c
@@ -174,7 +174,7 @@ static void ksmbd_expire_session(struct ksmbd_conn *conn)
unsigned long id;
struct ksmbd_session *sess;
- down_write(&sessions_table_lock);
+ down_write(&conn->session_lock);
xa_for_each(&conn->sessions, id, sess) {
if (sess->state != SMB2_SESSION_VALID ||
time_after(jiffies,
@@ -185,7 +185,7 @@ static void ksmbd_expire_session(struct ksmbd_conn *conn)
continue;
}
}
- up_write(&sessions_table_lock);
+ up_write(&conn->session_lock);
}
int ksmbd_session_register(struct ksmbd_conn *conn,
@@ -227,7 +227,9 @@ void ksmbd_sessions_deregister(struct ksmbd_conn *conn)
}
}
}
+ up_write(&sessions_table_lock);
+ down_write(&conn->session_lock);
xa_for_each(&conn->sessions, id, sess) {
unsigned long chann_id;
struct channel *chann;
@@ -244,7 +246,7 @@ void ksmbd_sessions_deregister(struct ksmbd_conn *conn)
ksmbd_session_destroy(sess);
}
}
- up_write(&sessions_table_lock);
+ up_write(&conn->session_lock);
}
struct ksmbd_session *ksmbd_session_lookup(struct ksmbd_conn *conn,
@@ -252,9 +254,11 @@ struct ksmbd_session *ksmbd_session_lookup(struct ksmbd_conn *conn,
{
struct ksmbd_session *sess;
+ down_read(&conn->session_lock);
sess = xa_load(&conn->sessions, id);
if (sess)
sess->last_active = jiffies;
+ up_read(&conn->session_lock);
return sess;
}
@@ -351,6 +355,7 @@ static struct ksmbd_session *__session_create(int protocol)
xa_init(&sess->ksmbd_chann_list);
xa_init(&sess->rpc_handle_list);
sess->sequence_number = 1;
+ rwlock_init(&sess->tree_conns_lock);
ret = __init_smb2_session(sess);
if (ret)
diff --git a/fs/smb/server/mgmt/user_session.h b/fs/smb/server/mgmt/user_session.h
index f99d475b28db..63cb08fffde8 100644
--- a/fs/smb/server/mgmt/user_session.h
+++ b/fs/smb/server/mgmt/user_session.h
@@ -60,6 +60,7 @@ struct ksmbd_session {
struct ksmbd_file_table file_table;
unsigned long last_active;
+ rwlock_t tree_conns_lock;
};
static inline int test_session_flag(struct ksmbd_session *sess, int bit)
diff --git a/fs/smb/server/oplock.c b/fs/smb/server/oplock.c
index 844b303baf29..9bc0103720f5 100644
--- a/fs/smb/server/oplock.c
+++ b/fs/smb/server/oplock.c
@@ -616,15 +616,6 @@ static int oplock_break_pending(struct oplock_info *opinfo, int req_op_level)
return 0;
}
-static inline int allocate_oplock_break_buf(struct ksmbd_work *work)
-{
- work->response_buf = kzalloc(MAX_CIFS_SMALL_BUFFER_SIZE, GFP_KERNEL);
- if (!work->response_buf)
- return -ENOMEM;
- work->response_sz = MAX_CIFS_SMALL_BUFFER_SIZE;
- return 0;
-}
-
/**
* __smb2_oplock_break_noti() - send smb2 oplock break cmd from conn
* to client
@@ -639,7 +630,6 @@ static void __smb2_oplock_break_noti(struct work_struct *wk)
{
struct smb2_oplock_break *rsp = NULL;
struct ksmbd_work *work = container_of(wk, struct ksmbd_work, work);
- struct ksmbd_conn *conn = work->conn;
struct oplock_break_info *br_info = work->request_buf;
struct smb2_hdr *rsp_hdr;
struct ksmbd_file *fp;
@@ -648,7 +638,7 @@ static void __smb2_oplock_break_noti(struct work_struct *wk)
if (!fp)
goto out;
- if (allocate_oplock_break_buf(work)) {
+ if (allocate_interim_rsp_buf(work)) {
pr_err("smb2_allocate_rsp_buf failed! ");
ksmbd_fd_put(work, fp);
goto out;
@@ -656,8 +646,6 @@ static void __smb2_oplock_break_noti(struct work_struct *wk)
rsp_hdr = smb2_get_msg(work->response_buf);
memset(rsp_hdr, 0, sizeof(struct smb2_hdr) + 2);
- *(__be32 *)work->response_buf =
- cpu_to_be32(conn->vals->header_size);
rsp_hdr->ProtocolId = SMB2_PROTO_NUMBER;
rsp_hdr->StructureSize = SMB2_HEADER_STRUCTURE_SIZE;
rsp_hdr->CreditRequest = cpu_to_le16(0);
@@ -684,13 +672,15 @@ static void __smb2_oplock_break_noti(struct work_struct *wk)
rsp->PersistentFid = fp->persistent_id;
rsp->VolatileFid = fp->volatile_id;
- inc_rfc1001_len(work->response_buf, 24);
+ ksmbd_fd_put(work, fp);
+ if (ksmbd_iov_pin_rsp(work, (void *)rsp,
+ sizeof(struct smb2_oplock_break)))
+ goto out;
ksmbd_debug(OPLOCK,
"sending oplock break v_id %llu p_id = %llu lock level = %d\n",
rsp->VolatileFid, rsp->PersistentFid, rsp->OplockLevel);
- ksmbd_fd_put(work, fp);
ksmbd_conn_write(work);
out:
@@ -751,18 +741,15 @@ static void __smb2_lease_break_noti(struct work_struct *wk)
struct smb2_lease_break *rsp = NULL;
struct ksmbd_work *work = container_of(wk, struct ksmbd_work, work);
struct lease_break_info *br_info = work->request_buf;
- struct ksmbd_conn *conn = work->conn;
struct smb2_hdr *rsp_hdr;
- if (allocate_oplock_break_buf(work)) {
+ if (allocate_interim_rsp_buf(work)) {
ksmbd_debug(OPLOCK, "smb2_allocate_rsp_buf failed! ");
goto out;
}
rsp_hdr = smb2_get_msg(work->response_buf);
memset(rsp_hdr, 0, sizeof(struct smb2_hdr) + 2);
- *(__be32 *)work->response_buf =
- cpu_to_be32(conn->vals->header_size);
rsp_hdr->ProtocolId = SMB2_PROTO_NUMBER;
rsp_hdr->StructureSize = SMB2_HEADER_STRUCTURE_SIZE;
rsp_hdr->CreditRequest = cpu_to_le16(0);
@@ -791,7 +778,9 @@ static void __smb2_lease_break_noti(struct work_struct *wk)
rsp->AccessMaskHint = 0;
rsp->ShareMaskHint = 0;
- inc_rfc1001_len(work->response_buf, 44);
+ if (ksmbd_iov_pin_rsp(work, (void *)rsp,
+ sizeof(struct smb2_lease_break)))
+ goto out;
ksmbd_conn_write(work);
@@ -1492,7 +1481,7 @@ struct create_context *smb2_find_context_vals(void *open_req, const char *tag, i
name_len < 4 ||
name_off + name_len > cc_len ||
(value_off & 0x7) != 0 ||
- (value_off && (value_off < name_off + name_len)) ||
+ (value_len && value_off < name_off + (name_len < 8 ? 8 : name_len)) ||
((u64)value_off + value_len > cc_len))
return ERR_PTR(-EINVAL);
diff --git a/fs/smb/server/server.c b/fs/smb/server/server.c
index 9df121bdf349..3079e607c5fe 100644
--- a/fs/smb/server/server.c
+++ b/fs/smb/server/server.c
@@ -115,8 +115,10 @@ static int __process_request(struct ksmbd_work *work, struct ksmbd_conn *conn,
if (check_conn_state(work))
return SERVER_HANDLER_CONTINUE;
- if (ksmbd_verify_smb_message(work))
+ if (ksmbd_verify_smb_message(work)) {
+ conn->ops->set_rsp_status(work, STATUS_INVALID_PARAMETER);
return SERVER_HANDLER_ABORT;
+ }
command = conn->ops->get_cmd_val(work);
*cmd = command;
@@ -163,6 +165,7 @@ static void __handle_ksmbd_work(struct ksmbd_work *work,
{
u16 command = 0;
int rc;
+ bool is_chained = false;
if (conn->ops->allocate_rsp_buf(work))
return;
@@ -229,16 +232,17 @@ static void __handle_ksmbd_work(struct ksmbd_work *work,
}
}
+ is_chained = is_chained_smb2_message(work);
+
if (work->sess &&
(work->sess->sign || smb3_11_final_sess_setup_resp(work) ||
conn->ops->is_sign_req(work, command)))
conn->ops->set_sign_rsp(work);
- } while (is_chained_smb2_message(work));
-
- if (work->send_no_response)
- return;
+ } while (is_chained == true);
send:
+ if (work->tcon)
+ ksmbd_tree_connect_put(work->tcon);
smb3_preauth_hash_rsp(work);
if (work->sess && work->sess->enc && work->encrypted &&
conn->ops->encrypt_resp) {
@@ -590,8 +594,6 @@ static int __init ksmbd_server_init(void)
if (ret)
goto err_crypto_destroy;
- pr_warn_once("The ksmbd server is experimental\n");
-
return 0;
err_crypto_destroy:
diff --git a/fs/smb/server/smb2misc.c b/fs/smb/server/smb2misc.c
index e881df1d10cb..23bd3d1209df 100644
--- a/fs/smb/server/smb2misc.c
+++ b/fs/smb/server/smb2misc.c
@@ -440,10 +440,8 @@ int ksmbd_smb2_check_message(struct ksmbd_work *work)
validate_credit:
if ((work->conn->vals->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU) &&
- smb2_validate_credit_charge(work->conn, hdr)) {
- work->conn->ops->set_rsp_status(work, STATUS_INVALID_PARAMETER);
+ smb2_validate_credit_charge(work->conn, hdr))
return 1;
- }
return 0;
}
diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c
index 7cc1b0c47d0a..93262ca3f58a 100644
--- a/fs/smb/server/smb2pdu.c
+++ b/fs/smb/server/smb2pdu.c
@@ -145,12 +145,18 @@ void smb2_set_err_rsp(struct ksmbd_work *work)
err_rsp = smb2_get_msg(work->response_buf);
if (err_rsp->hdr.Status != STATUS_STOPPED_ON_SYMLINK) {
+ int err;
+
err_rsp->StructureSize = SMB2_ERROR_STRUCTURE_SIZE2_LE;
err_rsp->ErrorContextCount = 0;
err_rsp->Reserved = 0;
err_rsp->ByteCount = 0;
err_rsp->ErrorData[0] = 0;
- inc_rfc1001_len(work->response_buf, SMB2_ERROR_STRUCTURE_SIZE2);
+ err = ksmbd_iov_pin_rsp(work, (void *)err_rsp,
+ __SMB2_HEADER_STRUCTURE_SIZE +
+ SMB2_ERROR_STRUCTURE_SIZE2);
+ if (err)
+ work->send_no_response = 1;
}
}
@@ -225,11 +231,12 @@ void set_smb2_rsp_status(struct ksmbd_work *work, __le32 err)
{
struct smb2_hdr *rsp_hdr;
- if (work->next_smb2_rcv_hdr_off)
- rsp_hdr = ksmbd_resp_buf_next(work);
- else
- rsp_hdr = smb2_get_msg(work->response_buf);
+ rsp_hdr = smb2_get_msg(work->response_buf);
rsp_hdr->Status = err;
+
+ work->iov_idx = 0;
+ work->iov_cnt = 0;
+ work->next_smb2_rcv_hdr_off = 0;
smb2_set_err_rsp(work);
}
@@ -245,9 +252,7 @@ int init_smb2_neg_rsp(struct ksmbd_work *work)
struct smb2_hdr *rsp_hdr;
struct smb2_negotiate_rsp *rsp;
struct ksmbd_conn *conn = work->conn;
-
- *(__be32 *)work->response_buf =
- cpu_to_be32(conn->vals->header_size);
+ int err;
rsp_hdr = smb2_get_msg(work->response_buf);
memset(rsp_hdr, 0, sizeof(struct smb2_hdr) + 2);
@@ -286,12 +291,13 @@ int init_smb2_neg_rsp(struct ksmbd_work *work)
rsp->SecurityBufferLength = cpu_to_le16(AUTH_GSS_LENGTH);
ksmbd_copy_gss_neg_header((char *)(&rsp->hdr) +
le16_to_cpu(rsp->SecurityBufferOffset));
- inc_rfc1001_len(work->response_buf,
- sizeof(struct smb2_negotiate_rsp) -
- sizeof(struct smb2_hdr) + AUTH_GSS_LENGTH);
rsp->SecurityMode = SMB2_NEGOTIATE_SIGNING_ENABLED_LE;
if (server_conf.signing == KSMBD_CONFIG_OPT_MANDATORY)
rsp->SecurityMode |= SMB2_NEGOTIATE_SIGNING_REQUIRED_LE;
+ err = ksmbd_iov_pin_rsp(work, rsp,
+ sizeof(struct smb2_negotiate_rsp) + AUTH_GSS_LENGTH);
+ if (err)
+ return err;
conn->use_spnego = true;
ksmbd_conn_set_need_negotiate(conn);
@@ -390,11 +396,12 @@ static void init_chained_smb2_rsp(struct ksmbd_work *work)
next_hdr_offset = le32_to_cpu(req->NextCommand);
new_len = ALIGN(len, 8);
- inc_rfc1001_len(work->response_buf,
- sizeof(struct smb2_hdr) + new_len - len);
+ work->iov[work->iov_idx].iov_len += (new_len - len);
+ inc_rfc1001_len(work->response_buf, new_len - len);
rsp->NextCommand = cpu_to_le32(new_len);
work->next_smb2_rcv_hdr_off += next_hdr_offset;
+ work->curr_smb2_rsp_hdr_off = work->next_smb2_rsp_hdr_off;
work->next_smb2_rsp_hdr_off += new_len;
ksmbd_debug(SMB,
"Compound req new_len = %d rcv off = %d rsp off = %d\n",
@@ -470,10 +477,10 @@ bool is_chained_smb2_message(struct ksmbd_work *work)
len = len - get_rfc1002_len(work->response_buf);
if (len) {
ksmbd_debug(SMB, "padding len %u\n", len);
+ work->iov[work->iov_idx].iov_len += len;
inc_rfc1001_len(work->response_buf, len);
- if (work->aux_payload_sz)
- work->aux_payload_sz += len;
}
+ work->curr_smb2_rsp_hdr_off = work->next_smb2_rsp_hdr_off;
}
return false;
}
@@ -488,11 +495,8 @@ int init_smb2_rsp_hdr(struct ksmbd_work *work)
{
struct smb2_hdr *rsp_hdr = smb2_get_msg(work->response_buf);
struct smb2_hdr *rcv_hdr = smb2_get_msg(work->request_buf);
- struct ksmbd_conn *conn = work->conn;
memset(rsp_hdr, 0, sizeof(struct smb2_hdr) + 2);
- *(__be32 *)work->response_buf =
- cpu_to_be32(conn->vals->header_size);
rsp_hdr->ProtocolId = rcv_hdr->ProtocolId;
rsp_hdr->StructureSize = SMB2_HEADER_STRUCTURE_SIZE;
rsp_hdr->Command = rcv_hdr->Command;
@@ -657,7 +661,7 @@ int setup_async_work(struct ksmbd_work *work, void (*fn)(void **), void **arg)
struct ksmbd_conn *conn = work->conn;
int id;
- rsp_hdr = smb2_get_msg(work->response_buf);
+ rsp_hdr = ksmbd_resp_buf_next(work);
rsp_hdr->Flags |= SMB2_FLAGS_ASYNC_COMMAND;
id = ksmbd_acquire_async_msg_id(&conn->async_ida);
@@ -706,15 +710,24 @@ void release_async_work(struct ksmbd_work *work)
void smb2_send_interim_resp(struct ksmbd_work *work, __le32 status)
{
struct smb2_hdr *rsp_hdr;
+ struct ksmbd_work *in_work = ksmbd_alloc_work_struct();
- rsp_hdr = smb2_get_msg(work->response_buf);
- smb2_set_err_rsp(work);
+ if (allocate_interim_rsp_buf(in_work)) {
+ pr_err("smb_allocate_rsp_buf failed!\n");
+ ksmbd_free_work_struct(in_work);
+ return;
+ }
+
+ in_work->conn = work->conn;
+ memcpy(smb2_get_msg(in_work->response_buf), ksmbd_resp_buf_next(work),
+ __SMB2_HEADER_STRUCTURE_SIZE);
+
+ rsp_hdr = smb2_get_msg(in_work->response_buf);
+ smb2_set_err_rsp(in_work);
rsp_hdr->Status = status;
- work->multiRsp = 1;
- ksmbd_conn_write(work);
- rsp_hdr->Status = 0;
- work->multiRsp = 0;
+ ksmbd_conn_write(in_work);
+ ksmbd_free_work_struct(in_work);
}
static __le32 smb2_get_reparse_tag_special_file(umode_t mode)
@@ -821,9 +834,8 @@ static void build_posix_ctxt(struct smb2_posix_neg_context *pneg_ctxt)
pneg_ctxt->Name[15] = 0x7C;
}
-static void assemble_neg_contexts(struct ksmbd_conn *conn,
- struct smb2_negotiate_rsp *rsp,
- void *smb2_buf_len)
+static unsigned int assemble_neg_contexts(struct ksmbd_conn *conn,
+ struct smb2_negotiate_rsp *rsp)
{
char * const pneg_ctxt = (char *)rsp +
le32_to_cpu(rsp->NegotiateContextOffset);
@@ -834,7 +846,6 @@ static void assemble_neg_contexts(struct ksmbd_conn *conn,
"assemble SMB2_PREAUTH_INTEGRITY_CAPABILITIES context\n");
build_preauth_ctxt((struct smb2_preauth_neg_context *)pneg_ctxt,
conn->preauth_info->Preauth_HashId);
- inc_rfc1001_len(smb2_buf_len, AUTH_GSS_PADDING);
ctxt_size = sizeof(struct smb2_preauth_neg_context);
if (conn->cipher_type) {
@@ -874,7 +885,7 @@ static void assemble_neg_contexts(struct ksmbd_conn *conn,
}
rsp->NegotiateContextCount = cpu_to_le16(neg_ctxt_cnt);
- inc_rfc1001_len(smb2_buf_len, ctxt_size);
+ return ctxt_size + AUTH_GSS_PADDING;
}
static __le32 decode_preauth_ctxt(struct ksmbd_conn *conn,
@@ -1090,7 +1101,7 @@ int smb2_handle_negotiate(struct ksmbd_work *work)
struct smb2_negotiate_req *req = smb2_get_msg(work->request_buf);
struct smb2_negotiate_rsp *rsp = smb2_get_msg(work->response_buf);
int rc = 0;
- unsigned int smb2_buf_len, smb2_neg_size;
+ unsigned int smb2_buf_len, smb2_neg_size, neg_ctxt_len = 0;
__le32 status;
ksmbd_debug(SMB, "Received negotiate request\n");
@@ -1183,7 +1194,7 @@ int smb2_handle_negotiate(struct ksmbd_work *work)
conn->preauth_info->Preauth_HashValue);
rsp->NegotiateContextOffset =
cpu_to_le32(OFFSET_OF_NEG_CONTEXT);
- assemble_neg_contexts(conn, rsp, work->response_buf);
+ neg_ctxt_len = assemble_neg_contexts(conn, rsp);
break;
case SMB302_PROT_ID:
init_smb3_02_server(conn);
@@ -1233,8 +1244,7 @@ int smb2_handle_negotiate(struct ksmbd_work *work)
rsp->SecurityBufferLength = cpu_to_le16(AUTH_GSS_LENGTH);
ksmbd_copy_gss_neg_header((char *)(&rsp->hdr) +
le16_to_cpu(rsp->SecurityBufferOffset));
- inc_rfc1001_len(work->response_buf, sizeof(struct smb2_negotiate_rsp) -
- sizeof(struct smb2_hdr) + AUTH_GSS_LENGTH);
+
rsp->SecurityMode = SMB2_NEGOTIATE_SIGNING_ENABLED_LE;
conn->use_spnego = true;
@@ -1252,9 +1262,15 @@ int smb2_handle_negotiate(struct ksmbd_work *work)
ksmbd_conn_set_need_negotiate(conn);
err_out:
+ if (rc)
+ rsp->hdr.Status = STATUS_INSUFFICIENT_RESOURCES;
+
+ if (!rc)
+ rc = ksmbd_iov_pin_rsp(work, rsp,
+ sizeof(struct smb2_negotiate_rsp) +
+ AUTH_GSS_LENGTH + neg_ctxt_len);
if (rc < 0)
smb2_set_err_rsp(work);
-
return rc;
}
@@ -1454,7 +1470,6 @@ static int ntlm_authenticate(struct ksmbd_work *work,
memcpy((char *)&rsp->hdr.ProtocolId + sz, spnego_blob, spnego_blob_len);
rsp->SecurityBufferLength = cpu_to_le16(spnego_blob_len);
kfree(spnego_blob);
- inc_rfc1001_len(work->response_buf, spnego_blob_len - 1);
}
user = session_user(conn, req);
@@ -1600,7 +1615,6 @@ static int krb5_authenticate(struct ksmbd_work *work,
return -EINVAL;
}
rsp->SecurityBufferLength = cpu_to_le16(out_len);
- inc_rfc1001_len(work->response_buf, out_len - 1);
if ((conn->sign || server_conf.enforced_signing) ||
(req->SecurityMode & SMB2_NEGOTIATE_SIGNING_REQUIRED))
@@ -1672,7 +1686,6 @@ int smb2_sess_setup(struct ksmbd_work *work)
rsp->SessionFlags = 0;
rsp->SecurityBufferOffset = cpu_to_le16(72);
rsp->SecurityBufferLength = 0;
- inc_rfc1001_len(work->response_buf, 9);
ksmbd_conn_lock(conn);
if (!req->hdr.SessionId) {
@@ -1808,13 +1821,6 @@ int smb2_sess_setup(struct ksmbd_work *work)
goto out_err;
rsp->hdr.Status =
STATUS_MORE_PROCESSING_REQUIRED;
- /*
- * Note: here total size -1 is done as an
- * adjustment for 0 size blob
- */
- inc_rfc1001_len(work->response_buf,
- le16_to_cpu(rsp->SecurityBufferLength) - 1);
-
} else if (negblob->MessageType == NtLmAuthenticate) {
rc = ntlm_authenticate(work, req, rsp);
if (rc)
@@ -1899,6 +1905,18 @@ out_err:
ksmbd_conn_set_need_negotiate(conn);
}
}
+ smb2_set_err_rsp(work);
+ } else {
+ unsigned int iov_len;
+
+ if (rsp->SecurityBufferLength)
+ iov_len = offsetof(struct smb2_sess_setup_rsp, Buffer) +
+ le16_to_cpu(rsp->SecurityBufferLength);
+ else
+ iov_len = sizeof(struct smb2_sess_setup_rsp);
+ rc = ksmbd_iov_pin_rsp(work, rsp, iov_len);
+ if (rc)
+ rsp->hdr.Status = STATUS_INSUFFICIENT_RESOURCES;
}
ksmbd_conn_unlock(conn);
@@ -1976,14 +1994,20 @@ int smb2_tree_connect(struct ksmbd_work *work)
if (conn->posix_ext_supported)
status.tree_conn->posix_extensions = true;
+ write_lock(&sess->tree_conns_lock);
+ status.tree_conn->t_state = TREE_CONNECTED;
+ write_unlock(&sess->tree_conns_lock);
rsp->StructureSize = cpu_to_le16(16);
- inc_rfc1001_len(work->response_buf, 16);
out_err1:
rsp->Capabilities = 0;
rsp->Reserved = 0;
/* default manual caching */
rsp->ShareFlags = SMB2_SHAREFLAG_MANUAL_CACHING;
+ rc = ksmbd_iov_pin_rsp(work, rsp, sizeof(struct smb2_tree_connect_rsp));
+ if (rc)
+ status.ret = KSMBD_TREE_CONN_STATUS_NOMEM;
+
if (!IS_ERR(treename))
kfree(treename);
if (!IS_ERR(name))
@@ -2096,26 +2120,56 @@ int smb2_tree_disconnect(struct ksmbd_work *work)
struct smb2_tree_disconnect_req *req;
struct ksmbd_session *sess = work->sess;
struct ksmbd_tree_connect *tcon = work->tcon;
+ int err;
WORK_BUFFERS(work, req, rsp);
- rsp->StructureSize = cpu_to_le16(4);
- inc_rfc1001_len(work->response_buf, 4);
-
ksmbd_debug(SMB, "request\n");
- if (!tcon || test_and_set_bit(TREE_CONN_EXPIRE, &tcon->status)) {
+ if (!tcon) {
ksmbd_debug(SMB, "Invalid tid %d\n", req->hdr.Id.SyncId.TreeId);
rsp->hdr.Status = STATUS_NETWORK_NAME_DELETED;
- smb2_set_err_rsp(work);
- return 0;
+ err = -ENOENT;
+ goto err_out;
}
ksmbd_close_tree_conn_fds(work);
- ksmbd_tree_conn_disconnect(sess, tcon);
+
+ write_lock(&sess->tree_conns_lock);
+ if (tcon->t_state == TREE_DISCONNECTED) {
+ write_unlock(&sess->tree_conns_lock);
+ rsp->hdr.Status = STATUS_NETWORK_NAME_DELETED;
+ err = -ENOENT;
+ goto err_out;
+ }
+
+ WARN_ON_ONCE(atomic_dec_and_test(&tcon->refcount));
+ tcon->t_state = TREE_DISCONNECTED;
+ write_unlock(&sess->tree_conns_lock);
+
+ err = ksmbd_tree_conn_disconnect(sess, tcon);
+ if (err) {
+ rsp->hdr.Status = STATUS_NETWORK_NAME_DELETED;
+ goto err_out;
+ }
+
work->tcon = NULL;
+
+ rsp->StructureSize = cpu_to_le16(4);
+ err = ksmbd_iov_pin_rsp(work, rsp,
+ sizeof(struct smb2_tree_disconnect_rsp));
+ if (err) {
+ rsp->hdr.Status = STATUS_INSUFFICIENT_RESOURCES;
+ goto err_out;
+ }
+
return 0;
+
+err_out:
+ smb2_set_err_rsp(work);
+ return err;
+
}
/**
@@ -2131,17 +2185,23 @@ int smb2_session_logoff(struct ksmbd_work *work)
struct smb2_logoff_rsp *rsp;
struct ksmbd_session *sess;
u64 sess_id;
+ int err;
WORK_BUFFERS(work, req, rsp);
- sess_id = le64_to_cpu(req->hdr.SessionId);
-
- rsp->StructureSize = cpu_to_le16(4);
- inc_rfc1001_len(work->response_buf, 4);
-
ksmbd_debug(SMB, "request\n");
+ ksmbd_conn_lock(conn);
+ if (!ksmbd_conn_good(conn)) {
+ ksmbd_conn_unlock(conn);
+ rsp->hdr.Status = STATUS_NETWORK_NAME_DELETED;
+ smb2_set_err_rsp(work);
+ return -ENOENT;
+ }
+ sess_id = le64_to_cpu(req->hdr.SessionId);
ksmbd_all_conn_set_status(sess_id, KSMBD_SESS_NEED_RECONNECT);
+ ksmbd_conn_unlock(conn);
+
ksmbd_close_session_fds(work);
ksmbd_conn_wait_idle(conn, sess_id);
@@ -2154,7 +2214,7 @@ int smb2_session_logoff(struct ksmbd_work *work)
ksmbd_debug(SMB, "Invalid tid %d\n", req->hdr.Id.SyncId.TreeId);
rsp->hdr.Status = STATUS_NETWORK_NAME_DELETED;
smb2_set_err_rsp(work);
- return 0;
+ return -ENOENT;
}
ksmbd_destroy_file_table(&sess->file_table);
@@ -2163,6 +2223,14 @@ int smb2_session_logoff(struct ksmbd_work *work)
ksmbd_free_user(sess->user);
sess->user = NULL;
ksmbd_all_conn_set_status(sess_id, KSMBD_SESS_NEED_NEGOTIATE);
+
+ rsp->StructureSize = cpu_to_le16(4);
+ err = ksmbd_iov_pin_rsp(work, rsp, sizeof(struct smb2_logoff_rsp));
+ if (err) {
+ rsp->hdr.Status = STATUS_INSUFFICIENT_RESOURCES;
+ smb2_set_err_rsp(work);
+ return err;
+ }
return 0;
}
@@ -2215,7 +2283,10 @@ static noinline int create_smb2_pipe(struct ksmbd_work *work)
rsp->CreateContextsOffset = 0;
rsp->CreateContextsLength = 0;
- inc_rfc1001_len(work->response_buf, 88); /* StructureSize - 1*/
+ err = ksmbd_iov_pin_rsp(work, rsp, offsetof(struct smb2_create_rsp, Buffer));
+ if (err)
+ goto out;
+
kfree(name);
return 0;
@@ -2597,6 +2668,7 @@ int smb2_open(struct ksmbd_work *work)
u64 time;
umode_t posix_mode = 0;
__le32 daccess, maximal_access = 0;
+ int iov_len = 0;
WORK_BUFFERS(work, req, rsp);
@@ -3248,7 +3320,7 @@ int smb2_open(struct ksmbd_work *work)
rsp->CreateContextsOffset = 0;
rsp->CreateContextsLength = 0;
- inc_rfc1001_len(work->response_buf, 88); /* StructureSize - 1*/
+ iov_len = offsetof(struct smb2_create_rsp, Buffer);
/* If lease is request send lease context response */
if (opinfo && opinfo->is_lease) {
@@ -3263,8 +3335,7 @@ int smb2_open(struct ksmbd_work *work)
create_lease_buf(rsp->Buffer, opinfo->o_lease);
le32_add_cpu(&rsp->CreateContextsLength,
conn->vals->create_lease_size);
- inc_rfc1001_len(work->response_buf,
- conn->vals->create_lease_size);
+ iov_len += conn->vals->create_lease_size;
next_ptr = &lease_ccontext->Next;
next_off = conn->vals->create_lease_size;
}
@@ -3284,8 +3355,7 @@ int smb2_open(struct ksmbd_work *work)
le32_to_cpu(maximal_access));
le32_add_cpu(&rsp->CreateContextsLength,
conn->vals->create_mxac_size);
- inc_rfc1001_len(work->response_buf,
- conn->vals->create_mxac_size);
+ iov_len += conn->vals->create_mxac_size;
if (next_ptr)
*next_ptr = cpu_to_le32(next_off);
next_ptr = &mxac_ccontext->Next;
@@ -3303,8 +3373,7 @@ int smb2_open(struct ksmbd_work *work)
stat.ino, tcon->id);
le32_add_cpu(&rsp->CreateContextsLength,
conn->vals->create_disk_id_size);
- inc_rfc1001_len(work->response_buf,
- conn->vals->create_disk_id_size);
+ iov_len += conn->vals->create_disk_id_size;
if (next_ptr)
*next_ptr = cpu_to_le32(next_off);
next_ptr = &disk_id_ccontext->Next;
@@ -3318,8 +3387,7 @@ int smb2_open(struct ksmbd_work *work)
fp);
le32_add_cpu(&rsp->CreateContextsLength,
conn->vals->create_posix_size);
- inc_rfc1001_len(work->response_buf,
- conn->vals->create_posix_size);
+ iov_len += conn->vals->create_posix_size;
if (next_ptr)
*next_ptr = cpu_to_le32(next_off);
}
@@ -3337,7 +3405,10 @@ err_out:
}
ksmbd_revert_fsids(work);
err_out1:
-
+ if (!rc) {
+ ksmbd_update_fstate(&work->sess->file_table, fp, FP_INITED);
+ rc = ksmbd_iov_pin_rsp(work, (void *)rsp, iov_len);
+ }
if (rc) {
if (rc == -EINVAL)
rsp->hdr.Status = STATUS_INVALID_PARAMETER;
@@ -4063,7 +4134,10 @@ int smb2_query_dir(struct ksmbd_work *work)
rsp->OutputBufferOffset = cpu_to_le16(0);
rsp->OutputBufferLength = cpu_to_le32(0);
rsp->Buffer[0] = 0;
- inc_rfc1001_len(work->response_buf, 9);
+ rc = ksmbd_iov_pin_rsp(work, (void *)rsp,
+ sizeof(struct smb2_query_directory_rsp));
+ if (rc)
+ goto err_out;
} else {
no_buf_len:
((struct file_directory_info *)
@@ -4075,7 +4149,11 @@ no_buf_len:
rsp->StructureSize = cpu_to_le16(9);
rsp->OutputBufferOffset = cpu_to_le16(72);
rsp->OutputBufferLength = cpu_to_le32(d_info.data_count);
- inc_rfc1001_len(work->response_buf, 8 + d_info.data_count);
+ rc = ksmbd_iov_pin_rsp(work, (void *)rsp,
+ offsetof(struct smb2_query_directory_rsp, Buffer) +
+ d_info.data_count);
+ if (rc)
+ goto err_out;
}
kfree(srch_ptr);
@@ -4116,27 +4194,18 @@ err_out2:
* @reqOutputBufferLength: max buffer length expected in command response
* @rsp: query info response buffer contains output buffer length
* @rsp_org: base response buffer pointer in case of chained response
- * @infoclass_size: query info class response buffer size
*
* Return: 0 on success, otherwise error
*/
static int buffer_check_err(int reqOutputBufferLength,
struct smb2_query_info_rsp *rsp,
- void *rsp_org, int infoclass_size)
+ void *rsp_org)
{
if (reqOutputBufferLength < le32_to_cpu(rsp->OutputBufferLength)) {
- if (reqOutputBufferLength < infoclass_size) {
- pr_err("Invalid Buffer Size Requested\n");
- rsp->hdr.Status = STATUS_INFO_LENGTH_MISMATCH;
- *(__be32 *)rsp_org = cpu_to_be32(sizeof(struct smb2_hdr));
- return -EINVAL;
- }
-
- ksmbd_debug(SMB, "Buffer Overflow\n");
- rsp->hdr.Status = STATUS_BUFFER_OVERFLOW;
- *(__be32 *)rsp_org = cpu_to_be32(sizeof(struct smb2_hdr) +
- reqOutputBufferLength);
- rsp->OutputBufferLength = cpu_to_le32(reqOutputBufferLength);
+ pr_err("Invalid Buffer Size Requested\n");
+ rsp->hdr.Status = STATUS_INFO_LENGTH_MISMATCH;
+ *(__be32 *)rsp_org = cpu_to_be32(sizeof(struct smb2_hdr));
+ return -EINVAL;
}
return 0;
}
@@ -4155,7 +4224,6 @@ static void get_standard_info_pipe(struct smb2_query_info_rsp *rsp,
sinfo->Directory = 0;
rsp->OutputBufferLength =
cpu_to_le32(sizeof(struct smb2_file_standard_info));
- inc_rfc1001_len(rsp_org, sizeof(struct smb2_file_standard_info));
}
static void get_internal_info_pipe(struct smb2_query_info_rsp *rsp, u64 num,
@@ -4169,7 +4237,6 @@ static void get_internal_info_pipe(struct smb2_query_info_rsp *rsp, u64 num,
file_info->IndexNumber = cpu_to_le64(num | (1ULL << 63));
rsp->OutputBufferLength =
cpu_to_le32(sizeof(struct smb2_file_internal_info));
- inc_rfc1001_len(rsp_org, sizeof(struct smb2_file_internal_info));
}
static int smb2_get_info_file_pipe(struct ksmbd_session *sess,
@@ -4195,14 +4262,12 @@ static int smb2_get_info_file_pipe(struct ksmbd_session *sess,
case FILE_STANDARD_INFORMATION:
get_standard_info_pipe(rsp, rsp_org);
rc = buffer_check_err(le32_to_cpu(req->OutputBufferLength),
- rsp, rsp_org,
- FILE_STANDARD_INFORMATION_SIZE);
+ rsp, rsp_org);
break;
case FILE_INTERNAL_INFORMATION:
get_internal_info_pipe(rsp, id, rsp_org);
rc = buffer_check_err(le32_to_cpu(req->OutputBufferLength),
- rsp, rsp_org,
- FILE_INTERNAL_INFORMATION_SIZE);
+ rsp, rsp_org);
break;
default:
ksmbd_debug(SMB, "smb2_info_file_pipe for %u not supported\n",
@@ -4308,7 +4373,7 @@ static int smb2_get_ea(struct ksmbd_work *work, struct ksmbd_file *fp,
if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN))
name_len -= XATTR_USER_PREFIX_LEN;
- ptr = (char *)(&eainfo->name + name_len + 1);
+ ptr = eainfo->name + name_len + 1;
buf_free_len -= (offsetof(struct smb2_ea_info, name) +
name_len + 1);
/* bailout if xattr can't fit in buf_free_len */
@@ -4370,7 +4435,6 @@ done:
if (rsp_data_cnt == 0)
rsp->hdr.Status = STATUS_NO_EAS_ON_FILE;
rsp->OutputBufferLength = cpu_to_le32(rsp_data_cnt);
- inc_rfc1001_len(rsp_org, rsp_data_cnt);
out:
kvfree(xattr_list);
return rc;
@@ -4385,7 +4449,6 @@ static void get_file_access_info(struct smb2_query_info_rsp *rsp,
file_info->AccessFlags = fp->daccess;
rsp->OutputBufferLength =
cpu_to_le32(sizeof(struct smb2_file_access_info));
- inc_rfc1001_len(rsp_org, sizeof(struct smb2_file_access_info));
}
static int get_file_basic_info(struct smb2_query_info_rsp *rsp,
@@ -4402,8 +4465,8 @@ static int get_file_basic_info(struct smb2_query_info_rsp *rsp,
}
basic_info = (struct smb2_file_basic_info *)rsp->Buffer;
- generic_fillattr(file_mnt_idmap(fp->filp), file_inode(fp->filp),
- &stat);
+ generic_fillattr(file_mnt_idmap(fp->filp), STATX_BASIC_STATS,
+ file_inode(fp->filp), &stat);
basic_info->CreationTime = cpu_to_le64(fp->create_time);
time = ksmbd_UnixTimeToNT(stat.atime);
basic_info->LastAccessTime = cpu_to_le64(time);
@@ -4415,7 +4478,6 @@ static int get_file_basic_info(struct smb2_query_info_rsp *rsp,
basic_info->Pad1 = 0;
rsp->OutputBufferLength =
cpu_to_le32(sizeof(struct smb2_file_basic_info));
- inc_rfc1001_len(rsp_org, sizeof(struct smb2_file_basic_info));
return 0;
}
@@ -4428,7 +4490,7 @@ static void get_file_standard_info(struct smb2_query_info_rsp *rsp,
struct kstat stat;
inode = file_inode(fp->filp);
- generic_fillattr(file_mnt_idmap(fp->filp), inode, &stat);
+ generic_fillattr(file_mnt_idmap(fp->filp), STATX_BASIC_STATS, inode, &stat);
sinfo = (struct smb2_file_standard_info *)rsp->Buffer;
delete_pending = ksmbd_inode_pending_delete(fp);
@@ -4440,8 +4502,6 @@ static void get_file_standard_info(struct smb2_query_info_rsp *rsp,
sinfo->Directory = S_ISDIR(stat.mode) ? 1 : 0;
rsp->OutputBufferLength =
cpu_to_le32(sizeof(struct smb2_file_standard_info));
- inc_rfc1001_len(rsp_org,
- sizeof(struct smb2_file_standard_info));
}
static void get_file_alignment_info(struct smb2_query_info_rsp *rsp,
@@ -4453,8 +4513,6 @@ static void get_file_alignment_info(struct smb2_query_info_rsp *rsp,
file_info->AlignmentRequirement = 0;
rsp->OutputBufferLength =
cpu_to_le32(sizeof(struct smb2_file_alignment_info));
- inc_rfc1001_len(rsp_org,
- sizeof(struct smb2_file_alignment_info));
}
static int get_file_all_info(struct ksmbd_work *work,
@@ -4482,7 +4540,7 @@ static int get_file_all_info(struct ksmbd_work *work,
return PTR_ERR(filename);
inode = file_inode(fp->filp);
- generic_fillattr(file_mnt_idmap(fp->filp), inode, &stat);
+ generic_fillattr(file_mnt_idmap(fp->filp), STATX_BASIC_STATS, inode, &stat);
ksmbd_debug(SMB, "filename = %s\n", filename);
delete_pending = ksmbd_inode_pending_delete(fp);
@@ -4518,7 +4576,6 @@ static int get_file_all_info(struct ksmbd_work *work,
rsp->OutputBufferLength =
cpu_to_le32(sizeof(struct smb2_file_all_info) + conv_len - 1);
kfree(filename);
- inc_rfc1001_len(rsp_org, le32_to_cpu(rsp->OutputBufferLength));
return 0;
}
@@ -4541,7 +4598,6 @@ static void get_file_alternate_info(struct ksmbd_work *work,
file_info->FileNameLength = cpu_to_le32(conv_len);
rsp->OutputBufferLength =
cpu_to_le32(sizeof(struct smb2_file_alt_name_info) + conv_len);
- inc_rfc1001_len(rsp_org, le32_to_cpu(rsp->OutputBufferLength));
}
static void get_file_stream_info(struct ksmbd_work *work,
@@ -4559,8 +4615,8 @@ static void get_file_stream_info(struct ksmbd_work *work,
int buf_free_len;
struct smb2_query_info_req *req = ksmbd_req_buf_next(work);
- generic_fillattr(file_mnt_idmap(fp->filp), file_inode(fp->filp),
- &stat);
+ generic_fillattr(file_mnt_idmap(fp->filp), STATX_BASIC_STATS,
+ file_inode(fp->filp), &stat);
file_info = (struct smb2_file_stream_info *)rsp->Buffer;
buf_free_len =
@@ -4641,7 +4697,6 @@ out:
kvfree(xattr_list);
rsp->OutputBufferLength = cpu_to_le32(nbytes);
- inc_rfc1001_len(rsp_org, nbytes);
}
static void get_file_internal_info(struct smb2_query_info_rsp *rsp,
@@ -4650,13 +4705,12 @@ static void get_file_internal_info(struct smb2_query_info_rsp *rsp,
struct smb2_file_internal_info *file_info;
struct kstat stat;
- generic_fillattr(file_mnt_idmap(fp->filp), file_inode(fp->filp),
- &stat);
+ generic_fillattr(file_mnt_idmap(fp->filp), STATX_BASIC_STATS,
+ file_inode(fp->filp), &stat);
file_info = (struct smb2_file_internal_info *)rsp->Buffer;
file_info->IndexNumber = cpu_to_le64(stat.ino);
rsp->OutputBufferLength =
cpu_to_le32(sizeof(struct smb2_file_internal_info));
- inc_rfc1001_len(rsp_org, sizeof(struct smb2_file_internal_info));
}
static int get_file_network_open_info(struct smb2_query_info_rsp *rsp,
@@ -4676,7 +4730,7 @@ static int get_file_network_open_info(struct smb2_query_info_rsp *rsp,
file_info = (struct smb2_file_ntwrk_info *)rsp->Buffer;
inode = file_inode(fp->filp);
- generic_fillattr(file_mnt_idmap(fp->filp), inode, &stat);
+ generic_fillattr(file_mnt_idmap(fp->filp), STATX_BASIC_STATS, inode, &stat);
file_info->CreationTime = cpu_to_le64(fp->create_time);
time = ksmbd_UnixTimeToNT(stat.atime);
@@ -4692,7 +4746,6 @@ static int get_file_network_open_info(struct smb2_query_info_rsp *rsp,
file_info->Reserved = cpu_to_le32(0);
rsp->OutputBufferLength =
cpu_to_le32(sizeof(struct smb2_file_ntwrk_info));
- inc_rfc1001_len(rsp_org, sizeof(struct smb2_file_ntwrk_info));
return 0;
}
@@ -4704,7 +4757,6 @@ static void get_file_ea_info(struct smb2_query_info_rsp *rsp, void *rsp_org)
file_info->EASize = 0;
rsp->OutputBufferLength =
cpu_to_le32(sizeof(struct smb2_file_ea_info));
- inc_rfc1001_len(rsp_org, sizeof(struct smb2_file_ea_info));
}
static void get_file_position_info(struct smb2_query_info_rsp *rsp,
@@ -4716,7 +4768,6 @@ static void get_file_position_info(struct smb2_query_info_rsp *rsp,
file_info->CurrentByteOffset = cpu_to_le64(fp->filp->f_pos);
rsp->OutputBufferLength =
cpu_to_le32(sizeof(struct smb2_file_pos_info));
- inc_rfc1001_len(rsp_org, sizeof(struct smb2_file_pos_info));
}
static void get_file_mode_info(struct smb2_query_info_rsp *rsp,
@@ -4728,7 +4779,6 @@ static void get_file_mode_info(struct smb2_query_info_rsp *rsp,
file_info->Mode = fp->coption & FILE_MODE_INFO_MASK;
rsp->OutputBufferLength =
cpu_to_le32(sizeof(struct smb2_file_mode_info));
- inc_rfc1001_len(rsp_org, sizeof(struct smb2_file_mode_info));
}
static void get_file_compression_info(struct smb2_query_info_rsp *rsp,
@@ -4737,8 +4787,8 @@ static void get_file_compression_info(struct smb2_query_info_rsp *rsp,
struct smb2_file_comp_info *file_info;
struct kstat stat;
- generic_fillattr(file_mnt_idmap(fp->filp), file_inode(fp->filp),
- &stat);
+ generic_fillattr(file_mnt_idmap(fp->filp), STATX_BASIC_STATS,
+ file_inode(fp->filp), &stat);
file_info = (struct smb2_file_comp_info *)rsp->Buffer;
file_info->CompressedFileSize = cpu_to_le64(stat.blocks << 9);
@@ -4750,7 +4800,6 @@ static void get_file_compression_info(struct smb2_query_info_rsp *rsp,
rsp->OutputBufferLength =
cpu_to_le32(sizeof(struct smb2_file_comp_info));
- inc_rfc1001_len(rsp_org, sizeof(struct smb2_file_comp_info));
}
static int get_file_attribute_tag_info(struct smb2_query_info_rsp *rsp,
@@ -4769,11 +4818,10 @@ static int get_file_attribute_tag_info(struct smb2_query_info_rsp *rsp,
file_info->ReparseTag = 0;
rsp->OutputBufferLength =
cpu_to_le32(sizeof(struct smb2_file_attr_tag_info));
- inc_rfc1001_len(rsp_org, sizeof(struct smb2_file_attr_tag_info));
return 0;
}
-static int find_file_posix_info(struct smb2_query_info_rsp *rsp,
+static void find_file_posix_info(struct smb2_query_info_rsp *rsp,
struct ksmbd_file *fp, void *rsp_org)
{
struct smb311_posix_qinfo *file_info;
@@ -4790,7 +4838,7 @@ static int find_file_posix_info(struct smb2_query_info_rsp *rsp,
file_info->LastAccessTime = cpu_to_le64(time);
time = ksmbd_UnixTimeToNT(inode->i_mtime);
file_info->LastWriteTime = cpu_to_le64(time);
- time = ksmbd_UnixTimeToNT(inode->i_ctime);
+ time = ksmbd_UnixTimeToNT(inode_get_ctime(inode));
file_info->ChangeTime = cpu_to_le64(time);
file_info->DosAttributes = fp->f_ci->m_fattr;
file_info->Inode = cpu_to_le64(inode->i_ino);
@@ -4811,8 +4859,6 @@ static int find_file_posix_info(struct smb2_query_info_rsp *rsp,
SIDUNIX_GROUP, (struct smb_sid *)&file_info->Sids[16]);
rsp->OutputBufferLength = cpu_to_le32(out_buf_len);
- inc_rfc1001_len(rsp_org, out_buf_len);
- return out_buf_len;
}
static int smb2_get_info_file(struct ksmbd_work *work,
@@ -4822,7 +4868,6 @@ static int smb2_get_info_file(struct ksmbd_work *work,
struct ksmbd_file *fp;
int fileinfoclass = 0;
int rc = 0;
- int file_infoclass_size;
unsigned int id = KSMBD_NO_FID, pid = KSMBD_NO_FID;
if (test_share_config_flag(work->tcon->share_conf,
@@ -4855,85 +4900,69 @@ static int smb2_get_info_file(struct ksmbd_work *work,
switch (fileinfoclass) {
case FILE_ACCESS_INFORMATION:
get_file_access_info(rsp, fp, work->response_buf);
- file_infoclass_size = FILE_ACCESS_INFORMATION_SIZE;
break;
case FILE_BASIC_INFORMATION:
rc = get_file_basic_info(rsp, fp, work->response_buf);
- file_infoclass_size = FILE_BASIC_INFORMATION_SIZE;
break;
case FILE_STANDARD_INFORMATION:
get_file_standard_info(rsp, fp, work->response_buf);
- file_infoclass_size = FILE_STANDARD_INFORMATION_SIZE;
break;
case FILE_ALIGNMENT_INFORMATION:
get_file_alignment_info(rsp, work->response_buf);
- file_infoclass_size = FILE_ALIGNMENT_INFORMATION_SIZE;
break;
case FILE_ALL_INFORMATION:
rc = get_file_all_info(work, rsp, fp, work->response_buf);
- file_infoclass_size = FILE_ALL_INFORMATION_SIZE;
break;
case FILE_ALTERNATE_NAME_INFORMATION:
get_file_alternate_info(work, rsp, fp, work->response_buf);
- file_infoclass_size = FILE_ALTERNATE_NAME_INFORMATION_SIZE;
break;
case FILE_STREAM_INFORMATION:
get_file_stream_info(work, rsp, fp, work->response_buf);
- file_infoclass_size = FILE_STREAM_INFORMATION_SIZE;
break;
case FILE_INTERNAL_INFORMATION:
get_file_internal_info(rsp, fp, work->response_buf);
- file_infoclass_size = FILE_INTERNAL_INFORMATION_SIZE;
break;
case FILE_NETWORK_OPEN_INFORMATION:
rc = get_file_network_open_info(rsp, fp, work->response_buf);
- file_infoclass_size = FILE_NETWORK_OPEN_INFORMATION_SIZE;
break;
case FILE_EA_INFORMATION:
get_file_ea_info(rsp, work->response_buf);
- file_infoclass_size = FILE_EA_INFORMATION_SIZE;
break;
case FILE_FULL_EA_INFORMATION:
rc = smb2_get_ea(work, fp, req, rsp, work->response_buf);
- file_infoclass_size = FILE_FULL_EA_INFORMATION_SIZE;
break;
case FILE_POSITION_INFORMATION:
get_file_position_info(rsp, fp, work->response_buf);
- file_infoclass_size = FILE_POSITION_INFORMATION_SIZE;
break;
case FILE_MODE_INFORMATION:
get_file_mode_info(rsp, fp, work->response_buf);
- file_infoclass_size = FILE_MODE_INFORMATION_SIZE;
break;
case FILE_COMPRESSION_INFORMATION:
get_file_compression_info(rsp, fp, work->response_buf);
- file_infoclass_size = FILE_COMPRESSION_INFORMATION_SIZE;
break;
case FILE_ATTRIBUTE_TAG_INFORMATION:
rc = get_file_attribute_tag_info(rsp, fp, work->response_buf);
- file_infoclass_size = FILE_ATTRIBUTE_TAG_INFORMATION_SIZE;
break;
case SMB_FIND_FILE_POSIX_INFO:
if (!work->tcon->posix_extensions) {
pr_err("client doesn't negotiate with SMB3.1.1 POSIX Extensions\n");
rc = -EOPNOTSUPP;
} else {
- file_infoclass_size = find_file_posix_info(rsp, fp,
- work->response_buf);
+ find_file_posix_info(rsp, fp, work->response_buf);
}
break;
default:
@@ -4943,8 +4972,7 @@ static int smb2_get_info_file(struct ksmbd_work *work,
}
if (!rc)
rc = buffer_check_err(le32_to_cpu(req->OutputBufferLength),
- rsp, work->response_buf,
- file_infoclass_size);
+ rsp, work->response_buf);
ksmbd_fd_put(work, fp);
return rc;
}
@@ -4960,7 +4988,6 @@ static int smb2_get_info_filesystem(struct ksmbd_work *work,
struct kstatfs stfs;
struct path path;
int rc = 0, len;
- int fs_infoclass_size = 0;
if (!share->path)
return -EIO;
@@ -4990,8 +5017,6 @@ static int smb2_get_info_filesystem(struct ksmbd_work *work,
info->DeviceType = cpu_to_le32(stfs.f_type);
info->DeviceCharacteristics = cpu_to_le32(0x00000020);
rsp->OutputBufferLength = cpu_to_le32(8);
- inc_rfc1001_len(work->response_buf, 8);
- fs_infoclass_size = FS_DEVICE_INFORMATION_SIZE;
break;
}
case FS_ATTRIBUTE_INFORMATION:
@@ -5020,8 +5045,6 @@ static int smb2_get_info_filesystem(struct ksmbd_work *work,
info->FileSystemNameLen = cpu_to_le32(len);
sz = sizeof(struct filesystem_attribute_info) - 2 + len;
rsp->OutputBufferLength = cpu_to_le32(sz);
- inc_rfc1001_len(work->response_buf, sz);
- fs_infoclass_size = FS_ATTRIBUTE_INFORMATION_SIZE;
break;
}
case FS_VOLUME_INFORMATION:
@@ -5048,8 +5071,6 @@ static int smb2_get_info_filesystem(struct ksmbd_work *work,
info->Reserved = 0;
sz = sizeof(struct filesystem_vol_info) - 2 + len;
rsp->OutputBufferLength = cpu_to_le32(sz);
- inc_rfc1001_len(work->response_buf, sz);
- fs_infoclass_size = FS_VOLUME_INFORMATION_SIZE;
break;
}
case FS_SIZE_INFORMATION:
@@ -5062,8 +5083,6 @@ static int smb2_get_info_filesystem(struct ksmbd_work *work,
info->SectorsPerAllocationUnit = cpu_to_le32(1);
info->BytesPerSector = cpu_to_le32(stfs.f_bsize);
rsp->OutputBufferLength = cpu_to_le32(24);
- inc_rfc1001_len(work->response_buf, 24);
- fs_infoclass_size = FS_SIZE_INFORMATION_SIZE;
break;
}
case FS_FULL_SIZE_INFORMATION:
@@ -5079,8 +5098,6 @@ static int smb2_get_info_filesystem(struct ksmbd_work *work,
info->SectorsPerAllocationUnit = cpu_to_le32(1);
info->BytesPerSector = cpu_to_le32(stfs.f_bsize);
rsp->OutputBufferLength = cpu_to_le32(32);
- inc_rfc1001_len(work->response_buf, 32);
- fs_infoclass_size = FS_FULL_SIZE_INFORMATION_SIZE;
break;
}
case FS_OBJECT_ID_INFORMATION:
@@ -5100,8 +5117,6 @@ static int smb2_get_info_filesystem(struct ksmbd_work *work,
info->extended_info.rel_date = 0;
memcpy(info->extended_info.version_string, "1.1.0", strlen("1.1.0"));
rsp->OutputBufferLength = cpu_to_le32(64);
- inc_rfc1001_len(work->response_buf, 64);
- fs_infoclass_size = FS_OBJECT_ID_INFORMATION_SIZE;
break;
}
case FS_SECTOR_SIZE_INFORMATION:
@@ -5123,8 +5138,6 @@ static int smb2_get_info_filesystem(struct ksmbd_work *work,
info->ByteOffsetForSectorAlignment = 0;
info->ByteOffsetForPartitionAlignment = 0;
rsp->OutputBufferLength = cpu_to_le32(28);
- inc_rfc1001_len(work->response_buf, 28);
- fs_infoclass_size = FS_SECTOR_SIZE_INFORMATION_SIZE;
break;
}
case FS_CONTROL_INFORMATION:
@@ -5145,8 +5158,6 @@ static int smb2_get_info_filesystem(struct ksmbd_work *work,
info->DefaultQuotaLimit = cpu_to_le64(SMB2_NO_FID);
info->Padding = 0;
rsp->OutputBufferLength = cpu_to_le32(48);
- inc_rfc1001_len(work->response_buf, 48);
- fs_infoclass_size = FS_CONTROL_INFORMATION_SIZE;
break;
}
case FS_POSIX_INFORMATION:
@@ -5166,8 +5177,6 @@ static int smb2_get_info_filesystem(struct ksmbd_work *work,
info->TotalFileNodes = cpu_to_le64(stfs.f_files);
info->FreeFileNodes = cpu_to_le64(stfs.f_ffree);
rsp->OutputBufferLength = cpu_to_le32(56);
- inc_rfc1001_len(work->response_buf, 56);
- fs_infoclass_size = FS_POSIX_INFORMATION_SIZE;
}
break;
}
@@ -5176,8 +5185,7 @@ static int smb2_get_info_filesystem(struct ksmbd_work *work,
return -EOPNOTSUPP;
}
rc = buffer_check_err(le32_to_cpu(req->OutputBufferLength),
- rsp, work->response_buf,
- fs_infoclass_size);
+ rsp, work->response_buf);
path_put(&path);
return rc;
}
@@ -5211,7 +5219,6 @@ static int smb2_get_info_sec(struct ksmbd_work *work,
secdesclen = sizeof(struct smb_ntsd);
rsp->OutputBufferLength = cpu_to_le32(secdesclen);
- inc_rfc1001_len(work->response_buf, secdesclen);
return 0;
}
@@ -5256,7 +5263,6 @@ static int smb2_get_info_sec(struct ksmbd_work *work,
return rc;
rsp->OutputBufferLength = cpu_to_le32(secdesclen);
- inc_rfc1001_len(work->response_buf, secdesclen);
return 0;
}
@@ -5295,6 +5301,14 @@ int smb2_query_info(struct ksmbd_work *work)
rc = -EOPNOTSUPP;
}
+ if (!rc) {
+ rsp->StructureSize = cpu_to_le16(9);
+ rsp->OutputBufferOffset = cpu_to_le16(72);
+ rc = ksmbd_iov_pin_rsp(work, (void *)rsp,
+ offsetof(struct smb2_query_info_rsp, Buffer) +
+ le32_to_cpu(rsp->OutputBufferLength));
+ }
+
if (rc < 0) {
if (rc == -EACCES)
rsp->hdr.Status = STATUS_ACCESS_DENIED;
@@ -5302,6 +5316,8 @@ int smb2_query_info(struct ksmbd_work *work)
rsp->hdr.Status = STATUS_FILE_CLOSED;
else if (rc == -EIO)
rsp->hdr.Status = STATUS_UNEXPECTED_IO_ERROR;
+ else if (rc == -ENOMEM)
+ rsp->hdr.Status = STATUS_INSUFFICIENT_RESOURCES;
else if (rc == -EOPNOTSUPP || rsp->hdr.Status == 0)
rsp->hdr.Status = STATUS_INVALID_INFO_CLASS;
smb2_set_err_rsp(work);
@@ -5310,9 +5326,6 @@ int smb2_query_info(struct ksmbd_work *work)
rc);
return rc;
}
- rsp->StructureSize = cpu_to_le16(9);
- rsp->OutputBufferOffset = cpu_to_le16(72);
- inc_rfc1001_len(work->response_buf, 8);
return 0;
}
@@ -5343,8 +5356,9 @@ static noinline int smb2_close_pipe(struct ksmbd_work *work)
rsp->AllocationSize = 0;
rsp->EndOfFile = 0;
rsp->Attributes = 0;
- inc_rfc1001_len(work->response_buf, 60);
- return 0;
+
+ return ksmbd_iov_pin_rsp(work, (void *)rsp,
+ sizeof(struct smb2_close_rsp));
}
/**
@@ -5433,7 +5447,7 @@ int smb2_close(struct ksmbd_work *work)
rsp->LastAccessTime = cpu_to_le64(time);
time = ksmbd_UnixTimeToNT(inode->i_mtime);
rsp->LastWriteTime = cpu_to_le64(time);
- time = ksmbd_UnixTimeToNT(inode->i_ctime);
+ time = ksmbd_UnixTimeToNT(inode_get_ctime(inode));
rsp->ChangeTime = cpu_to_le64(time);
ksmbd_fd_put(work, fp);
} else {
@@ -5449,15 +5463,17 @@ int smb2_close(struct ksmbd_work *work)
err = ksmbd_close_fd(work, volatile_id);
out:
+ if (!err)
+ err = ksmbd_iov_pin_rsp(work, (void *)rsp,
+ sizeof(struct smb2_close_rsp));
+
if (err) {
if (rsp->hdr.Status == 0)
rsp->hdr.Status = STATUS_FILE_CLOSED;
smb2_set_err_rsp(work);
- } else {
- inc_rfc1001_len(work->response_buf, 60);
}
- return 0;
+ return err;
}
/**
@@ -5475,8 +5491,7 @@ int smb2_echo(struct ksmbd_work *work)
rsp->StructureSize = cpu_to_le16(4);
rsp->Reserved = 0;
- inc_rfc1001_len(work->response_buf, 4);
- return 0;
+ return ksmbd_iov_pin_rsp(work, rsp, sizeof(struct smb2_echo_rsp));
}
static int smb2_rename(struct ksmbd_work *work,
@@ -5656,7 +5671,7 @@ static int set_file_basic_info(struct ksmbd_file *fp,
if (file_info->ChangeTime)
attrs.ia_ctime = ksmbd_NTtimeToUnix(file_info->ChangeTime);
else
- attrs.ia_ctime = inode->i_ctime;
+ attrs.ia_ctime = inode_get_ctime(inode);
if (file_info->LastWriteTime) {
attrs.ia_mtime = ksmbd_NTtimeToUnix(file_info->LastWriteTime);
@@ -5701,7 +5716,7 @@ static int set_file_basic_info(struct ksmbd_file *fp,
return -EACCES;
inode_lock(inode);
- inode->i_ctime = attrs.ia_ctime;
+ inode_set_ctime_to_ts(inode, attrs.ia_ctime);
attrs.ia_valid &= ~ATTR_CTIME;
rc = notify_change(idmap, dentry, &attrs, NULL);
inode_unlock(inode);
@@ -6068,7 +6083,10 @@ int smb2_set_info(struct ksmbd_work *work)
goto err_out;
rsp->StructureSize = cpu_to_le16(2);
- inc_rfc1001_len(work->response_buf, 2);
+ rc = ksmbd_iov_pin_rsp(work, (void *)rsp,
+ sizeof(struct smb2_set_info_rsp));
+ if (rc)
+ goto err_out;
ksmbd_fd_put(work, fp);
return 0;
@@ -6115,28 +6133,36 @@ static noinline int smb2_read_pipe(struct ksmbd_work *work)
id = req->VolatileFileId;
- inc_rfc1001_len(work->response_buf, 16);
rpc_resp = ksmbd_rpc_read(work->sess, id);
if (rpc_resp) {
+ void *aux_payload_buf;
+
if (rpc_resp->flags != KSMBD_RPC_OK) {
err = -EINVAL;
goto out;
}
- work->aux_payload_buf =
+ aux_payload_buf =
kvmalloc(rpc_resp->payload_sz, GFP_KERNEL);
- if (!work->aux_payload_buf) {
+ if (!aux_payload_buf) {
err = -ENOMEM;
goto out;
}
- memcpy(work->aux_payload_buf, rpc_resp->payload,
- rpc_resp->payload_sz);
+ memcpy(aux_payload_buf, rpc_resp->payload, rpc_resp->payload_sz);
nbytes = rpc_resp->payload_sz;
- work->resp_hdr_sz = get_rfc1002_len(work->response_buf) + 4;
- work->aux_payload_sz = nbytes;
+ err = ksmbd_iov_pin_rsp_read(work, (void *)rsp,
+ offsetof(struct smb2_read_rsp, Buffer),
+ aux_payload_buf, nbytes);
+ if (err)
+ goto out;
kvfree(rpc_resp);
+ } else {
+ err = ksmbd_iov_pin_rsp(work, (void *)rsp,
+ offsetof(struct smb2_read_rsp, Buffer));
+ if (err)
+ goto out;
}
rsp->StructureSize = cpu_to_le16(17);
@@ -6145,7 +6171,6 @@ static noinline int smb2_read_pipe(struct ksmbd_work *work)
rsp->DataLength = cpu_to_le32(nbytes);
rsp->DataRemaining = 0;
rsp->Flags = 0;
- inc_rfc1001_len(work->response_buf, nbytes);
return 0;
out:
@@ -6219,13 +6244,8 @@ int smb2_read(struct ksmbd_work *work)
int err = 0;
bool is_rdma_channel = false;
unsigned int max_read_size = conn->vals->max_read_size;
-
- WORK_BUFFERS(work, req, rsp);
- if (work->next_smb2_rcv_hdr_off) {
- work->send_no_response = 1;
- err = -EOPNOTSUPP;
- goto out;
- }
+ unsigned int id = KSMBD_NO_FID, pid = KSMBD_NO_FID;
+ void *aux_payload_buf;
if (test_share_config_flag(work->tcon->share_conf,
KSMBD_SHARE_FLAG_PIPE)) {
@@ -6233,6 +6253,25 @@ int smb2_read(struct ksmbd_work *work)
return smb2_read_pipe(work);
}
+ if (work->next_smb2_rcv_hdr_off) {
+ req = ksmbd_req_buf_next(work);
+ rsp = ksmbd_resp_buf_next(work);
+ if (!has_file_id(req->VolatileFileId)) {
+ ksmbd_debug(SMB, "Compound request set FID = %llu\n",
+ work->compound_fid);
+ id = work->compound_fid;
+ pid = work->compound_pfid;
+ }
+ } else {
+ req = smb2_get_msg(work->request_buf);
+ rsp = smb2_get_msg(work->response_buf);
+ }
+
+ if (!has_file_id(id)) {
+ id = req->VolatileFileId;
+ pid = req->PersistentFileId;
+ }
+
if (req->Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE ||
req->Channel == SMB2_CHANNEL_RDMA_V1) {
is_rdma_channel = true;
@@ -6255,7 +6294,7 @@ int smb2_read(struct ksmbd_work *work)
goto out;
}
- fp = ksmbd_lookup_fd_slow(work, req->VolatileFileId, req->PersistentFileId);
+ fp = ksmbd_lookup_fd_slow(work, id, pid);
if (!fp) {
err = -ENOENT;
goto out;
@@ -6281,21 +6320,20 @@ int smb2_read(struct ksmbd_work *work)
ksmbd_debug(SMB, "filename %pD, offset %lld, len %zu\n",
fp->filp, offset, length);
- work->aux_payload_buf = kvzalloc(length, GFP_KERNEL);
- if (!work->aux_payload_buf) {
+ aux_payload_buf = kvzalloc(length, GFP_KERNEL);
+ if (!aux_payload_buf) {
err = -ENOMEM;
goto out;
}
- nbytes = ksmbd_vfs_read(work, fp, length, &offset);
+ nbytes = ksmbd_vfs_read(work, fp, length, &offset, aux_payload_buf);
if (nbytes < 0) {
err = nbytes;
goto out;
}
if ((nbytes == 0 && length != 0) || nbytes < mincount) {
- kvfree(work->aux_payload_buf);
- work->aux_payload_buf = NULL;
+ kvfree(aux_payload_buf);
rsp->hdr.Status = STATUS_END_OF_FILE;
smb2_set_err_rsp(work);
ksmbd_fd_put(work, fp);
@@ -6308,11 +6346,10 @@ int smb2_read(struct ksmbd_work *work)
if (is_rdma_channel == true) {
/* write data to the client using rdma channel */
remain_bytes = smb2_read_rdma_channel(work, req,
- work->aux_payload_buf,
+ aux_payload_buf,
nbytes);
- kvfree(work->aux_payload_buf);
- work->aux_payload_buf = NULL;
-
+ kvfree(aux_payload_buf);
+ aux_payload_buf = NULL;
nbytes = 0;
if (remain_bytes < 0) {
err = (int)remain_bytes;
@@ -6326,10 +6363,11 @@ int smb2_read(struct ksmbd_work *work)
rsp->DataLength = cpu_to_le32(nbytes);
rsp->DataRemaining = cpu_to_le32(remain_bytes);
rsp->Flags = 0;
- inc_rfc1001_len(work->response_buf, 16);
- work->resp_hdr_sz = get_rfc1002_len(work->response_buf) + 4;
- work->aux_payload_sz = nbytes;
- inc_rfc1001_len(work->response_buf, nbytes);
+ err = ksmbd_iov_pin_rsp_read(work, (void *)rsp,
+ offsetof(struct smb2_read_rsp, Buffer),
+ aux_payload_buf, nbytes);
+ if (err)
+ goto out;
ksmbd_fd_put(work, fp);
return 0;
@@ -6412,8 +6450,8 @@ static noinline int smb2_write_pipe(struct ksmbd_work *work)
rsp->DataLength = cpu_to_le32(length);
rsp->DataRemaining = 0;
rsp->Reserved2 = 0;
- inc_rfc1001_len(work->response_buf, 16);
- return 0;
+ err = ksmbd_iov_pin_rsp(work, (void *)rsp,
+ offsetof(struct smb2_write_rsp, Buffer));
out:
if (err) {
rsp->hdr.Status = STATUS_INVALID_HANDLE;
@@ -6569,7 +6607,9 @@ int smb2_write(struct ksmbd_work *work)
rsp->DataLength = cpu_to_le32(nbytes);
rsp->DataRemaining = 0;
rsp->Reserved2 = 0;
- inc_rfc1001_len(work->response_buf, 16);
+ err = ksmbd_iov_pin_rsp(work, rsp, offsetof(struct smb2_write_rsp, Buffer));
+ if (err)
+ goto out;
ksmbd_fd_put(work, fp);
return 0;
@@ -6616,15 +6656,11 @@ int smb2_flush(struct ksmbd_work *work)
rsp->StructureSize = cpu_to_le16(4);
rsp->Reserved = 0;
- inc_rfc1001_len(work->response_buf, 4);
- return 0;
+ return ksmbd_iov_pin_rsp(work, rsp, sizeof(struct smb2_flush_rsp));
out:
- if (err) {
- rsp->hdr.Status = STATUS_INVALID_HANDLE;
- smb2_set_err_rsp(work);
- }
-
+ rsp->hdr.Status = STATUS_INVALID_HANDLE;
+ smb2_set_err_rsp(work);
return err;
}
@@ -7029,10 +7065,6 @@ skip:
ksmbd_debug(SMB,
"would have to wait for getting lock\n");
- spin_lock(&work->conn->llist_lock);
- list_add_tail(&smb_lock->clist,
- &work->conn->lock_list);
- spin_unlock(&work->conn->llist_lock);
list_add(&smb_lock->llist, &rollback_list);
argv = kmalloc(sizeof(void *), GFP_KERNEL);
@@ -7063,9 +7095,6 @@ skip:
if (work->state != KSMBD_WORK_ACTIVE) {
list_del(&smb_lock->llist);
- spin_lock(&work->conn->llist_lock);
- list_del(&smb_lock->clist);
- spin_unlock(&work->conn->llist_lock);
locks_free_lock(flock);
if (work->state == KSMBD_WORK_CANCELLED) {
@@ -7078,8 +7107,6 @@ skip:
goto out;
}
- init_smb2_rsp_hdr(work);
- smb2_set_err_rsp(work);
rsp->hdr.Status =
STATUS_RANGE_NOT_LOCKED;
kfree(smb_lock);
@@ -7087,19 +7114,16 @@ skip:
}
list_del(&smb_lock->llist);
- spin_lock(&work->conn->llist_lock);
- list_del(&smb_lock->clist);
- spin_unlock(&work->conn->llist_lock);
release_async_work(work);
goto retry;
} else if (!rc) {
+ list_add(&smb_lock->llist, &rollback_list);
spin_lock(&work->conn->llist_lock);
list_add_tail(&smb_lock->clist,
&work->conn->lock_list);
list_add_tail(&smb_lock->flist,
&fp->lock_list);
spin_unlock(&work->conn->llist_lock);
- list_add(&smb_lock->llist, &rollback_list);
ksmbd_debug(SMB, "successful in taking lock\n");
} else {
goto out;
@@ -7114,7 +7138,10 @@ skip:
ksmbd_debug(SMB, "successful in taking lock\n");
rsp->hdr.Status = STATUS_SUCCESS;
rsp->Reserved = 0;
- inc_rfc1001_len(work->response_buf, 4);
+ err = ksmbd_iov_pin_rsp(work, rsp, sizeof(struct smb2_lock_rsp));
+ if (err)
+ goto out;
+
ksmbd_fd_put(work, fp);
return 0;
@@ -7910,9 +7937,9 @@ dup_ext_out:
rsp->Reserved = cpu_to_le16(0);
rsp->Flags = cpu_to_le32(0);
rsp->Reserved2 = cpu_to_le32(0);
- inc_rfc1001_len(work->response_buf, 48 + nbytes);
-
- return 0;
+ ret = ksmbd_iov_pin_rsp(work, rsp, sizeof(struct smb2_ioctl_rsp) + nbytes);
+ if (!ret)
+ return ret;
out:
if (ret == -EACCES)
@@ -8036,10 +8063,10 @@ static void smb20_oplock_break_ack(struct ksmbd_work *work)
goto err_out;
}
- opinfo_put(opinfo);
- ksmbd_fd_put(work, fp);
opinfo->op_state = OPLOCK_STATE_NONE;
wake_up_interruptible_all(&opinfo->oplock_q);
+ opinfo_put(opinfo);
+ ksmbd_fd_put(work, fp);
rsp->StructureSize = cpu_to_le16(24);
rsp->OplockLevel = rsp_oplevel;
@@ -8047,8 +8074,9 @@ static void smb20_oplock_break_ack(struct ksmbd_work *work)
rsp->Reserved2 = 0;
rsp->VolatileFid = volatile_id;
rsp->PersistentFid = persistent_id;
- inc_rfc1001_len(work->response_buf, 24);
- return;
+ ret = ksmbd_iov_pin_rsp(work, rsp, sizeof(struct smb2_oplock_break));
+ if (!ret)
+ return;
err_out:
opinfo->op_state = OPLOCK_STATE_NONE;
@@ -8198,8 +8226,9 @@ static void smb21_lease_break_ack(struct ksmbd_work *work)
memcpy(rsp->LeaseKey, req->LeaseKey, 16);
rsp->LeaseState = lease_state;
rsp->LeaseDuration = 0;
- inc_rfc1001_len(work->response_buf, 36);
- return;
+ ret = ksmbd_iov_pin_rsp(work, rsp, sizeof(struct smb2_lease_ack));
+ if (!ret)
+ return;
err_out:
opinfo->op_state = OPLOCK_STATE_NONE;
@@ -8337,43 +8366,19 @@ int smb2_check_sign_req(struct ksmbd_work *work)
void smb2_set_sign_rsp(struct ksmbd_work *work)
{
struct smb2_hdr *hdr;
- struct smb2_hdr *req_hdr;
char signature[SMB2_HMACSHA256_SIZE];
- struct kvec iov[2];
- size_t len;
+ struct kvec *iov;
int n_vec = 1;
- hdr = smb2_get_msg(work->response_buf);
- if (work->next_smb2_rsp_hdr_off)
- hdr = ksmbd_resp_buf_next(work);
-
- req_hdr = ksmbd_req_buf_next(work);
-
- if (!work->next_smb2_rsp_hdr_off) {
- len = get_rfc1002_len(work->response_buf);
- if (req_hdr->NextCommand)
- len = ALIGN(len, 8);
- } else {
- len = get_rfc1002_len(work->response_buf) -
- work->next_smb2_rsp_hdr_off;
- len = ALIGN(len, 8);
- }
-
- if (req_hdr->NextCommand)
- hdr->NextCommand = cpu_to_le32(len);
-
+ hdr = ksmbd_resp_buf_curr(work);
hdr->Flags |= SMB2_FLAGS_SIGNED;
memset(hdr->Signature, 0, SMB2_SIGNATURE_SIZE);
- iov[0].iov_base = (char *)&hdr->ProtocolId;
- iov[0].iov_len = len;
-
- if (work->aux_payload_sz) {
- iov[0].iov_len -= work->aux_payload_sz;
-
- iov[1].iov_base = work->aux_payload_buf;
- iov[1].iov_len = work->aux_payload_sz;
+ if (hdr->Command == SMB2_READ) {
+ iov = &work->iov[work->iov_idx - 1];
n_vec++;
+ } else {
+ iov = &work->iov[work->iov_idx];
}
if (!ksmbd_sign_smb2_pdu(work->conn, work->sess->sess_key, iov, n_vec,
@@ -8449,29 +8454,14 @@ int smb3_check_sign_req(struct ksmbd_work *work)
void smb3_set_sign_rsp(struct ksmbd_work *work)
{
struct ksmbd_conn *conn = work->conn;
- struct smb2_hdr *req_hdr, *hdr;
+ struct smb2_hdr *hdr;
struct channel *chann;
char signature[SMB2_CMACAES_SIZE];
- struct kvec iov[2];
+ struct kvec *iov;
int n_vec = 1;
- size_t len;
char *signing_key;
- hdr = smb2_get_msg(work->response_buf);
- if (work->next_smb2_rsp_hdr_off)
- hdr = ksmbd_resp_buf_next(work);
-
- req_hdr = ksmbd_req_buf_next(work);
-
- if (!work->next_smb2_rsp_hdr_off) {
- len = get_rfc1002_len(work->response_buf);
- if (req_hdr->NextCommand)
- len = ALIGN(len, 8);
- } else {
- len = get_rfc1002_len(work->response_buf) -
- work->next_smb2_rsp_hdr_off;
- len = ALIGN(len, 8);
- }
+ hdr = ksmbd_resp_buf_curr(work);
if (conn->binding == false &&
le16_to_cpu(hdr->Command) == SMB2_SESSION_SETUP_HE) {
@@ -8487,21 +8477,18 @@ void smb3_set_sign_rsp(struct ksmbd_work *work)
if (!signing_key)
return;
- if (req_hdr->NextCommand)
- hdr->NextCommand = cpu_to_le32(len);
-
hdr->Flags |= SMB2_FLAGS_SIGNED;
memset(hdr->Signature, 0, SMB2_SIGNATURE_SIZE);
- iov[0].iov_base = (char *)&hdr->ProtocolId;
- iov[0].iov_len = len;
- if (work->aux_payload_sz) {
- iov[0].iov_len -= work->aux_payload_sz;
- iov[1].iov_base = work->aux_payload_buf;
- iov[1].iov_len = work->aux_payload_sz;
+
+ if (hdr->Command == SMB2_READ) {
+ iov = &work->iov[work->iov_idx - 1];
n_vec++;
+ } else {
+ iov = &work->iov[work->iov_idx];
}
- if (!ksmbd_sign_smb3_pdu(conn, signing_key, iov, n_vec, signature))
+ if (!ksmbd_sign_smb3_pdu(conn, signing_key, iov, n_vec,
+ signature))
memcpy(hdr->Signature, signature, SMB2_SIGNATURE_SIZE);
}
@@ -8568,45 +8555,22 @@ static void fill_transform_hdr(void *tr_buf, char *old_buf, __le16 cipher_type)
int smb3_encrypt_resp(struct ksmbd_work *work)
{
- char *buf = work->response_buf;
- struct kvec iov[3];
+ struct kvec *iov = work->iov;
int rc = -ENOMEM;
- int buf_size = 0, rq_nvec = 2 + (work->aux_payload_sz ? 1 : 0);
+ void *tr_buf;
- if (ARRAY_SIZE(iov) < rq_nvec)
- return -ENOMEM;
-
- work->tr_buf = kzalloc(sizeof(struct smb2_transform_hdr) + 4, GFP_KERNEL);
- if (!work->tr_buf)
+ tr_buf = kzalloc(sizeof(struct smb2_transform_hdr) + 4, GFP_KERNEL);
+ if (!tr_buf)
return rc;
/* fill transform header */
- fill_transform_hdr(work->tr_buf, buf, work->conn->cipher_type);
+ fill_transform_hdr(tr_buf, work->response_buf, work->conn->cipher_type);
- iov[0].iov_base = work->tr_buf;
+ iov[0].iov_base = tr_buf;
iov[0].iov_len = sizeof(struct smb2_transform_hdr) + 4;
- buf_size += iov[0].iov_len - 4;
-
- iov[1].iov_base = buf + 4;
- iov[1].iov_len = get_rfc1002_len(buf);
- if (work->aux_payload_sz) {
- iov[1].iov_len = work->resp_hdr_sz - 4;
+ work->tr_buf = tr_buf;
- iov[2].iov_base = work->aux_payload_buf;
- iov[2].iov_len = work->aux_payload_sz;
- buf_size += iov[2].iov_len;
- }
- buf_size += iov[1].iov_len;
- work->resp_hdr_sz = iov[1].iov_len;
-
- rc = ksmbd_crypt_message(work, iov, rq_nvec, 1);
- if (rc)
- return rc;
-
- memmove(buf, iov[1].iov_base, iov[1].iov_len);
- *(__be32 *)work->tr_buf = cpu_to_be32(buf_size);
-
- return rc;
+ return ksmbd_crypt_message(work, iov, work->iov_idx + 1, 1);
}
bool smb3_is_transform_hdr(void *buf)
diff --git a/fs/smb/server/smb2pdu.h b/fs/smb/server/smb2pdu.h
index 2767c08a534a..d12cfd3b0927 100644
--- a/fs/smb/server/smb2pdu.h
+++ b/fs/smb/server/smb2pdu.h
@@ -361,7 +361,7 @@ struct smb2_ea_info {
__u8 Flags;
__u8 EaNameLength;
__le16 EaValueLength;
- char name[1];
+ char name[];
/* optionally followed by value */
} __packed; /* level 15 Query */
diff --git a/fs/smb/server/smb_common.c b/fs/smb/server/smb_common.c
index c2b75d898852..e6ba1e9b8589 100644
--- a/fs/smb/server/smb_common.c
+++ b/fs/smb/server/smb_common.c
@@ -319,12 +319,6 @@ static int init_smb1_rsp_hdr(struct ksmbd_work *work)
struct smb_hdr *rsp_hdr = (struct smb_hdr *)work->response_buf;
struct smb_hdr *rcv_hdr = (struct smb_hdr *)work->request_buf;
- /*
- * Remove 4 byte direct TCP header.
- */
- *(__be32 *)work->response_buf =
- cpu_to_be32(sizeof(struct smb_hdr) - 4);
-
rsp_hdr->Command = SMB_COM_NEGOTIATE;
*(__le32 *)rsp_hdr->Protocol = SMB1_PROTO_NUMBER;
rsp_hdr->Flags = SMBFLG_RESPONSE;
@@ -560,10 +554,11 @@ static int smb_handle_negotiate(struct ksmbd_work *work)
ksmbd_debug(SMB, "Unsupported SMB1 protocol\n");
- /* Add 2 byte bcc and 2 byte DialectIndex. */
- inc_rfc1001_len(work->response_buf, 4);
- neg_rsp->hdr.Status.CifsError = STATUS_SUCCESS;
+ if (ksmbd_iov_pin_rsp(work, (void *)neg_rsp,
+ sizeof(struct smb_negotiate_rsp) - 4))
+ return -ENOMEM;
+ neg_rsp->hdr.Status.CifsError = STATUS_SUCCESS;
neg_rsp->hdr.WordCount = 1;
neg_rsp->DialectIndex = cpu_to_le16(work->conn->dialect);
neg_rsp->ByteCount = 0;
diff --git a/fs/smb/server/smbacl.c b/fs/smb/server/smbacl.c
index e5e438bf5499..6c0305be895e 100644
--- a/fs/smb/server/smbacl.c
+++ b/fs/smb/server/smbacl.c
@@ -1420,7 +1420,6 @@ int set_info_sec(struct ksmbd_conn *conn, struct ksmbd_tree_connect *tcon,
out:
posix_acl_release(fattr.cf_acls);
posix_acl_release(fattr.cf_dacls);
- mark_inode_dirty(inode);
return rc;
}
diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c
index c06efc020bd9..3b269e1f523a 100644
--- a/fs/smb/server/transport_rdma.c
+++ b/fs/smb/server/transport_rdma.c
@@ -1241,14 +1241,12 @@ static int smb_direct_writev(struct ksmbd_transport *t,
//FIXME: skip RFC1002 header..
buflen -= 4;
- iov[0].iov_base += 4;
- iov[0].iov_len -= 4;
remaining_data_length = buflen;
ksmbd_debug(RDMA, "Sending smb (RDMA): smb_len=%u\n", buflen);
smb_direct_send_ctx_init(st, &send_ctx, need_invalidate, remote_key);
- start = i = 0;
+ start = i = 1;
buflen = 0;
while (true) {
buflen += iov[i].iov_len;
@@ -1366,24 +1364,35 @@ static int smb_direct_rdma_xmit(struct smb_direct_transport *t,
LIST_HEAD(msg_list);
char *desc_buf;
int credits_needed;
- unsigned int desc_buf_len;
- size_t total_length = 0;
+ unsigned int desc_buf_len, desc_num = 0;
if (t->status != SMB_DIRECT_CS_CONNECTED)
return -ENOTCONN;
+ if (buf_len > t->max_rdma_rw_size)
+ return -EINVAL;
+
/* calculate needed credits */
credits_needed = 0;
desc_buf = buf;
for (i = 0; i < desc_len / sizeof(*desc); i++) {
+ if (!buf_len)
+ break;
+
desc_buf_len = le32_to_cpu(desc[i].length);
+ if (!desc_buf_len)
+ return -EINVAL;
+
+ if (desc_buf_len > buf_len) {
+ desc_buf_len = buf_len;
+ desc[i].length = cpu_to_le32(desc_buf_len);
+ buf_len = 0;
+ }
credits_needed += calc_rw_credits(t, desc_buf, desc_buf_len);
desc_buf += desc_buf_len;
- total_length += desc_buf_len;
- if (desc_buf_len == 0 || total_length > buf_len ||
- total_length > t->max_rdma_rw_size)
- return -EINVAL;
+ buf_len -= desc_buf_len;
+ desc_num++;
}
ksmbd_debug(RDMA, "RDMA %s, len %#x, needed credits %#x\n",
@@ -1395,7 +1404,7 @@ static int smb_direct_rdma_xmit(struct smb_direct_transport *t,
/* build rdma_rw_ctx for each descriptor */
desc_buf = buf;
- for (i = 0; i < desc_len / sizeof(*desc); i++) {
+ for (i = 0; i < desc_num; i++) {
msg = kzalloc(offsetof(struct smb_direct_rdma_rw_msg, sg_list) +
sizeof(struct scatterlist) * SG_CHUNK_SIZE, GFP_KERNEL);
if (!msg) {
diff --git a/fs/smb/server/unicode.c b/fs/smb/server/unicode.c
index 9ae676906ed3..393dd4a7432b 100644
--- a/fs/smb/server/unicode.c
+++ b/fs/smb/server/unicode.c
@@ -11,7 +11,6 @@
#include <asm/unaligned.h>
#include "glob.h"
#include "unicode.h"
-#include "uniupr.h"
#include "smb_common.h"
/*
diff --git a/fs/smb/server/unicode.h b/fs/smb/server/unicode.h
index 076f6034a789..28c7c736f7bb 100644
--- a/fs/smb/server/unicode.h
+++ b/fs/smb/server/unicode.h
@@ -18,49 +18,14 @@
* This is a compressed table of upper and lower case conversion.
*
*/
-#ifndef _CIFS_UNICODE_H
-#define _CIFS_UNICODE_H
+#ifndef _SMB_UNICODE_H
+#define _SMB_UNICODE_H
#include <asm/byteorder.h>
#include <linux/types.h>
#include <linux/nls.h>
#include <linux/unicode.h>
-
-#define UNIUPR_NOLOWER /* Example to not expand lower case tables */
-
-/*
- * Windows maps these to the user defined 16 bit Unicode range since they are
- * reserved symbols (along with \ and /), otherwise illegal to store
- * in filenames in NTFS
- */
-#define UNI_ASTERISK ((__u16)('*' + 0xF000))
-#define UNI_QUESTION ((__u16)('?' + 0xF000))
-#define UNI_COLON ((__u16)(':' + 0xF000))
-#define UNI_GRTRTHAN ((__u16)('>' + 0xF000))
-#define UNI_LESSTHAN ((__u16)('<' + 0xF000))
-#define UNI_PIPE ((__u16)('|' + 0xF000))
-#define UNI_SLASH ((__u16)('\\' + 0xF000))
-
-/* Just define what we want from uniupr.h. We don't want to define the tables
- * in each source file.
- */
-#ifndef UNICASERANGE_DEFINED
-struct UniCaseRange {
- wchar_t start;
- wchar_t end;
- signed char *table;
-};
-#endif /* UNICASERANGE_DEFINED */
-
-#ifndef UNIUPR_NOUPPER
-extern signed char SmbUniUpperTable[512];
-extern const struct UniCaseRange SmbUniUpperRange[];
-#endif /* UNIUPR_NOUPPER */
-
-#ifndef UNIUPR_NOLOWER
-extern signed char CifsUniLowerTable[512];
-extern const struct UniCaseRange CifsUniLowerRange[];
-#endif /* UNIUPR_NOLOWER */
+#include "../../nls/nls_ucs2_utils.h"
#ifdef __KERNEL__
int smb_strtoUTF16(__le16 *to, const char *from, int len,
@@ -73,286 +38,4 @@ int smbConvertToUTF16(__le16 *target, const char *source, int srclen,
char *ksmbd_extract_sharename(struct unicode_map *um, const char *treename);
#endif
-/*
- * UniStrcat: Concatenate the second string to the first
- *
- * Returns:
- * Address of the first string
- */
-static inline wchar_t *UniStrcat(wchar_t *ucs1, const wchar_t *ucs2)
-{
- wchar_t *anchor = ucs1; /* save a pointer to start of ucs1 */
-
- while (*ucs1++)
- /*NULL*/; /* To end of first string */
- ucs1--; /* Return to the null */
- while ((*ucs1++ = *ucs2++))
- /*NULL*/; /* copy string 2 over */
- return anchor;
-}
-
-/*
- * UniStrchr: Find a character in a string
- *
- * Returns:
- * Address of first occurrence of character in string
- * or NULL if the character is not in the string
- */
-static inline wchar_t *UniStrchr(const wchar_t *ucs, wchar_t uc)
-{
- while ((*ucs != uc) && *ucs)
- ucs++;
-
- if (*ucs == uc)
- return (wchar_t *)ucs;
- return NULL;
-}
-
-/*
- * UniStrcmp: Compare two strings
- *
- * Returns:
- * < 0: First string is less than second
- * = 0: Strings are equal
- * > 0: First string is greater than second
- */
-static inline int UniStrcmp(const wchar_t *ucs1, const wchar_t *ucs2)
-{
- while ((*ucs1 == *ucs2) && *ucs1) {
- ucs1++;
- ucs2++;
- }
- return (int)*ucs1 - (int)*ucs2;
-}
-
-/*
- * UniStrcpy: Copy a string
- */
-static inline wchar_t *UniStrcpy(wchar_t *ucs1, const wchar_t *ucs2)
-{
- wchar_t *anchor = ucs1; /* save the start of result string */
-
- while ((*ucs1++ = *ucs2++))
- /*NULL*/;
- return anchor;
-}
-
-/*
- * UniStrlen: Return the length of a string (in 16 bit Unicode chars not bytes)
- */
-static inline size_t UniStrlen(const wchar_t *ucs1)
-{
- int i = 0;
-
- while (*ucs1++)
- i++;
- return i;
-}
-
-/*
- * UniStrnlen: Return the length (in 16 bit Unicode chars not bytes) of a
- * string (length limited)
- */
-static inline size_t UniStrnlen(const wchar_t *ucs1, int maxlen)
-{
- int i = 0;
-
- while (*ucs1++) {
- i++;
- if (i >= maxlen)
- break;
- }
- return i;
-}
-
-/*
- * UniStrncat: Concatenate length limited string
- */
-static inline wchar_t *UniStrncat(wchar_t *ucs1, const wchar_t *ucs2, size_t n)
-{
- wchar_t *anchor = ucs1; /* save pointer to string 1 */
-
- while (*ucs1++)
- /*NULL*/;
- ucs1--; /* point to null terminator of s1 */
- while (n-- && (*ucs1 = *ucs2)) { /* copy s2 after s1 */
- ucs1++;
- ucs2++;
- }
- *ucs1 = 0; /* Null terminate the result */
- return anchor;
-}
-
-/*
- * UniStrncmp: Compare length limited string
- */
-static inline int UniStrncmp(const wchar_t *ucs1, const wchar_t *ucs2, size_t n)
-{
- if (!n)
- return 0; /* Null strings are equal */
- while ((*ucs1 == *ucs2) && *ucs1 && --n) {
- ucs1++;
- ucs2++;
- }
- return (int)*ucs1 - (int)*ucs2;
-}
-
-/*
- * UniStrncmp_le: Compare length limited string - native to little-endian
- */
-static inline int
-UniStrncmp_le(const wchar_t *ucs1, const wchar_t *ucs2, size_t n)
-{
- if (!n)
- return 0; /* Null strings are equal */
- while ((*ucs1 == __le16_to_cpu(*ucs2)) && *ucs1 && --n) {
- ucs1++;
- ucs2++;
- }
- return (int)*ucs1 - (int)__le16_to_cpu(*ucs2);
-}
-
-/*
- * UniStrncpy: Copy length limited string with pad
- */
-static inline wchar_t *UniStrncpy(wchar_t *ucs1, const wchar_t *ucs2, size_t n)
-{
- wchar_t *anchor = ucs1;
-
- while (n-- && *ucs2) /* Copy the strings */
- *ucs1++ = *ucs2++;
-
- n++;
- while (n--) /* Pad with nulls */
- *ucs1++ = 0;
- return anchor;
-}
-
-/*
- * UniStrncpy_le: Copy length limited string with pad to little-endian
- */
-static inline wchar_t *UniStrncpy_le(wchar_t *ucs1, const wchar_t *ucs2, size_t n)
-{
- wchar_t *anchor = ucs1;
-
- while (n-- && *ucs2) /* Copy the strings */
- *ucs1++ = __le16_to_cpu(*ucs2++);
-
- n++;
- while (n--) /* Pad with nulls */
- *ucs1++ = 0;
- return anchor;
-}
-
-/*
- * UniStrstr: Find a string in a string
- *
- * Returns:
- * Address of first match found
- * NULL if no matching string is found
- */
-static inline wchar_t *UniStrstr(const wchar_t *ucs1, const wchar_t *ucs2)
-{
- const wchar_t *anchor1 = ucs1;
- const wchar_t *anchor2 = ucs2;
-
- while (*ucs1) {
- if (*ucs1 == *ucs2) {
- /* Partial match found */
- ucs1++;
- ucs2++;
- } else {
- if (!*ucs2) /* Match found */
- return (wchar_t *)anchor1;
- ucs1 = ++anchor1; /* No match */
- ucs2 = anchor2;
- }
- }
-
- if (!*ucs2) /* Both end together */
- return (wchar_t *)anchor1; /* Match found */
- return NULL; /* No match */
-}
-
-#ifndef UNIUPR_NOUPPER
-/*
- * UniToupper: Convert a unicode character to upper case
- */
-static inline wchar_t UniToupper(register wchar_t uc)
-{
- register const struct UniCaseRange *rp;
-
- if (uc < sizeof(SmbUniUpperTable)) {
- /* Latin characters */
- return uc + SmbUniUpperTable[uc]; /* Use base tables */
- }
-
- rp = SmbUniUpperRange; /* Use range tables */
- while (rp->start) {
- if (uc < rp->start) /* Before start of range */
- return uc; /* Uppercase = input */
- if (uc <= rp->end) /* In range */
- return uc + rp->table[uc - rp->start];
- rp++; /* Try next range */
- }
- return uc; /* Past last range */
-}
-
-/*
- * UniStrupr: Upper case a unicode string
- */
-static inline __le16 *UniStrupr(register __le16 *upin)
-{
- register __le16 *up;
-
- up = upin;
- while (*up) { /* For all characters */
- *up = cpu_to_le16(UniToupper(le16_to_cpu(*up)));
- up++;
- }
- return upin; /* Return input pointer */
-}
-#endif /* UNIUPR_NOUPPER */
-
-#ifndef UNIUPR_NOLOWER
-/*
- * UniTolower: Convert a unicode character to lower case
- */
-static inline wchar_t UniTolower(register wchar_t uc)
-{
- register const struct UniCaseRange *rp;
-
- if (uc < sizeof(CifsUniLowerTable)) {
- /* Latin characters */
- return uc + CifsUniLowerTable[uc]; /* Use base tables */
- }
-
- rp = CifsUniLowerRange; /* Use range tables */
- while (rp->start) {
- if (uc < rp->start) /* Before start of range */
- return uc; /* Uppercase = input */
- if (uc <= rp->end) /* In range */
- return uc + rp->table[uc - rp->start];
- rp++; /* Try next range */
- }
- return uc; /* Past last range */
-}
-
-/*
- * UniStrlwr: Lower case a unicode string
- */
-static inline wchar_t *UniStrlwr(register wchar_t *upin)
-{
- register wchar_t *up;
-
- up = upin;
- while (*up) { /* For all characters */
- *up = UniTolower(*up);
- up++;
- }
- return upin; /* Return input pointer */
-}
-
-#endif
-
-#endif /* _CIFS_UNICODE_H */
+#endif /* _SMB_UNICODE_H */
diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c
index 3d5d652153a5..b5a5e50fc9ca 100644
--- a/fs/smb/server/vfs.c
+++ b/fs/smb/server/vfs.c
@@ -367,15 +367,15 @@ out:
* @fid: file id of open file
* @count: read byte count
* @pos: file pos
+ * @rbuf: read data buffer
*
* Return: number of read bytes on success, otherwise error
*/
int ksmbd_vfs_read(struct ksmbd_work *work, struct ksmbd_file *fp, size_t count,
- loff_t *pos)
+ loff_t *pos, char *rbuf)
{
struct file *filp = fp->filp;
ssize_t nbytes = 0;
- char *rbuf = work->aux_payload_buf;
struct inode *inode = file_inode(filp);
if (S_ISDIR(inode->i_mode))
@@ -1659,7 +1659,8 @@ int ksmbd_vfs_fill_dentry_attrs(struct ksmbd_work *work,
u64 time;
int rc;
- generic_fillattr(idmap, d_inode(dentry), ksmbd_kstat->kstat);
+ generic_fillattr(idmap, STATX_BASIC_STATS, d_inode(dentry),
+ ksmbd_kstat->kstat);
time = ksmbd_UnixTimeToNT(ksmbd_kstat->kstat->ctime);
ksmbd_kstat->create_time = time;
diff --git a/fs/smb/server/vfs.h b/fs/smb/server/vfs.h
index 72f9fb4b48d1..00968081856e 100644
--- a/fs/smb/server/vfs.h
+++ b/fs/smb/server/vfs.h
@@ -76,8 +76,8 @@ void ksmbd_vfs_query_maximal_access(struct mnt_idmap *idmap,
struct dentry *dentry, __le32 *daccess);
int ksmbd_vfs_create(struct ksmbd_work *work, const char *name, umode_t mode);
int ksmbd_vfs_mkdir(struct ksmbd_work *work, const char *name, umode_t mode);
-int ksmbd_vfs_read(struct ksmbd_work *work, struct ksmbd_file *fp,
- size_t count, loff_t *pos);
+int ksmbd_vfs_read(struct ksmbd_work *work, struct ksmbd_file *fp, size_t count,
+ loff_t *pos, char *rbuf);
int ksmbd_vfs_write(struct ksmbd_work *work, struct ksmbd_file *fp,
char *buf, size_t count, loff_t *pos, bool sync,
ssize_t *written);
diff --git a/fs/smb/server/vfs_cache.c b/fs/smb/server/vfs_cache.c
index f41f8d6108ce..c91eac6514dd 100644
--- a/fs/smb/server/vfs_cache.c
+++ b/fs/smb/server/vfs_cache.c
@@ -106,7 +106,7 @@ int ksmbd_query_inode_status(struct inode *inode)
ci = __ksmbd_inode_lookup(inode);
if (ci) {
ret = KSMBD_INODE_STATUS_OK;
- if (ci->m_flags & S_DEL_PENDING)
+ if (ci->m_flags & (S_DEL_PENDING | S_DEL_ON_CLS))
ret = KSMBD_INODE_STATUS_PENDING_DELETE;
atomic_dec(&ci->m_count);
}
@@ -116,7 +116,7 @@ int ksmbd_query_inode_status(struct inode *inode)
bool ksmbd_inode_pending_delete(struct ksmbd_file *fp)
{
- return (fp->f_ci->m_flags & S_DEL_PENDING);
+ return (fp->f_ci->m_flags & (S_DEL_PENDING | S_DEL_ON_CLS));
}
void ksmbd_set_inode_pending_delete(struct ksmbd_file *fp)
@@ -333,6 +333,9 @@ static void __ksmbd_close_fd(struct ksmbd_file_table *ft, struct ksmbd_file *fp)
static struct ksmbd_file *ksmbd_fp_get(struct ksmbd_file *fp)
{
+ if (fp->f_state != FP_INITED)
+ return NULL;
+
if (!atomic_inc_not_zero(&fp->refcount))
return NULL;
return fp;
@@ -382,15 +385,20 @@ int ksmbd_close_fd(struct ksmbd_work *work, u64 id)
return 0;
ft = &work->sess->file_table;
- read_lock(&ft->lock);
+ write_lock(&ft->lock);
fp = idr_find(ft->idr, id);
if (fp) {
set_close_state_blocked_works(fp);
- if (!atomic_dec_and_test(&fp->refcount))
+ if (fp->f_state != FP_INITED)
fp = NULL;
+ else {
+ fp->f_state = FP_CLOSED;
+ if (!atomic_dec_and_test(&fp->refcount))
+ fp = NULL;
+ }
}
- read_unlock(&ft->lock);
+ write_unlock(&ft->lock);
if (!fp)
return -EINVAL;
@@ -570,6 +578,7 @@ struct ksmbd_file *ksmbd_open_fd(struct ksmbd_work *work, struct file *filp)
fp->tcon = work->tcon;
fp->volatile_id = KSMBD_NO_FID;
fp->persistent_id = KSMBD_NO_FID;
+ fp->f_state = FP_NEW;
fp->f_ci = ksmbd_inode_get(fp);
if (!fp->f_ci) {
@@ -591,6 +600,17 @@ err_out:
return ERR_PTR(ret);
}
+void ksmbd_update_fstate(struct ksmbd_file_table *ft, struct ksmbd_file *fp,
+ unsigned int state)
+{
+ if (!fp)
+ return;
+
+ write_lock(&ft->lock);
+ fp->f_state = state;
+ write_unlock(&ft->lock);
+}
+
static int
__close_file_table_ids(struct ksmbd_file_table *ft,
struct ksmbd_tree_connect *tcon,
diff --git a/fs/smb/server/vfs_cache.h b/fs/smb/server/vfs_cache.h
index fcb13413fa8d..03d0bf941216 100644
--- a/fs/smb/server/vfs_cache.h
+++ b/fs/smb/server/vfs_cache.h
@@ -60,6 +60,12 @@ struct ksmbd_inode {
__le32 m_fattr;
};
+enum {
+ FP_NEW = 0,
+ FP_INITED,
+ FP_CLOSED
+};
+
struct ksmbd_file {
struct file *filp;
u64 persistent_id;
@@ -98,6 +104,7 @@ struct ksmbd_file {
/* if ls is happening on directory, below is valid*/
struct ksmbd_readdir_data readdir_data;
int dot_dotdot[2];
+ unsigned int f_state;
};
static inline void set_ctx_actor(struct dir_context *ctx,
@@ -142,6 +149,8 @@ int ksmbd_close_inode_fds(struct ksmbd_work *work, struct inode *inode);
int ksmbd_init_global_file_table(void);
void ksmbd_free_global_file_table(void);
void ksmbd_set_fd_limit(unsigned long limit);
+void ksmbd_update_fstate(struct ksmbd_file_table *ft, struct ksmbd_file *fp,
+ unsigned int state);
/*
* INODE hash
diff --git a/fs/splice.c b/fs/splice.c
index 3e2a31e1ce6a..d983d375ff11 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -83,8 +83,7 @@ static bool page_cache_pipe_buf_try_steal(struct pipe_inode_info *pipe,
*/
folio_wait_writeback(folio);
- if (folio_has_private(folio) &&
- !filemap_release_folio(folio, GFP_KERNEL))
+ if (!filemap_release_folio(folio, GFP_KERNEL))
goto out_unlock;
/*
@@ -120,17 +119,17 @@ static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe,
static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe,
struct pipe_buffer *buf)
{
- struct page *page = buf->page;
+ struct folio *folio = page_folio(buf->page);
int err;
- if (!PageUptodate(page)) {
- lock_page(page);
+ if (!folio_test_uptodate(folio)) {
+ folio_lock(folio);
/*
- * Page got truncated/unhashed. This will cause a 0-byte
+ * Folio got truncated/unhashed. This will cause a 0-byte
* splice, if this is the first page.
*/
- if (!page->mapping) {
+ if (!folio->mapping) {
err = -ENODATA;
goto error;
}
@@ -138,20 +137,18 @@ static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe,
/*
* Uh oh, read-error from disk.
*/
- if (!PageUptodate(page)) {
+ if (!folio_test_uptodate(folio)) {
err = -EIO;
goto error;
}
- /*
- * Page is ok afterall, we are done.
- */
- unlock_page(page);
+ /* Folio is ok after all, we are done */
+ folio_unlock(folio);
}
return 0;
error:
- unlock_page(page);
+ folio_unlock(folio);
return err;
}
@@ -1269,10 +1266,8 @@ long do_splice(struct file *in, loff_t *off_in, struct file *out,
if ((in->f_flags | out->f_flags) & O_NONBLOCK)
flags |= SPLICE_F_NONBLOCK;
- return splice_pipe_to_pipe(ipipe, opipe, len, flags);
- }
-
- if (ipipe) {
+ ret = splice_pipe_to_pipe(ipipe, opipe, len, flags);
+ } else if (ipipe) {
if (off_in)
return -ESPIPE;
if (off_out) {
@@ -1297,18 +1292,11 @@ long do_splice(struct file *in, loff_t *off_in, struct file *out,
ret = do_splice_from(ipipe, out, &offset, len, flags);
file_end_write(out);
- if (ret > 0)
- fsnotify_modify(out);
-
if (!off_out)
out->f_pos = offset;
else
*off_out = offset;
-
- return ret;
- }
-
- if (opipe) {
+ } else if (opipe) {
if (off_out)
return -ESPIPE;
if (off_in) {
@@ -1324,18 +1312,25 @@ long do_splice(struct file *in, loff_t *off_in, struct file *out,
ret = splice_file_to_pipe(in, opipe, &offset, len, flags);
- if (ret > 0)
- fsnotify_access(in);
-
if (!off_in)
in->f_pos = offset;
else
*off_in = offset;
+ } else {
+ ret = -EINVAL;
+ }
- return ret;
+ if (ret > 0) {
+ /*
+ * Generate modify out before access in:
+ * do_splice_from() may've already sent modify out,
+ * and this ensures the events get merged.
+ */
+ fsnotify_modify(out);
+ fsnotify_access(in);
}
- return -EINVAL;
+ return ret;
}
static long __do_splice(struct file *in, loff_t __user *off_in,
@@ -1464,6 +1459,9 @@ static long vmsplice_to_user(struct file *file, struct iov_iter *iter,
pipe_unlock(pipe);
}
+ if (ret > 0)
+ fsnotify_access(file);
+
return ret;
}
@@ -1493,8 +1491,10 @@ static long vmsplice_to_pipe(struct file *file, struct iov_iter *iter,
if (!ret)
ret = iter_to_pipe(iter, pipe, buf_flag);
pipe_unlock(pipe);
- if (ret > 0)
+ if (ret > 0) {
wakeup_pipe_readers(pipe);
+ fsnotify_modify(file);
+ }
return ret;
}
@@ -1928,6 +1928,11 @@ long do_tee(struct file *in, struct file *out, size_t len, unsigned int flags)
}
}
+ if (ret > 0) {
+ fsnotify_access(in);
+ fsnotify_modify(out);
+ }
+
return ret;
}
diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c
index 24463145b351..c6e626b00546 100644
--- a/fs/squashfs/inode.c
+++ b/fs/squashfs/inode.c
@@ -61,7 +61,7 @@ static int squashfs_new_inode(struct super_block *sb, struct inode *inode,
inode->i_ino = le32_to_cpu(sqsh_ino->inode_number);
inode->i_mtime.tv_sec = le32_to_cpu(sqsh_ino->mtime);
inode->i_atime.tv_sec = inode->i_mtime.tv_sec;
- inode->i_ctime.tv_sec = inode->i_mtime.tv_sec;
+ inode_set_ctime(inode, inode->i_mtime.tv_sec, 0);
inode->i_mode = le16_to_cpu(sqsh_ino->mode);
inode->i_size = 0;
diff --git a/fs/stack.c b/fs/stack.c
index c9830924eb12..b5e01bdb5f5f 100644
--- a/fs/stack.c
+++ b/fs/stack.c
@@ -68,7 +68,7 @@ void fsstack_copy_attr_all(struct inode *dest, const struct inode *src)
dest->i_rdev = src->i_rdev;
dest->i_atime = src->i_atime;
dest->i_mtime = src->i_mtime;
- dest->i_ctime = src->i_ctime;
+ inode_set_ctime_to_ts(dest, inode_get_ctime(src));
dest->i_blkbits = src->i_blkbits;
dest->i_flags = src->i_flags;
set_nlink(dest, src->i_nlink);
diff --git a/fs/stat.c b/fs/stat.c
index 7c238da22ef0..d43a5cc1bfa4 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -28,9 +28,10 @@
/**
* generic_fillattr - Fill in the basic attributes from the inode struct
- * @idmap: idmap of the mount the inode was found from
- * @inode: Inode to use as the source
- * @stat: Where to fill in the attributes
+ * @idmap: idmap of the mount the inode was found from
+ * @request_mask: statx request_mask
+ * @inode: Inode to use as the source
+ * @stat: Where to fill in the attributes
*
* Fill in the basic attributes in the kstat structure from data that's to be
* found on the VFS inode structure. This is the default if no getattr inode
@@ -42,8 +43,8 @@
* uid and gid filds. On non-idmapped mounts or if permission checking is to be
* performed on the raw inode simply passs @nop_mnt_idmap.
*/
-void generic_fillattr(struct mnt_idmap *idmap, struct inode *inode,
- struct kstat *stat)
+void generic_fillattr(struct mnt_idmap *idmap, u32 request_mask,
+ struct inode *inode, struct kstat *stat)
{
vfsuid_t vfsuid = i_uid_into_vfsuid(idmap, inode);
vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, inode);
@@ -58,9 +59,15 @@ void generic_fillattr(struct mnt_idmap *idmap, struct inode *inode,
stat->size = i_size_read(inode);
stat->atime = inode->i_atime;
stat->mtime = inode->i_mtime;
- stat->ctime = inode->i_ctime;
+ stat->ctime = inode_get_ctime(inode);
stat->blksize = i_blocksize(inode);
stat->blocks = inode->i_blocks;
+
+ if ((request_mask & STATX_CHANGE_COOKIE) && IS_I_VERSION(inode)) {
+ stat->result_mask |= STATX_CHANGE_COOKIE;
+ stat->change_cookie = inode_query_iversion(inode);
+ }
+
}
EXPORT_SYMBOL(generic_fillattr);
@@ -123,17 +130,12 @@ int vfs_getattr_nosec(const struct path *path, struct kstat *stat,
stat->attributes_mask |= (STATX_ATTR_AUTOMOUNT |
STATX_ATTR_DAX);
- if ((request_mask & STATX_CHANGE_COOKIE) && IS_I_VERSION(inode)) {
- stat->result_mask |= STATX_CHANGE_COOKIE;
- stat->change_cookie = inode_query_iversion(inode);
- }
-
idmap = mnt_idmap(path->mnt);
if (inode->i_op->getattr)
return inode->i_op->getattr(idmap, path, stat,
request_mask, query_flags);
- generic_fillattr(idmap, inode, stat);
+ generic_fillattr(idmap, request_mask, inode, stat);
return 0;
}
EXPORT_SYMBOL(vfs_getattr_nosec);
@@ -272,6 +274,23 @@ int vfs_fstatat(int dfd, const char __user *filename,
int statx_flags = flags | AT_NO_AUTOMOUNT;
struct filename *name;
+ /*
+ * Work around glibc turning fstat() into fstatat(AT_EMPTY_PATH)
+ *
+ * If AT_EMPTY_PATH is set, we expect the common case to be that
+ * empty path, and avoid doing all the extra pathname work.
+ */
+ if (dfd >= 0 && flags == AT_EMPTY_PATH) {
+ char c;
+
+ ret = get_user(c, filename);
+ if (unlikely(ret))
+ return ret;
+
+ if (likely(!c))
+ return vfs_fstat(dfd, stat);
+ }
+
name = getname_flags(filename, getname_statx_lookup_flags(statx_flags), NULL);
ret = vfs_statx(dfd, name, statx_flags, stat, STATX_BASIC_STATS);
putname(name);
@@ -363,12 +382,6 @@ SYSCALL_DEFINE2(fstat, unsigned int, fd, struct __old_kernel_stat __user *, stat
#ifdef __ARCH_WANT_NEW_STAT
-#if BITS_PER_LONG == 32
-# define choose_32_64(a,b) a
-#else
-# define choose_32_64(a,b) b
-#endif
-
#ifndef INIT_STRUCT_STAT_PADDING
# define INIT_STRUCT_STAT_PADDING(st) memset(&st, 0, sizeof(st))
#endif
diff --git a/fs/super.c b/fs/super.c
index e781226e2880..2d762ce67f6e 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -39,7 +39,7 @@
#include <uapi/linux/mount.h>
#include "internal.h"
-static int thaw_super_locked(struct super_block *sb);
+static int thaw_super_locked(struct super_block *sb, enum freeze_holder who);
static LIST_HEAD(super_blocks);
static DEFINE_SPINLOCK(sb_lock);
@@ -50,6 +50,130 @@ static char *sb_writers_name[SB_FREEZE_LEVELS] = {
"sb_internal",
};
+static inline void __super_lock(struct super_block *sb, bool excl)
+{
+ if (excl)
+ down_write(&sb->s_umount);
+ else
+ down_read(&sb->s_umount);
+}
+
+static inline void super_unlock(struct super_block *sb, bool excl)
+{
+ if (excl)
+ up_write(&sb->s_umount);
+ else
+ up_read(&sb->s_umount);
+}
+
+static inline void __super_lock_excl(struct super_block *sb)
+{
+ __super_lock(sb, true);
+}
+
+static inline void super_unlock_excl(struct super_block *sb)
+{
+ super_unlock(sb, true);
+}
+
+static inline void super_unlock_shared(struct super_block *sb)
+{
+ super_unlock(sb, false);
+}
+
+static inline bool wait_born(struct super_block *sb)
+{
+ unsigned int flags;
+
+ /*
+ * Pairs with smp_store_release() in super_wake() and ensures
+ * that we see SB_BORN or SB_DYING after we're woken.
+ */
+ flags = smp_load_acquire(&sb->s_flags);
+ return flags & (SB_BORN | SB_DYING);
+}
+
+/**
+ * super_lock - wait for superblock to become ready and lock it
+ * @sb: superblock to wait for
+ * @excl: whether exclusive access is required
+ *
+ * If the superblock has neither passed through vfs_get_tree() or
+ * generic_shutdown_super() yet wait for it to happen. Either superblock
+ * creation will succeed and SB_BORN is set by vfs_get_tree() or we're
+ * woken and we'll see SB_DYING.
+ *
+ * The caller must have acquired a temporary reference on @sb->s_count.
+ *
+ * Return: This returns true if SB_BORN was set, false if SB_DYING was
+ * set. The function acquires s_umount and returns with it held.
+ */
+static __must_check bool super_lock(struct super_block *sb, bool excl)
+{
+
+ lockdep_assert_not_held(&sb->s_umount);
+
+relock:
+ __super_lock(sb, excl);
+
+ /*
+ * Has gone through generic_shutdown_super() in the meantime.
+ * @sb->s_root is NULL and @sb->s_active is 0. No one needs to
+ * grab a reference to this. Tell them so.
+ */
+ if (sb->s_flags & SB_DYING)
+ return false;
+
+ /* Has called ->get_tree() successfully. */
+ if (sb->s_flags & SB_BORN)
+ return true;
+
+ super_unlock(sb, excl);
+
+ /* wait until the superblock is ready or dying */
+ wait_var_event(&sb->s_flags, wait_born(sb));
+
+ /*
+ * Neither SB_BORN nor SB_DYING are ever unset so we never loop.
+ * Just reacquire @sb->s_umount for the caller.
+ */
+ goto relock;
+}
+
+/* wait and acquire read-side of @sb->s_umount */
+static inline bool super_lock_shared(struct super_block *sb)
+{
+ return super_lock(sb, false);
+}
+
+/* wait and acquire write-side of @sb->s_umount */
+static inline bool super_lock_excl(struct super_block *sb)
+{
+ return super_lock(sb, true);
+}
+
+/* wake waiters */
+#define SUPER_WAKE_FLAGS (SB_BORN | SB_DYING | SB_DEAD)
+static void super_wake(struct super_block *sb, unsigned int flag)
+{
+ WARN_ON_ONCE((flag & ~SUPER_WAKE_FLAGS));
+ WARN_ON_ONCE(hweight32(flag & SUPER_WAKE_FLAGS) > 1);
+
+ /*
+ * Pairs with smp_load_acquire() in super_lock() to make sure
+ * all initializations in the superblock are seen by the user
+ * seeing SB_BORN sent.
+ */
+ smp_store_release(&sb->s_flags, sb->s_flags | flag);
+ /*
+ * Pairs with the barrier in prepare_to_wait_event() to make sure
+ * ___wait_var_event() either sees SB_BORN set or
+ * waitqueue_active() check in wake_up_var() sees the waiter.
+ */
+ smp_mb();
+ wake_up_var(&sb->s_flags);
+}
+
/*
* One thing we have to be careful of with a per-sb shrinker is that we don't
* drop the last active reference to the superblock from within the shrinker.
@@ -76,7 +200,7 @@ static unsigned long super_cache_scan(struct shrinker *shrink,
if (!(sc->gfp_mask & __GFP_FS))
return SHRINK_STOP;
- if (!trylock_super(sb))
+ if (!super_trylock_shared(sb))
return SHRINK_STOP;
if (sb->s_op->nr_cached_objects)
@@ -110,7 +234,7 @@ static unsigned long super_cache_scan(struct shrinker *shrink,
freed += sb->s_op->free_cached_objects(sb, sc);
}
- up_read(&sb->s_umount);
+ super_unlock_shared(sb);
return freed;
}
@@ -123,17 +247,17 @@ static unsigned long super_cache_count(struct shrinker *shrink,
sb = container_of(shrink, struct super_block, s_shrink);
/*
- * We don't call trylock_super() here as it is a scalability bottleneck,
- * so we're exposed to partial setup state. The shrinker rwsem does not
- * protect filesystem operations backing list_lru_shrink_count() or
- * s_op->nr_cached_objects(). Counts can change between
- * super_cache_count and super_cache_scan, so we really don't need locks
- * here.
+ * We don't call super_trylock_shared() here as it is a scalability
+ * bottleneck, so we're exposed to partial setup state. The shrinker
+ * rwsem does not protect filesystem operations backing
+ * list_lru_shrink_count() or s_op->nr_cached_objects(). Counts can
+ * change between super_cache_count and super_cache_scan, so we really
+ * don't need locks here.
*
* However, if we are currently mounting the superblock, the underlying
* filesystem might be in a state of partial construction and hence it
- * is dangerous to access it. trylock_super() uses a SB_BORN check to
- * avoid this situation, so do the same here. The memory barrier is
+ * is dangerous to access it. super_trylock_shared() uses a SB_BORN check
+ * to avoid this situation, so do the same here. The memory barrier is
* matched with the one in mount_fs() as we don't hold locks here.
*/
if (!(sb->s_flags & SB_BORN))
@@ -176,7 +300,7 @@ static void destroy_unused_super(struct super_block *s)
{
if (!s)
return;
- up_write(&s->s_umount);
+ super_unlock_excl(s);
list_lru_destroy(&s->s_dentry_lru);
list_lru_destroy(&s->s_inode_lru);
security_sb_free(s);
@@ -310,6 +434,33 @@ void put_super(struct super_block *sb)
spin_unlock(&sb_lock);
}
+static void kill_super_notify(struct super_block *sb)
+{
+ lockdep_assert_not_held(&sb->s_umount);
+
+ /* already notified earlier */
+ if (sb->s_flags & SB_DEAD)
+ return;
+
+ /*
+ * Remove it from @fs_supers so it isn't found by new
+ * sget{_fc}() walkers anymore. Any concurrent mounter still
+ * managing to grab a temporary reference is guaranteed to
+ * already see SB_DYING and will wait until we notify them about
+ * SB_DEAD.
+ */
+ spin_lock(&sb_lock);
+ hlist_del_init(&sb->s_instances);
+ spin_unlock(&sb_lock);
+
+ /*
+ * Let concurrent mounts know that this thing is really dead.
+ * We don't need @sb->s_umount here as every concurrent caller
+ * will see SB_DYING and either discard the superblock or wait
+ * for SB_DEAD.
+ */
+ super_wake(sb, SB_DEAD);
+}
/**
* deactivate_locked_super - drop an active reference to superblock
@@ -329,6 +480,8 @@ void deactivate_locked_super(struct super_block *s)
unregister_shrinker(&s->s_shrink);
fs->kill_sb(s);
+ kill_super_notify(s);
+
/*
* Since list_lru_destroy() may sleep, we cannot call it from
* put_super(), where we hold the sb_lock. Therefore we destroy
@@ -340,7 +493,7 @@ void deactivate_locked_super(struct super_block *s)
put_filesystem(fs);
put_super(s);
} else {
- up_write(&s->s_umount);
+ super_unlock_excl(s);
}
}
@@ -357,7 +510,7 @@ EXPORT_SYMBOL(deactivate_locked_super);
void deactivate_super(struct super_block *s)
{
if (!atomic_add_unless(&s->s_active, -1, 1)) {
- down_write(&s->s_umount);
+ __super_lock_excl(s);
deactivate_locked_super(s);
}
}
@@ -379,20 +532,61 @@ EXPORT_SYMBOL(deactivate_super);
*/
static int grab_super(struct super_block *s) __releases(sb_lock)
{
+ bool born;
+
s->s_count++;
spin_unlock(&sb_lock);
- down_write(&s->s_umount);
- if ((s->s_flags & SB_BORN) && atomic_inc_not_zero(&s->s_active)) {
+ born = super_lock_excl(s);
+ if (born && atomic_inc_not_zero(&s->s_active)) {
put_super(s);
return 1;
}
- up_write(&s->s_umount);
+ super_unlock_excl(s);
put_super(s);
return 0;
}
+static inline bool wait_dead(struct super_block *sb)
+{
+ unsigned int flags;
+
+ /*
+ * Pairs with memory barrier in super_wake() and ensures
+ * that we see SB_DEAD after we're woken.
+ */
+ flags = smp_load_acquire(&sb->s_flags);
+ return flags & SB_DEAD;
+}
+
+/**
+ * grab_super_dead - acquire an active reference to a superblock
+ * @sb: superblock to acquire
+ *
+ * Acquire a temporary reference on a superblock and try to trade it for
+ * an active reference. This is used in sget{_fc}() to wait for a
+ * superblock to either become SB_BORN or for it to pass through
+ * sb->kill() and be marked as SB_DEAD.
+ *
+ * Return: This returns true if an active reference could be acquired,
+ * false if not.
+ */
+static bool grab_super_dead(struct super_block *sb)
+{
+
+ sb->s_count++;
+ if (grab_super(sb)) {
+ put_super(sb);
+ lockdep_assert_held(&sb->s_umount);
+ return true;
+ }
+ wait_var_event(&sb->s_flags, wait_dead(sb));
+ lockdep_assert_not_held(&sb->s_umount);
+ put_super(sb);
+ return false;
+}
+
/*
- * trylock_super - try to grab ->s_umount shared
+ * super_trylock_shared - try to grab ->s_umount shared
* @sb: reference we are trying to grab
*
* Try to prevent fs shutdown. This is used in places where we
@@ -408,13 +602,13 @@ static int grab_super(struct super_block *s) __releases(sb_lock)
* of down_read(). There's a couple of places that are OK with that, but
* it's very much not a general-purpose interface.
*/
-bool trylock_super(struct super_block *sb)
+bool super_trylock_shared(struct super_block *sb)
{
if (down_read_trylock(&sb->s_umount)) {
- if (!hlist_unhashed(&sb->s_instances) &&
- sb->s_root && (sb->s_flags & SB_BORN))
+ if (!(sb->s_flags & SB_DYING) && sb->s_root &&
+ (sb->s_flags & SB_BORN))
return true;
- up_read(&sb->s_umount);
+ super_unlock_shared(sb);
}
return false;
@@ -439,13 +633,13 @@ bool trylock_super(struct super_block *sb)
void retire_super(struct super_block *sb)
{
WARN_ON(!sb->s_bdev);
- down_write(&sb->s_umount);
+ __super_lock_excl(sb);
if (sb->s_iflags & SB_I_PERSB_BDI) {
bdi_unregister(sb->s_bdi);
sb->s_iflags &= ~SB_I_PERSB_BDI;
}
sb->s_iflags |= SB_I_RETIRED;
- up_write(&sb->s_umount);
+ super_unlock_excl(sb);
}
EXPORT_SYMBOL(retire_super);
@@ -517,11 +711,17 @@ void generic_shutdown_super(struct super_block *sb)
spin_unlock(&sb->s_inode_list_lock);
}
}
- spin_lock(&sb_lock);
- /* should be initialized for __put_super_and_need_restart() */
- hlist_del_init(&sb->s_instances);
- spin_unlock(&sb_lock);
- up_write(&sb->s_umount);
+ /*
+ * Broadcast to everyone that grabbed a temporary reference to this
+ * superblock before we removed it from @fs_supers that the superblock
+ * is dying. Every walker of @fs_supers outside of sget{_fc}() will now
+ * discard this superblock and treat it as dead.
+ *
+ * We leave the superblock on @fs_supers so it can be found by
+ * sget{_fc}() until we passed sb->kill_sb().
+ */
+ super_wake(sb, SB_DYING);
+ super_unlock_excl(sb);
if (sb->s_bdi != &noop_backing_dev_info) {
if (sb->s_iflags & SB_I_PERSB_BDI)
bdi_unregister(sb->s_bdi);
@@ -546,17 +746,31 @@ bool mount_capable(struct fs_context *fc)
* @test: Comparison callback
* @set: Setup callback
*
- * Find or create a superblock using the parameters stored in the filesystem
- * context and the two callback functions.
+ * Create a new superblock or find an existing one.
+ *
+ * The @test callback is used to find a matching existing superblock.
+ * Whether or not the requested parameters in @fc are taken into account
+ * is specific to the @test callback that is used. They may even be
+ * completely ignored.
+ *
+ * If an extant superblock is matched, it will be returned unless:
+ *
+ * (1) the namespace the filesystem context @fc and the extant
+ * superblock's namespace differ
+ *
+ * (2) the filesystem context @fc has requested that reusing an extant
+ * superblock is not allowed
*
- * If an extant superblock is matched, then that will be returned with an
- * elevated reference count that the caller must transfer or discard.
+ * In both cases EBUSY will be returned.
*
* If no match is made, a new superblock will be allocated and basic
- * initialisation will be performed (s_type, s_fs_info and s_id will be set and
- * the set() callback will be invoked), the superblock will be published and it
- * will be returned in a partially constructed state with SB_BORN and SB_ACTIVE
- * as yet unset.
+ * initialisation will be performed (s_type, s_fs_info and s_id will be
+ * set and the @set callback will be invoked), the superblock will be
+ * published and it will be returned in a partially constructed state
+ * with SB_BORN and SB_ACTIVE as yet unset.
+ *
+ * Return: On success, an extant or newly created superblock is
+ * returned. On failure an error pointer is returned.
*/
struct super_block *sget_fc(struct fs_context *fc,
int (*test)(struct super_block *, struct fs_context *),
@@ -595,6 +809,11 @@ retry:
s->s_type = fc->fs_type;
s->s_iflags |= fc->s_iflags;
strscpy(s->s_id, s->s_type->name, sizeof(s->s_id));
+ /*
+ * Make the superblock visible on @super_blocks and @fs_supers.
+ * It's in a nascent state and users should wait on SB_BORN or
+ * SB_DYING to be set.
+ */
list_add_tail(&s->s_list, &super_blocks);
hlist_add_head(&s->s_instances, &s->s_type->fs_supers);
spin_unlock(&sb_lock);
@@ -603,12 +822,16 @@ retry:
return s;
share_extant_sb:
- if (user_ns != old->s_user_ns) {
+ if (user_ns != old->s_user_ns || fc->exclusive) {
spin_unlock(&sb_lock);
destroy_unused_super(s);
+ if (fc->exclusive)
+ warnfc(fc, "reusing existing filesystem not allowed");
+ else
+ warnfc(fc, "reusing existing filesystem in another namespace not allowed");
return ERR_PTR(-EBUSY);
}
- if (!grab_super(old))
+ if (!grab_super_dead(old))
goto retry;
destroy_unused_super(s);
return old;
@@ -652,7 +875,7 @@ retry:
destroy_unused_super(s);
return ERR_PTR(-EBUSY);
}
- if (!grab_super(old))
+ if (!grab_super_dead(old))
goto retry;
destroy_unused_super(s);
return old;
@@ -685,7 +908,7 @@ EXPORT_SYMBOL(sget);
void drop_super(struct super_block *sb)
{
- up_read(&sb->s_umount);
+ super_unlock_shared(sb);
put_super(sb);
}
@@ -693,7 +916,7 @@ EXPORT_SYMBOL(drop_super);
void drop_super_exclusive(struct super_block *sb)
{
- up_write(&sb->s_umount);
+ super_unlock_excl(sb);
put_super(sb);
}
EXPORT_SYMBOL(drop_super_exclusive);
@@ -704,7 +927,8 @@ static void __iterate_supers(void (*f)(struct super_block *))
spin_lock(&sb_lock);
list_for_each_entry(sb, &super_blocks, s_list) {
- if (hlist_unhashed(&sb->s_instances))
+ /* Pairs with memory marrier in super_wake(). */
+ if (smp_load_acquire(&sb->s_flags) & SB_DYING)
continue;
sb->s_count++;
spin_unlock(&sb_lock);
@@ -734,15 +958,15 @@ void iterate_supers(void (*f)(struct super_block *, void *), void *arg)
spin_lock(&sb_lock);
list_for_each_entry(sb, &super_blocks, s_list) {
- if (hlist_unhashed(&sb->s_instances))
- continue;
+ bool born;
+
sb->s_count++;
spin_unlock(&sb_lock);
- down_read(&sb->s_umount);
- if (sb->s_root && (sb->s_flags & SB_BORN))
+ born = super_lock_shared(sb);
+ if (born && sb->s_root)
f(sb, arg);
- up_read(&sb->s_umount);
+ super_unlock_shared(sb);
spin_lock(&sb_lock);
if (p)
@@ -770,13 +994,15 @@ void iterate_supers_type(struct file_system_type *type,
spin_lock(&sb_lock);
hlist_for_each_entry(sb, &type->fs_supers, s_instances) {
+ bool born;
+
sb->s_count++;
spin_unlock(&sb_lock);
- down_read(&sb->s_umount);
- if (sb->s_root && (sb->s_flags & SB_BORN))
+ born = super_lock_shared(sb);
+ if (born && sb->s_root)
f(sb, arg);
- up_read(&sb->s_umount);
+ super_unlock_shared(sb);
spin_lock(&sb_lock);
if (p)
@@ -791,43 +1017,6 @@ void iterate_supers_type(struct file_system_type *type,
EXPORT_SYMBOL(iterate_supers_type);
/**
- * get_super - get the superblock of a device
- * @bdev: device to get the superblock for
- *
- * Scans the superblock list and finds the superblock of the file system
- * mounted on the device given. %NULL is returned if no match is found.
- */
-struct super_block *get_super(struct block_device *bdev)
-{
- struct super_block *sb;
-
- if (!bdev)
- return NULL;
-
- spin_lock(&sb_lock);
-rescan:
- list_for_each_entry(sb, &super_blocks, s_list) {
- if (hlist_unhashed(&sb->s_instances))
- continue;
- if (sb->s_bdev == bdev) {
- sb->s_count++;
- spin_unlock(&sb_lock);
- down_read(&sb->s_umount);
- /* still alive? */
- if (sb->s_root && (sb->s_flags & SB_BORN))
- return sb;
- up_read(&sb->s_umount);
- /* nope, got unmounted */
- spin_lock(&sb_lock);
- __put_super(sb);
- goto rescan;
- }
- }
- spin_unlock(&sb_lock);
- return NULL;
-}
-
-/**
* get_active_super - get an active reference to the superblock of a device
* @bdev: device to get the superblock for
*
@@ -842,15 +1031,12 @@ struct super_block *get_active_super(struct block_device *bdev)
if (!bdev)
return NULL;
-restart:
spin_lock(&sb_lock);
list_for_each_entry(sb, &super_blocks, s_list) {
- if (hlist_unhashed(&sb->s_instances))
- continue;
if (sb->s_bdev == bdev) {
if (!grab_super(sb))
- goto restart;
- up_write(&sb->s_umount);
+ return NULL;
+ super_unlock_excl(sb);
return sb;
}
}
@@ -863,28 +1049,21 @@ struct super_block *user_get_super(dev_t dev, bool excl)
struct super_block *sb;
spin_lock(&sb_lock);
-rescan:
list_for_each_entry(sb, &super_blocks, s_list) {
- if (hlist_unhashed(&sb->s_instances))
- continue;
if (sb->s_dev == dev) {
+ bool born;
+
sb->s_count++;
spin_unlock(&sb_lock);
- if (excl)
- down_write(&sb->s_umount);
- else
- down_read(&sb->s_umount);
/* still alive? */
- if (sb->s_root && (sb->s_flags & SB_BORN))
+ born = super_lock(sb, excl);
+ if (born && sb->s_root)
return sb;
- if (excl)
- up_write(&sb->s_umount);
- else
- up_read(&sb->s_umount);
+ super_unlock(sb, excl);
/* nope, got unmounted */
spin_lock(&sb_lock);
__put_super(sb);
- goto rescan;
+ break;
}
}
spin_unlock(&sb_lock);
@@ -926,9 +1105,9 @@ int reconfigure_super(struct fs_context *fc)
if (remount_ro) {
if (!hlist_empty(&sb->s_pins)) {
- up_write(&sb->s_umount);
+ super_unlock_excl(sb);
group_pin_kill(&sb->s_pins);
- down_write(&sb->s_umount);
+ __super_lock_excl(sb);
if (!sb->s_root)
return 0;
if (sb->s_writers.frozen != SB_UNFROZEN)
@@ -991,9 +1170,9 @@ cancel_readonly:
static void do_emergency_remount_callback(struct super_block *sb)
{
- down_write(&sb->s_umount);
- if (sb->s_root && sb->s_bdev && (sb->s_flags & SB_BORN) &&
- !sb_rdonly(sb)) {
+ bool born = super_lock_excl(sb);
+
+ if (born && sb->s_root && sb->s_bdev && !sb_rdonly(sb)) {
struct fs_context *fc;
fc = fs_context_for_reconfigure(sb->s_root,
@@ -1004,7 +1183,7 @@ static void do_emergency_remount_callback(struct super_block *sb)
put_fs_context(fc);
}
}
- up_write(&sb->s_umount);
+ super_unlock_excl(sb);
}
static void do_emergency_remount(struct work_struct *work)
@@ -1027,12 +1206,15 @@ void emergency_remount(void)
static void do_thaw_all_callback(struct super_block *sb)
{
- down_write(&sb->s_umount);
- if (sb->s_root && sb->s_flags & SB_BORN) {
- emergency_thaw_bdev(sb);
- thaw_super_locked(sb);
+ bool born = super_lock_excl(sb);
+
+ if (born && sb->s_root) {
+ if (IS_ENABLED(CONFIG_BLOCK))
+ while (sb->s_bdev && !thaw_bdev(sb->s_bdev))
+ pr_warn("Emergency Thaw on %pg\n", sb->s_bdev);
+ thaw_super_locked(sb, FREEZE_HOLDER_USERSPACE);
} else {
- up_write(&sb->s_umount);
+ super_unlock_excl(sb);
}
}
@@ -1108,6 +1290,7 @@ void kill_anon_super(struct super_block *sb)
{
dev_t dev = sb->s_dev;
generic_shutdown_super(sb);
+ kill_super_notify(sb);
free_anon_bdev(dev);
}
EXPORT_SYMBOL(kill_anon_super);
@@ -1136,7 +1319,7 @@ static int test_single_super(struct super_block *s, struct fs_context *fc)
return 1;
}
-static int vfs_get_super(struct fs_context *fc, bool reconf,
+static int vfs_get_super(struct fs_context *fc,
int (*test)(struct super_block *, struct fs_context *),
int (*fill_super)(struct super_block *sb,
struct fs_context *fc))
@@ -1154,19 +1337,9 @@ static int vfs_get_super(struct fs_context *fc, bool reconf,
goto error;
sb->s_flags |= SB_ACTIVE;
- fc->root = dget(sb->s_root);
- } else {
- fc->root = dget(sb->s_root);
- if (reconf) {
- err = reconfigure_super(fc);
- if (err < 0) {
- dput(fc->root);
- fc->root = NULL;
- goto error;
- }
- }
}
+ fc->root = dget(sb->s_root);
return 0;
error:
@@ -1178,7 +1351,7 @@ int get_tree_nodev(struct fs_context *fc,
int (*fill_super)(struct super_block *sb,
struct fs_context *fc))
{
- return vfs_get_super(fc, false, NULL, fill_super);
+ return vfs_get_super(fc, NULL, fill_super);
}
EXPORT_SYMBOL(get_tree_nodev);
@@ -1186,66 +1359,175 @@ int get_tree_single(struct fs_context *fc,
int (*fill_super)(struct super_block *sb,
struct fs_context *fc))
{
- return vfs_get_super(fc, false, test_single_super, fill_super);
+ return vfs_get_super(fc, test_single_super, fill_super);
}
EXPORT_SYMBOL(get_tree_single);
-int get_tree_single_reconf(struct fs_context *fc,
- int (*fill_super)(struct super_block *sb,
- struct fs_context *fc))
-{
- return vfs_get_super(fc, true, test_single_super, fill_super);
-}
-EXPORT_SYMBOL(get_tree_single_reconf);
-
int get_tree_keyed(struct fs_context *fc,
int (*fill_super)(struct super_block *sb,
struct fs_context *fc),
void *key)
{
fc->s_fs_info = key;
- return vfs_get_super(fc, false, test_keyed_super, fill_super);
+ return vfs_get_super(fc, test_keyed_super, fill_super);
}
EXPORT_SYMBOL(get_tree_keyed);
+static int set_bdev_super(struct super_block *s, void *data)
+{
+ s->s_dev = *(dev_t *)data;
+ return 0;
+}
+
+static int super_s_dev_set(struct super_block *s, struct fs_context *fc)
+{
+ return set_bdev_super(s, fc->sget_key);
+}
+
+static int super_s_dev_test(struct super_block *s, struct fs_context *fc)
+{
+ return !(s->s_iflags & SB_I_RETIRED) &&
+ s->s_dev == *(dev_t *)fc->sget_key;
+}
+
+/**
+ * sget_dev - Find or create a superblock by device number
+ * @fc: Filesystem context.
+ * @dev: device number
+ *
+ * Find or create a superblock using the provided device number that
+ * will be stored in fc->sget_key.
+ *
+ * If an extant superblock is matched, then that will be returned with
+ * an elevated reference count that the caller must transfer or discard.
+ *
+ * If no match is made, a new superblock will be allocated and basic
+ * initialisation will be performed (s_type, s_fs_info, s_id, s_dev will
+ * be set). The superblock will be published and it will be returned in
+ * a partially constructed state with SB_BORN and SB_ACTIVE as yet
+ * unset.
+ *
+ * Return: an existing or newly created superblock on success, an error
+ * pointer on failure.
+ */
+struct super_block *sget_dev(struct fs_context *fc, dev_t dev)
+{
+ fc->sget_key = &dev;
+ return sget_fc(fc, super_s_dev_test, super_s_dev_set);
+}
+EXPORT_SYMBOL(sget_dev);
+
#ifdef CONFIG_BLOCK
-static void fs_mark_dead(struct block_device *bdev)
+/*
+ * Lock a super block that the callers holds a reference to.
+ *
+ * The caller needs to ensure that the super_block isn't being freed while
+ * calling this function, e.g. by holding a lock over the call to this function
+ * and the place that clears the pointer to the superblock used by this function
+ * before freeing the superblock.
+ */
+static bool super_lock_shared_active(struct super_block *sb)
{
- struct super_block *sb;
+ bool born = super_lock_shared(sb);
- sb = get_super(bdev);
- if (!sb)
+ if (!born || !sb->s_root || !(sb->s_flags & SB_ACTIVE)) {
+ super_unlock_shared(sb);
+ return false;
+ }
+ return true;
+}
+
+static void fs_bdev_mark_dead(struct block_device *bdev, bool surprise)
+{
+ struct super_block *sb = bdev->bd_holder;
+
+ /* bd_holder_lock ensures that the sb isn't freed */
+ lockdep_assert_held(&bdev->bd_holder_lock);
+
+ if (!super_lock_shared_active(sb))
return;
+ if (!surprise)
+ sync_filesystem(sb);
+ shrink_dcache_sb(sb);
+ invalidate_inodes(sb);
if (sb->s_op->shutdown)
sb->s_op->shutdown(sb);
- drop_super(sb);
-}
-static const struct blk_holder_ops fs_holder_ops = {
- .mark_dead = fs_mark_dead,
-};
+ super_unlock_shared(sb);
+}
-static int set_bdev_super(struct super_block *s, void *data)
+static void fs_bdev_sync(struct block_device *bdev)
{
- s->s_bdev = data;
- s->s_dev = s->s_bdev->bd_dev;
- s->s_bdi = bdi_get(s->s_bdev->bd_disk->bdi);
+ struct super_block *sb = bdev->bd_holder;
- if (bdev_stable_writes(s->s_bdev))
- s->s_iflags |= SB_I_STABLE_WRITES;
- return 0;
-}
+ lockdep_assert_held(&bdev->bd_holder_lock);
-static int set_bdev_super_fc(struct super_block *s, struct fs_context *fc)
-{
- return set_bdev_super(s, fc->sget_key);
+ if (!super_lock_shared_active(sb))
+ return;
+ sync_filesystem(sb);
+ super_unlock_shared(sb);
}
-static int test_bdev_super_fc(struct super_block *s, struct fs_context *fc)
+const struct blk_holder_ops fs_holder_ops = {
+ .mark_dead = fs_bdev_mark_dead,
+ .sync = fs_bdev_sync,
+};
+EXPORT_SYMBOL_GPL(fs_holder_ops);
+
+int setup_bdev_super(struct super_block *sb, int sb_flags,
+ struct fs_context *fc)
{
- return !(s->s_iflags & SB_I_RETIRED) && s->s_bdev == fc->sget_key;
+ blk_mode_t mode = sb_open_mode(sb_flags);
+ struct block_device *bdev;
+
+ bdev = blkdev_get_by_dev(sb->s_dev, mode, sb, &fs_holder_ops);
+ if (IS_ERR(bdev)) {
+ if (fc)
+ errorf(fc, "%s: Can't open blockdev", fc->source);
+ return PTR_ERR(bdev);
+ }
+
+ /*
+ * This really should be in blkdev_get_by_dev, but right now can't due
+ * to legacy issues that require us to allow opening a block device node
+ * writable from userspace even for a read-only block device.
+ */
+ if ((mode & BLK_OPEN_WRITE) && bdev_read_only(bdev)) {
+ blkdev_put(bdev, sb);
+ return -EACCES;
+ }
+
+ /*
+ * Until SB_BORN flag is set, there can be no active superblock
+ * references and thus no filesystem freezing. get_active_super() will
+ * just loop waiting for SB_BORN so even freeze_bdev() cannot proceed.
+ *
+ * It is enough to check bdev was not frozen before we set s_bdev.
+ */
+ mutex_lock(&bdev->bd_fsfreeze_mutex);
+ if (bdev->bd_fsfreeze_count > 0) {
+ mutex_unlock(&bdev->bd_fsfreeze_mutex);
+ if (fc)
+ warnf(fc, "%pg: Can't mount, blockdev is frozen", bdev);
+ blkdev_put(bdev, sb);
+ return -EBUSY;
+ }
+ spin_lock(&sb_lock);
+ sb->s_bdev = bdev;
+ sb->s_bdi = bdi_get(bdev->bd_disk->bdi);
+ if (bdev_stable_writes(bdev))
+ sb->s_iflags |= SB_I_STABLE_WRITES;
+ spin_unlock(&sb_lock);
+ mutex_unlock(&bdev->bd_fsfreeze_mutex);
+
+ snprintf(sb->s_id, sizeof(sb->s_id), "%pg", bdev);
+ shrinker_debugfs_rename(&sb->s_shrink, "sb-%s:%s", sb->s_type->name,
+ sb->s_id);
+ sb_set_blocksize(sb, block_size(bdev));
+ return 0;
}
+EXPORT_SYMBOL_GPL(setup_bdev_super);
/**
* get_tree_bdev - Get a superblock based on a single block device
@@ -1256,73 +1538,48 @@ int get_tree_bdev(struct fs_context *fc,
int (*fill_super)(struct super_block *,
struct fs_context *))
{
- struct block_device *bdev;
struct super_block *s;
int error = 0;
+ dev_t dev;
if (!fc->source)
return invalf(fc, "No source specified");
- bdev = blkdev_get_by_path(fc->source, sb_open_mode(fc->sb_flags),
- fc->fs_type, &fs_holder_ops);
- if (IS_ERR(bdev)) {
- errorf(fc, "%s: Can't open blockdev", fc->source);
- return PTR_ERR(bdev);
- }
-
- /* Once the superblock is inserted into the list by sget_fc(), s_umount
- * will protect the lockfs code from trying to start a snapshot while
- * we are mounting
- */
- mutex_lock(&bdev->bd_fsfreeze_mutex);
- if (bdev->bd_fsfreeze_count > 0) {
- mutex_unlock(&bdev->bd_fsfreeze_mutex);
- warnf(fc, "%pg: Can't mount, blockdev is frozen", bdev);
- blkdev_put(bdev, fc->fs_type);
- return -EBUSY;
+ error = lookup_bdev(fc->source, &dev);
+ if (error) {
+ errorf(fc, "%s: Can't lookup blockdev", fc->source);
+ return error;
}
fc->sb_flags |= SB_NOSEC;
- fc->sget_key = bdev;
- s = sget_fc(fc, test_bdev_super_fc, set_bdev_super_fc);
- mutex_unlock(&bdev->bd_fsfreeze_mutex);
- if (IS_ERR(s)) {
- blkdev_put(bdev, fc->fs_type);
+ s = sget_dev(fc, dev);
+ if (IS_ERR(s))
return PTR_ERR(s);
- }
if (s->s_root) {
/* Don't summarily change the RO/RW state. */
if ((fc->sb_flags ^ s->s_flags) & SB_RDONLY) {
- warnf(fc, "%pg: Can't mount, would change RO state", bdev);
+ warnf(fc, "%pg: Can't mount, would change RO state", s->s_bdev);
deactivate_locked_super(s);
- blkdev_put(bdev, fc->fs_type);
return -EBUSY;
}
-
+ } else {
/*
- * s_umount nests inside open_mutex during
- * __invalidate_device(). blkdev_put() acquires
- * open_mutex and can't be called under s_umount. Drop
- * s_umount temporarily. This is safe as we're
- * holding an active reference.
+ * We drop s_umount here because we need to open the bdev and
+ * bdev->open_mutex ranks above s_umount (blkdev_put() ->
+ * bdev_mark_dead()). It is safe because we have active sb
+ * reference and SB_BORN is not set yet.
*/
- up_write(&s->s_umount);
- blkdev_put(bdev, fc->fs_type);
- down_write(&s->s_umount);
- } else {
- snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev);
- shrinker_debugfs_rename(&s->s_shrink, "sb-%s:%s",
- fc->fs_type->name, s->s_id);
- sb_set_blocksize(s, block_size(bdev));
- error = fill_super(s, fc);
+ super_unlock_excl(s);
+ error = setup_bdev_super(s, fc->sb_flags, fc);
+ __super_lock_excl(s);
+ if (!error)
+ error = fill_super(s, fc);
if (error) {
deactivate_locked_super(s);
return error;
}
-
s->s_flags |= SB_ACTIVE;
- bdev->bd_super = s;
}
BUG_ON(fc->root);
@@ -1333,79 +1590,52 @@ EXPORT_SYMBOL(get_tree_bdev);
static int test_bdev_super(struct super_block *s, void *data)
{
- return !(s->s_iflags & SB_I_RETIRED) && (void *)s->s_bdev == data;
+ return !(s->s_iflags & SB_I_RETIRED) && s->s_dev == *(dev_t *)data;
}
struct dentry *mount_bdev(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data,
int (*fill_super)(struct super_block *, void *, int))
{
- struct block_device *bdev;
struct super_block *s;
- int error = 0;
+ int error;
+ dev_t dev;
- bdev = blkdev_get_by_path(dev_name, sb_open_mode(flags), fs_type,
- &fs_holder_ops);
- if (IS_ERR(bdev))
- return ERR_CAST(bdev);
+ error = lookup_bdev(dev_name, &dev);
+ if (error)
+ return ERR_PTR(error);
- /*
- * once the super is inserted into the list by sget, s_umount
- * will protect the lockfs code from trying to start a snapshot
- * while we are mounting
- */
- mutex_lock(&bdev->bd_fsfreeze_mutex);
- if (bdev->bd_fsfreeze_count > 0) {
- mutex_unlock(&bdev->bd_fsfreeze_mutex);
- error = -EBUSY;
- goto error_bdev;
- }
- s = sget(fs_type, test_bdev_super, set_bdev_super, flags | SB_NOSEC,
- bdev);
- mutex_unlock(&bdev->bd_fsfreeze_mutex);
+ flags |= SB_NOSEC;
+ s = sget(fs_type, test_bdev_super, set_bdev_super, flags, &dev);
if (IS_ERR(s))
- goto error_s;
+ return ERR_CAST(s);
if (s->s_root) {
if ((flags ^ s->s_flags) & SB_RDONLY) {
deactivate_locked_super(s);
- error = -EBUSY;
- goto error_bdev;
+ return ERR_PTR(-EBUSY);
}
-
+ } else {
/*
- * s_umount nests inside open_mutex during
- * __invalidate_device(). blkdev_put() acquires
- * open_mutex and can't be called under s_umount. Drop
- * s_umount temporarily. This is safe as we're
- * holding an active reference.
+ * We drop s_umount here because we need to open the bdev and
+ * bdev->open_mutex ranks above s_umount (blkdev_put() ->
+ * bdev_mark_dead()). It is safe because we have active sb
+ * reference and SB_BORN is not set yet.
*/
- up_write(&s->s_umount);
- blkdev_put(bdev, fs_type);
- down_write(&s->s_umount);
- } else {
- snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev);
- shrinker_debugfs_rename(&s->s_shrink, "sb-%s:%s",
- fs_type->name, s->s_id);
- sb_set_blocksize(s, block_size(bdev));
- error = fill_super(s, data, flags & SB_SILENT ? 1 : 0);
+ super_unlock_excl(s);
+ error = setup_bdev_super(s, flags, NULL);
+ __super_lock_excl(s);
+ if (!error)
+ error = fill_super(s, data, flags & SB_SILENT ? 1 : 0);
if (error) {
deactivate_locked_super(s);
- goto error;
+ return ERR_PTR(error);
}
s->s_flags |= SB_ACTIVE;
- bdev->bd_super = s;
}
return dget(s->s_root);
-
-error_s:
- error = PTR_ERR(s);
-error_bdev:
- blkdev_put(bdev, fs_type);
-error:
- return ERR_PTR(error);
}
EXPORT_SYMBOL(mount_bdev);
@@ -1413,10 +1643,11 @@ void kill_block_super(struct super_block *sb)
{
struct block_device *bdev = sb->s_bdev;
- bdev->bd_super = NULL;
generic_shutdown_super(sb);
- sync_blockdev(bdev);
- blkdev_put(bdev, sb->s_type);
+ if (bdev) {
+ sync_blockdev(bdev);
+ blkdev_put(bdev, sb);
+ }
}
EXPORT_SYMBOL(kill_block_super);
@@ -1533,13 +1764,13 @@ int vfs_get_tree(struct fs_context *fc)
WARN_ON(!sb->s_bdi);
/*
- * Write barrier is for super_cache_count(). We place it before setting
- * SB_BORN as the data dependency between the two functions is the
- * superblock structure contents that we just set up, not the SB_BORN
- * flag.
+ * super_wake() contains a memory barrier which also care of
+ * ordering for super_cache_count(). We place it before setting
+ * SB_BORN as the data dependency between the two functions is
+ * the superblock structure contents that we just set up, not
+ * the SB_BORN flag.
*/
- smp_wmb();
- sb->s_flags |= SB_BORN;
+ super_wake(sb, SB_BORN);
error = security_sb_set_mnt_opts(sb, fc->security, 0, NULL);
if (unlikely(error)) {
@@ -1644,14 +1875,43 @@ static void sb_freeze_unlock(struct super_block *sb, int level)
percpu_up_write(sb->s_writers.rw_sem + level);
}
+static int wait_for_partially_frozen(struct super_block *sb)
+{
+ int ret = 0;
+
+ do {
+ unsigned short old = sb->s_writers.frozen;
+
+ up_write(&sb->s_umount);
+ ret = wait_var_event_killable(&sb->s_writers.frozen,
+ sb->s_writers.frozen != old);
+ down_write(&sb->s_umount);
+ } while (ret == 0 &&
+ sb->s_writers.frozen != SB_UNFROZEN &&
+ sb->s_writers.frozen != SB_FREEZE_COMPLETE);
+
+ return ret;
+}
+
/**
* freeze_super - lock the filesystem and force it into a consistent state
* @sb: the super to lock
+ * @who: context that wants to freeze
*
* Syncs the super to make sure the filesystem is consistent and calls the fs's
- * freeze_fs. Subsequent calls to this without first thawing the fs will return
+ * freeze_fs. Subsequent calls to this without first thawing the fs may return
* -EBUSY.
*
+ * @who should be:
+ * * %FREEZE_HOLDER_USERSPACE if userspace wants to freeze the fs;
+ * * %FREEZE_HOLDER_KERNEL if the kernel wants to freeze the fs.
+ *
+ * The @who argument distinguishes between the kernel and userspace trying to
+ * freeze the filesystem. Although there cannot be multiple kernel freezes or
+ * multiple userspace freezes in effect at any given time, the kernel and
+ * userspace can both hold a filesystem frozen. The filesystem remains frozen
+ * until there are no kernel or userspace freezes in effect.
+ *
* During this function, sb->s_writers.frozen goes through these values:
*
* SB_UNFROZEN: File system is normal, all writes progress as usual.
@@ -1677,34 +1937,62 @@ static void sb_freeze_unlock(struct super_block *sb, int level)
*
* sb->s_writers.frozen is protected by sb->s_umount.
*/
-int freeze_super(struct super_block *sb)
+int freeze_super(struct super_block *sb, enum freeze_holder who)
{
int ret;
atomic_inc(&sb->s_active);
- down_write(&sb->s_umount);
+ if (!super_lock_excl(sb))
+ WARN(1, "Dying superblock while freezing!");
+
+retry:
+ if (sb->s_writers.frozen == SB_FREEZE_COMPLETE) {
+ if (sb->s_writers.freeze_holders & who) {
+ deactivate_locked_super(sb);
+ return -EBUSY;
+ }
+
+ WARN_ON(sb->s_writers.freeze_holders == 0);
+
+ /*
+ * Someone else already holds this type of freeze; share the
+ * freeze and assign the active ref to the freeze.
+ */
+ sb->s_writers.freeze_holders |= who;
+ super_unlock_excl(sb);
+ return 0;
+ }
+
if (sb->s_writers.frozen != SB_UNFROZEN) {
- deactivate_locked_super(sb);
- return -EBUSY;
+ ret = wait_for_partially_frozen(sb);
+ if (ret) {
+ deactivate_locked_super(sb);
+ return ret;
+ }
+
+ goto retry;
}
if (!(sb->s_flags & SB_BORN)) {
- up_write(&sb->s_umount);
+ super_unlock_excl(sb);
return 0; /* sic - it's "nothing to do" */
}
if (sb_rdonly(sb)) {
/* Nothing to do really... */
+ sb->s_writers.freeze_holders |= who;
sb->s_writers.frozen = SB_FREEZE_COMPLETE;
- up_write(&sb->s_umount);
+ wake_up_var(&sb->s_writers.frozen);
+ super_unlock_excl(sb);
return 0;
}
sb->s_writers.frozen = SB_FREEZE_WRITE;
/* Release s_umount to preserve sb_start_write -> s_umount ordering */
- up_write(&sb->s_umount);
+ super_unlock_excl(sb);
sb_wait_write(sb, SB_FREEZE_WRITE);
- down_write(&sb->s_umount);
+ if (!super_lock_excl(sb))
+ WARN(1, "Dying superblock while freezing!");
/* Now we go and block page faults... */
sb->s_writers.frozen = SB_FREEZE_PAGEFAULT;
@@ -1715,6 +2003,7 @@ int freeze_super(struct super_block *sb)
if (ret) {
sb->s_writers.frozen = SB_UNFROZEN;
sb_freeze_unlock(sb, SB_FREEZE_PAGEFAULT);
+ wake_up_var(&sb->s_writers.frozen);
deactivate_locked_super(sb);
return ret;
}
@@ -1730,6 +2019,7 @@ int freeze_super(struct super_block *sb)
"VFS:Filesystem freeze failed\n");
sb->s_writers.frozen = SB_UNFROZEN;
sb_freeze_unlock(sb, SB_FREEZE_FS);
+ wake_up_var(&sb->s_writers.frozen);
deactivate_locked_super(sb);
return ret;
}
@@ -1738,24 +2028,50 @@ int freeze_super(struct super_block *sb)
* For debugging purposes so that fs can warn if it sees write activity
* when frozen is set to SB_FREEZE_COMPLETE, and for thaw_super().
*/
+ sb->s_writers.freeze_holders |= who;
sb->s_writers.frozen = SB_FREEZE_COMPLETE;
+ wake_up_var(&sb->s_writers.frozen);
lockdep_sb_freeze_release(sb);
- up_write(&sb->s_umount);
+ super_unlock_excl(sb);
return 0;
}
EXPORT_SYMBOL(freeze_super);
-static int thaw_super_locked(struct super_block *sb)
+/*
+ * Undoes the effect of a freeze_super_locked call. If the filesystem is
+ * frozen both by userspace and the kernel, a thaw call from either source
+ * removes that state without releasing the other state or unlocking the
+ * filesystem.
+ */
+static int thaw_super_locked(struct super_block *sb, enum freeze_holder who)
{
int error;
- if (sb->s_writers.frozen != SB_FREEZE_COMPLETE) {
- up_write(&sb->s_umount);
+ if (sb->s_writers.frozen == SB_FREEZE_COMPLETE) {
+ if (!(sb->s_writers.freeze_holders & who)) {
+ super_unlock_excl(sb);
+ return -EINVAL;
+ }
+
+ /*
+ * Freeze is shared with someone else. Release our hold and
+ * drop the active ref that freeze_super assigned to the
+ * freezer.
+ */
+ if (sb->s_writers.freeze_holders & ~who) {
+ sb->s_writers.freeze_holders &= ~who;
+ deactivate_locked_super(sb);
+ return 0;
+ }
+ } else {
+ super_unlock_excl(sb);
return -EINVAL;
}
if (sb_rdonly(sb)) {
+ sb->s_writers.freeze_holders &= ~who;
sb->s_writers.frozen = SB_UNFROZEN;
+ wake_up_var(&sb->s_writers.frozen);
goto out;
}
@@ -1764,15 +2080,16 @@ static int thaw_super_locked(struct super_block *sb)
if (sb->s_op->unfreeze_fs) {
error = sb->s_op->unfreeze_fs(sb);
if (error) {
- printk(KERN_ERR
- "VFS:Filesystem thaw failed\n");
+ printk(KERN_ERR "VFS:Filesystem thaw failed\n");
lockdep_sb_freeze_release(sb);
- up_write(&sb->s_umount);
+ super_unlock_excl(sb);
return error;
}
}
+ sb->s_writers.freeze_holders &= ~who;
sb->s_writers.frozen = SB_UNFROZEN;
+ wake_up_var(&sb->s_writers.frozen);
sb_freeze_unlock(sb, SB_FREEZE_FS);
out:
deactivate_locked_super(sb);
@@ -1782,13 +2099,20 @@ out:
/**
* thaw_super -- unlock filesystem
* @sb: the super to thaw
+ * @who: context that wants to freeze
+ *
+ * Unlocks the filesystem and marks it writeable again after freeze_super()
+ * if there are no remaining freezes on the filesystem.
*
- * Unlocks the filesystem and marks it writeable again after freeze_super().
+ * @who should be:
+ * * %FREEZE_HOLDER_USERSPACE if userspace wants to thaw the fs;
+ * * %FREEZE_HOLDER_KERNEL if the kernel wants to thaw the fs.
*/
-int thaw_super(struct super_block *sb)
+int thaw_super(struct super_block *sb, enum freeze_holder who)
{
- down_write(&sb->s_umount);
- return thaw_super_locked(sb);
+ if (!super_lock_excl(sb))
+ WARN(1, "Dying superblock while thawing!");
+ return thaw_super_locked(sb, who);
}
EXPORT_SYMBOL(thaw_super);
diff --git a/fs/sysv/Kconfig b/fs/sysv/Kconfig
index b4e23e03fbeb..67b3f90afbfd 100644
--- a/fs/sysv/Kconfig
+++ b/fs/sysv/Kconfig
@@ -2,6 +2,7 @@
config SYSV_FS
tristate "System V/Xenix/V7/Coherent file system support"
depends on BLOCK
+ select BUFFER_HEAD
help
SCO, Xenix and Coherent are commercial Unix systems for Intel
machines, and Version 7 was used on the DEC PDP-11. Saying Y
diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c
index 0140010aa0c3..2f5ead88d00b 100644
--- a/fs/sysv/dir.c
+++ b/fs/sysv/dir.c
@@ -224,7 +224,7 @@ got_it:
memset (de->name + namelen, 0, SYSV_DIRSIZE - namelen - 2);
de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino);
dir_commit_chunk(page, pos, SYSV_DIRSIZE);
- dir->i_mtime = dir->i_ctime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
mark_inode_dirty(dir);
err = sysv_handle_dirsync(dir);
out_page:
@@ -249,7 +249,7 @@ int sysv_delete_entry(struct sysv_dir_entry *de, struct page *page)
}
de->inode = 0;
dir_commit_chunk(page, pos, SYSV_DIRSIZE);
- inode->i_ctime = inode->i_mtime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
mark_inode_dirty(inode);
return sysv_handle_dirsync(inode);
}
@@ -346,7 +346,7 @@ int sysv_set_link(struct sysv_dir_entry *de, struct page *page,
}
de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino);
dir_commit_chunk(page, pos, SYSV_DIRSIZE);
- dir->i_mtime = dir->i_ctime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
mark_inode_dirty(dir);
return sysv_handle_dirsync(inode);
}
diff --git a/fs/sysv/ialloc.c b/fs/sysv/ialloc.c
index e732879036ab..6719da5889d9 100644
--- a/fs/sysv/ialloc.c
+++ b/fs/sysv/ialloc.c
@@ -165,7 +165,7 @@ struct inode * sysv_new_inode(const struct inode * dir, umode_t mode)
dirty_sb(sb);
inode_init_owner(&nop_mnt_idmap, inode, dir, mode);
inode->i_ino = fs16_to_cpu(sbi, ino);
- inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
inode->i_blocks = 0;
memset(SYSV_I(inode)->i_data, 0, sizeof(SYSV_I(inode)->i_data));
SYSV_I(inode)->i_dir_start_lookup = 0;
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index 9e8d4a6fb2f3..0aa3827d8178 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -202,8 +202,7 @@ struct inode *sysv_iget(struct super_block *sb, unsigned int ino)
inode->i_size = fs32_to_cpu(sbi, raw_inode->i_size);
inode->i_atime.tv_sec = fs32_to_cpu(sbi, raw_inode->i_atime);
inode->i_mtime.tv_sec = fs32_to_cpu(sbi, raw_inode->i_mtime);
- inode->i_ctime.tv_sec = fs32_to_cpu(sbi, raw_inode->i_ctime);
- inode->i_ctime.tv_nsec = 0;
+ inode_set_ctime(inode, fs32_to_cpu(sbi, raw_inode->i_ctime), 0);
inode->i_atime.tv_nsec = 0;
inode->i_mtime.tv_nsec = 0;
inode->i_blocks = 0;
@@ -256,7 +255,7 @@ static int __sysv_write_inode(struct inode *inode, int wait)
raw_inode->i_size = cpu_to_fs32(sbi, inode->i_size);
raw_inode->i_atime = cpu_to_fs32(sbi, inode->i_atime.tv_sec);
raw_inode->i_mtime = cpu_to_fs32(sbi, inode->i_mtime.tv_sec);
- raw_inode->i_ctime = cpu_to_fs32(sbi, inode->i_ctime.tv_sec);
+ raw_inode->i_ctime = cpu_to_fs32(sbi, inode_get_ctime(inode).tv_sec);
si = SYSV_I(inode);
if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c
index 58d7f43a1371..edb94e55de8e 100644
--- a/fs/sysv/itree.c
+++ b/fs/sysv/itree.c
@@ -183,7 +183,7 @@ static inline int splice_branch(struct inode *inode,
*where->p = where->key;
write_unlock(&pointers_lock);
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
/* had we spliced it onto indirect block? */
if (where->bh)
@@ -423,7 +423,7 @@ do_indirects:
}
n++;
}
- inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
if (IS_SYNC(inode))
sysv_sync_inode (inode);
else
@@ -449,7 +449,8 @@ int sysv_getattr(struct mnt_idmap *idmap, const struct path *path,
struct kstat *stat, u32 request_mask, unsigned int flags)
{
struct super_block *s = path->dentry->d_sb;
- generic_fillattr(&nop_mnt_idmap, d_inode(path->dentry), stat);
+ generic_fillattr(&nop_mnt_idmap, request_mask, d_inode(path->dentry),
+ stat);
stat->blocks = (s->s_blocksize / 512) * sysv_nblocks(s, stat->size);
stat->blksize = s->s_blocksize;
return 0;
diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c
index fcf163fea3ad..d6b73798071b 100644
--- a/fs/sysv/namei.c
+++ b/fs/sysv/namei.c
@@ -103,7 +103,7 @@ static int sysv_link(struct dentry * old_dentry, struct inode * dir,
{
struct inode *inode = d_inode(old_dentry);
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
inode_inc_link_count(inode);
ihold(inode);
@@ -161,7 +161,7 @@ static int sysv_unlink(struct inode * dir, struct dentry * dentry)
err = sysv_delete_entry(de, page);
if (!err) {
- inode->i_ctime = dir->i_ctime;
+ inode_set_ctime_to_ts(inode, inode_get_ctime(dir));
inode_dec_link_count(inode);
}
unmap_and_put_page(page, de);
@@ -230,7 +230,7 @@ static int sysv_rename(struct mnt_idmap *idmap, struct inode *old_dir,
unmap_and_put_page(new_page, new_de);
if (err)
goto out_dir;
- new_inode->i_ctime = current_time(new_inode);
+ inode_set_ctime_current(new_inode);
if (dir_de)
drop_nlink(new_inode);
inode_dec_link_count(new_inode);
diff --git a/fs/tracefs/Makefile b/fs/tracefs/Makefile
index 7c35a282b484..73c56da8e284 100644
--- a/fs/tracefs/Makefile
+++ b/fs/tracefs/Makefile
@@ -1,5 +1,6 @@
# SPDX-License-Identifier: GPL-2.0-only
tracefs-objs := inode.o
+tracefs-objs += event_inode.o
obj-$(CONFIG_TRACING) += tracefs.o
diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c
new file mode 100644
index 000000000000..8c8d64e76103
--- /dev/null
+++ b/fs/tracefs/event_inode.c
@@ -0,0 +1,905 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * event_inode.c - part of tracefs, a pseudo file system for activating tracing
+ *
+ * Copyright (C) 2020-23 VMware Inc, author: Steven Rostedt (VMware) <rostedt@goodmis.org>
+ * Copyright (C) 2020-23 VMware Inc, author: Ajay Kaher <akaher@vmware.com>
+ *
+ * eventfs is used to dynamically create inodes and dentries based on the
+ * meta data provided by the tracing system.
+ *
+ * eventfs stores the meta-data of files/dirs and holds off on creating
+ * inodes/dentries of the files. When accessed, the eventfs will create the
+ * inodes/dentries in a just-in-time (JIT) manner. The eventfs will clean up
+ * and delete the inodes/dentries when they are no longer referenced.
+ */
+#include <linux/fsnotify.h>
+#include <linux/fs.h>
+#include <linux/namei.h>
+#include <linux/workqueue.h>
+#include <linux/security.h>
+#include <linux/tracefs.h>
+#include <linux/kref.h>
+#include <linux/delay.h>
+#include "internal.h"
+
+struct eventfs_inode {
+ struct list_head e_top_files;
+};
+
+/*
+ * struct eventfs_file - hold the properties of the eventfs files and
+ * directories.
+ * @name: the name of the file or directory to create
+ * @d_parent: holds parent's dentry
+ * @dentry: once accessed holds dentry
+ * @list: file or directory to be added to parent directory
+ * @ei: list of files and directories within directory
+ * @fop: file_operations for file or directory
+ * @iop: inode_operations for file or directory
+ * @data: something that the caller will want to get to later on
+ * @mode: the permission that the file or directory should have
+ */
+struct eventfs_file {
+ const char *name;
+ struct dentry *d_parent;
+ struct dentry *dentry;
+ struct list_head list;
+ struct eventfs_inode *ei;
+ const struct file_operations *fop;
+ const struct inode_operations *iop;
+ /*
+ * Union - used for deletion
+ * @del_list: list of eventfs_file to delete
+ * @rcu: eventfs_file to delete in RCU
+ * @is_freed: node is freed if one of the above is set
+ */
+ union {
+ struct list_head del_list;
+ struct rcu_head rcu;
+ unsigned long is_freed;
+ };
+ void *data;
+ umode_t mode;
+};
+
+static DEFINE_MUTEX(eventfs_mutex);
+DEFINE_STATIC_SRCU(eventfs_srcu);
+
+static struct dentry *eventfs_root_lookup(struct inode *dir,
+ struct dentry *dentry,
+ unsigned int flags);
+static int dcache_dir_open_wrapper(struct inode *inode, struct file *file);
+static int dcache_readdir_wrapper(struct file *file, struct dir_context *ctx);
+static int eventfs_release(struct inode *inode, struct file *file);
+
+static const struct inode_operations eventfs_root_dir_inode_operations = {
+ .lookup = eventfs_root_lookup,
+};
+
+static const struct file_operations eventfs_file_operations = {
+ .open = dcache_dir_open_wrapper,
+ .read = generic_read_dir,
+ .iterate_shared = dcache_readdir_wrapper,
+ .llseek = generic_file_llseek,
+ .release = eventfs_release,
+};
+
+/**
+ * create_file - create a file in the tracefs filesystem
+ * @name: the name of the file to create.
+ * @mode: the permission that the file should have.
+ * @parent: parent dentry for this file.
+ * @data: something that the caller will want to get to later on.
+ * @fop: struct file_operations that should be used for this file.
+ *
+ * This is the basic "create a file" function for tracefs. It allows for a
+ * wide range of flexibility in creating a file.
+ *
+ * This function will return a pointer to a dentry if it succeeds. This
+ * pointer must be passed to the tracefs_remove() function when the file is
+ * to be removed (no automatic cleanup happens if your module is unloaded,
+ * you are responsible here.) If an error occurs, %NULL will be returned.
+ *
+ * If tracefs is not enabled in the kernel, the value -%ENODEV will be
+ * returned.
+ */
+static struct dentry *create_file(const char *name, umode_t mode,
+ struct dentry *parent, void *data,
+ const struct file_operations *fop)
+{
+ struct tracefs_inode *ti;
+ struct dentry *dentry;
+ struct inode *inode;
+
+ if (!(mode & S_IFMT))
+ mode |= S_IFREG;
+
+ if (WARN_ON_ONCE(!S_ISREG(mode)))
+ return NULL;
+
+ dentry = eventfs_start_creating(name, parent);
+
+ if (IS_ERR(dentry))
+ return dentry;
+
+ inode = tracefs_get_inode(dentry->d_sb);
+ if (unlikely(!inode))
+ return eventfs_failed_creating(dentry);
+
+ inode->i_mode = mode;
+ inode->i_fop = fop;
+ inode->i_private = data;
+
+ ti = get_tracefs(inode);
+ ti->flags |= TRACEFS_EVENT_INODE;
+ d_instantiate(dentry, inode);
+ fsnotify_create(dentry->d_parent->d_inode, dentry);
+ return eventfs_end_creating(dentry);
+};
+
+/**
+ * create_dir - create a dir in the tracefs filesystem
+ * @name: the name of the file to create.
+ * @parent: parent dentry for this file.
+ * @data: something that the caller will want to get to later on.
+ *
+ * This is the basic "create a dir" function for eventfs. It allows for a
+ * wide range of flexibility in creating a dir.
+ *
+ * This function will return a pointer to a dentry if it succeeds. This
+ * pointer must be passed to the tracefs_remove() function when the file is
+ * to be removed (no automatic cleanup happens if your module is unloaded,
+ * you are responsible here.) If an error occurs, %NULL will be returned.
+ *
+ * If tracefs is not enabled in the kernel, the value -%ENODEV will be
+ * returned.
+ */
+static struct dentry *create_dir(const char *name, struct dentry *parent, void *data)
+{
+ struct tracefs_inode *ti;
+ struct dentry *dentry;
+ struct inode *inode;
+
+ dentry = eventfs_start_creating(name, parent);
+ if (IS_ERR(dentry))
+ return dentry;
+
+ inode = tracefs_get_inode(dentry->d_sb);
+ if (unlikely(!inode))
+ return eventfs_failed_creating(dentry);
+
+ inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
+ inode->i_op = &eventfs_root_dir_inode_operations;
+ inode->i_fop = &eventfs_file_operations;
+ inode->i_private = data;
+
+ ti = get_tracefs(inode);
+ ti->flags |= TRACEFS_EVENT_INODE;
+
+ inc_nlink(inode);
+ d_instantiate(dentry, inode);
+ inc_nlink(dentry->d_parent->d_inode);
+ fsnotify_mkdir(dentry->d_parent->d_inode, dentry);
+ return eventfs_end_creating(dentry);
+}
+
+/**
+ * eventfs_set_ef_status_free - set the ef->status to free
+ * @ti: the tracefs_inode of the dentry
+ * @dentry: dentry who's status to be freed
+ *
+ * eventfs_set_ef_status_free will be called if no more
+ * references remain
+ */
+void eventfs_set_ef_status_free(struct tracefs_inode *ti, struct dentry *dentry)
+{
+ struct tracefs_inode *ti_parent;
+ struct eventfs_inode *ei;
+ struct eventfs_file *ef, *tmp;
+
+ /* The top level events directory may be freed by this */
+ if (unlikely(ti->flags & TRACEFS_EVENT_TOP_INODE)) {
+ LIST_HEAD(ef_del_list);
+
+ mutex_lock(&eventfs_mutex);
+
+ ei = ti->private;
+
+ /* Record all the top level files */
+ list_for_each_entry_srcu(ef, &ei->e_top_files, list,
+ lockdep_is_held(&eventfs_mutex)) {
+ list_add_tail(&ef->del_list, &ef_del_list);
+ }
+
+ /* Nothing should access this, but just in case! */
+ ti->private = NULL;
+
+ mutex_unlock(&eventfs_mutex);
+
+ /* Now safely free the top level files and their children */
+ list_for_each_entry_safe(ef, tmp, &ef_del_list, del_list) {
+ list_del(&ef->del_list);
+ eventfs_remove(ef);
+ }
+
+ kfree(ei);
+ return;
+ }
+
+ mutex_lock(&eventfs_mutex);
+
+ ti_parent = get_tracefs(dentry->d_parent->d_inode);
+ if (!ti_parent || !(ti_parent->flags & TRACEFS_EVENT_INODE))
+ goto out;
+
+ ef = dentry->d_fsdata;
+ if (!ef)
+ goto out;
+
+ /*
+ * If ef was freed, then the LSB bit is set for d_fsdata.
+ * But this should not happen, as it should still have a
+ * ref count that prevents it. Warn in case it does.
+ */
+ if (WARN_ON_ONCE((unsigned long)ef & 1))
+ goto out;
+
+ dentry->d_fsdata = NULL;
+ ef->dentry = NULL;
+out:
+ mutex_unlock(&eventfs_mutex);
+}
+
+/**
+ * eventfs_post_create_dir - post create dir routine
+ * @ef: eventfs_file of recently created dir
+ *
+ * Map the meta-data of files within an eventfs dir to their parent dentry
+ */
+static void eventfs_post_create_dir(struct eventfs_file *ef)
+{
+ struct eventfs_file *ef_child;
+ struct tracefs_inode *ti;
+
+ /* srcu lock already held */
+ /* fill parent-child relation */
+ list_for_each_entry_srcu(ef_child, &ef->ei->e_top_files, list,
+ srcu_read_lock_held(&eventfs_srcu)) {
+ ef_child->d_parent = ef->dentry;
+ }
+
+ ti = get_tracefs(ef->dentry->d_inode);
+ ti->private = ef->ei;
+}
+
+/**
+ * create_dentry - helper function to create dentry
+ * @ef: eventfs_file of file or directory to create
+ * @parent: parent dentry
+ * @lookup: true if called from lookup routine
+ *
+ * Used to create a dentry for file/dir, executes post dentry creation routine
+ */
+static struct dentry *
+create_dentry(struct eventfs_file *ef, struct dentry *parent, bool lookup)
+{
+ bool invalidate = false;
+ struct dentry *dentry;
+
+ mutex_lock(&eventfs_mutex);
+ if (ef->is_freed) {
+ mutex_unlock(&eventfs_mutex);
+ return NULL;
+ }
+ if (ef->dentry) {
+ dentry = ef->dentry;
+ /* On dir open, up the ref count */
+ if (!lookup)
+ dget(dentry);
+ mutex_unlock(&eventfs_mutex);
+ return dentry;
+ }
+ mutex_unlock(&eventfs_mutex);
+
+ if (!lookup)
+ inode_lock(parent->d_inode);
+
+ if (ef->ei)
+ dentry = create_dir(ef->name, parent, ef->data);
+ else
+ dentry = create_file(ef->name, ef->mode, parent,
+ ef->data, ef->fop);
+
+ if (!lookup)
+ inode_unlock(parent->d_inode);
+
+ mutex_lock(&eventfs_mutex);
+ if (IS_ERR_OR_NULL(dentry)) {
+ /* If the ef was already updated get it */
+ dentry = ef->dentry;
+ if (dentry && !lookup)
+ dget(dentry);
+ mutex_unlock(&eventfs_mutex);
+ return dentry;
+ }
+
+ if (!ef->dentry && !ef->is_freed) {
+ ef->dentry = dentry;
+ if (ef->ei)
+ eventfs_post_create_dir(ef);
+ dentry->d_fsdata = ef;
+ } else {
+ /* A race here, should try again (unless freed) */
+ invalidate = true;
+
+ /*
+ * Should never happen unless we get here due to being freed.
+ * Otherwise it means two dentries exist with the same name.
+ */
+ WARN_ON_ONCE(!ef->is_freed);
+ }
+ mutex_unlock(&eventfs_mutex);
+ if (invalidate)
+ d_invalidate(dentry);
+
+ if (lookup || invalidate)
+ dput(dentry);
+
+ return invalidate ? NULL : dentry;
+}
+
+static bool match_event_file(struct eventfs_file *ef, const char *name)
+{
+ bool ret;
+
+ mutex_lock(&eventfs_mutex);
+ ret = !ef->is_freed && strcmp(ef->name, name) == 0;
+ mutex_unlock(&eventfs_mutex);
+
+ return ret;
+}
+
+/**
+ * eventfs_root_lookup - lookup routine to create file/dir
+ * @dir: in which a lookup is being done
+ * @dentry: file/dir dentry
+ * @flags: to pass as flags parameter to simple lookup
+ *
+ * Used to create a dynamic file/dir within @dir. Use the eventfs_inode
+ * list of meta data to find the information needed to create the file/dir.
+ */
+static struct dentry *eventfs_root_lookup(struct inode *dir,
+ struct dentry *dentry,
+ unsigned int flags)
+{
+ struct tracefs_inode *ti;
+ struct eventfs_inode *ei;
+ struct eventfs_file *ef;
+ struct dentry *ret = NULL;
+ int idx;
+
+ ti = get_tracefs(dir);
+ if (!(ti->flags & TRACEFS_EVENT_INODE))
+ return NULL;
+
+ ei = ti->private;
+ idx = srcu_read_lock(&eventfs_srcu);
+ list_for_each_entry_srcu(ef, &ei->e_top_files, list,
+ srcu_read_lock_held(&eventfs_srcu)) {
+ if (!match_event_file(ef, dentry->d_name.name))
+ continue;
+ ret = simple_lookup(dir, dentry, flags);
+ create_dentry(ef, ef->d_parent, true);
+ break;
+ }
+ srcu_read_unlock(&eventfs_srcu, idx);
+ return ret;
+}
+
+struct dentry_list {
+ void *cursor;
+ struct dentry **dentries;
+};
+
+/**
+ * eventfs_release - called to release eventfs file/dir
+ * @inode: inode to be released
+ * @file: file to be released (not used)
+ */
+static int eventfs_release(struct inode *inode, struct file *file)
+{
+ struct tracefs_inode *ti;
+ struct dentry_list *dlist = file->private_data;
+ void *cursor;
+ int i;
+
+ ti = get_tracefs(inode);
+ if (!(ti->flags & TRACEFS_EVENT_INODE))
+ return -EINVAL;
+
+ if (WARN_ON_ONCE(!dlist))
+ return -EINVAL;
+
+ for (i = 0; dlist->dentries && dlist->dentries[i]; i++) {
+ dput(dlist->dentries[i]);
+ }
+
+ cursor = dlist->cursor;
+ kfree(dlist->dentries);
+ kfree(dlist);
+ file->private_data = cursor;
+ return dcache_dir_close(inode, file);
+}
+
+/**
+ * dcache_dir_open_wrapper - eventfs open wrapper
+ * @inode: not used
+ * @file: dir to be opened (to create its child)
+ *
+ * Used to dynamically create the file/dir within @file. @file is really a
+ * directory and all the files/dirs of the children within @file will be
+ * created. If any of the files/dirs have already been created, their
+ * reference count will be incremented.
+ */
+static int dcache_dir_open_wrapper(struct inode *inode, struct file *file)
+{
+ struct tracefs_inode *ti;
+ struct eventfs_inode *ei;
+ struct eventfs_file *ef;
+ struct dentry_list *dlist;
+ struct dentry **dentries = NULL;
+ struct dentry *dentry = file_dentry(file);
+ struct dentry *d;
+ struct inode *f_inode = file_inode(file);
+ int cnt = 0;
+ int idx;
+ int ret;
+
+ ti = get_tracefs(f_inode);
+ if (!(ti->flags & TRACEFS_EVENT_INODE))
+ return -EINVAL;
+
+ if (WARN_ON_ONCE(file->private_data))
+ return -EINVAL;
+
+ dlist = kmalloc(sizeof(*dlist), GFP_KERNEL);
+ if (!dlist)
+ return -ENOMEM;
+
+ ei = ti->private;
+ idx = srcu_read_lock(&eventfs_srcu);
+ list_for_each_entry_srcu(ef, &ei->e_top_files, list,
+ srcu_read_lock_held(&eventfs_srcu)) {
+ d = create_dentry(ef, dentry, false);
+ if (d) {
+ struct dentry **tmp;
+
+ tmp = krealloc(dentries, sizeof(d) * (cnt + 2), GFP_KERNEL);
+ if (!tmp)
+ break;
+ tmp[cnt] = d;
+ tmp[cnt + 1] = NULL;
+ cnt++;
+ dentries = tmp;
+ }
+ }
+ srcu_read_unlock(&eventfs_srcu, idx);
+ ret = dcache_dir_open(inode, file);
+
+ /*
+ * dcache_dir_open() sets file->private_data to a dentry cursor.
+ * Need to save that but also save all the dentries that were
+ * opened by this function.
+ */
+ dlist->cursor = file->private_data;
+ dlist->dentries = dentries;
+ file->private_data = dlist;
+ return ret;
+}
+
+/*
+ * This just sets the file->private_data back to the cursor and back.
+ */
+static int dcache_readdir_wrapper(struct file *file, struct dir_context *ctx)
+{
+ struct dentry_list *dlist = file->private_data;
+ int ret;
+
+ file->private_data = dlist->cursor;
+ ret = dcache_readdir(file, ctx);
+ dlist->cursor = file->private_data;
+ file->private_data = dlist;
+ return ret;
+}
+
+/**
+ * eventfs_prepare_ef - helper function to prepare eventfs_file
+ * @name: the name of the file/directory to create.
+ * @mode: the permission that the file should have.
+ * @fop: struct file_operations that should be used for this file/directory.
+ * @iop: struct inode_operations that should be used for this file/directory.
+ * @data: something that the caller will want to get to later on. The
+ * inode.i_private pointer will point to this value on the open() call.
+ *
+ * This function allocates and fills the eventfs_file structure.
+ */
+static struct eventfs_file *eventfs_prepare_ef(const char *name, umode_t mode,
+ const struct file_operations *fop,
+ const struct inode_operations *iop,
+ void *data)
+{
+ struct eventfs_file *ef;
+
+ ef = kzalloc(sizeof(*ef), GFP_KERNEL);
+ if (!ef)
+ return ERR_PTR(-ENOMEM);
+
+ ef->name = kstrdup(name, GFP_KERNEL);
+ if (!ef->name) {
+ kfree(ef);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ if (S_ISDIR(mode)) {
+ ef->ei = kzalloc(sizeof(*ef->ei), GFP_KERNEL);
+ if (!ef->ei) {
+ kfree(ef->name);
+ kfree(ef);
+ return ERR_PTR(-ENOMEM);
+ }
+ INIT_LIST_HEAD(&ef->ei->e_top_files);
+ } else {
+ ef->ei = NULL;
+ }
+
+ ef->iop = iop;
+ ef->fop = fop;
+ ef->mode = mode;
+ ef->data = data;
+ return ef;
+}
+
+/**
+ * eventfs_create_events_dir - create the trace event structure
+ * @name: the name of the directory to create.
+ * @parent: parent dentry for this file. This should be a directory dentry
+ * if set. If this parameter is NULL, then the directory will be
+ * created in the root of the tracefs filesystem.
+ *
+ * This function creates the top of the trace event directory.
+ */
+struct dentry *eventfs_create_events_dir(const char *name,
+ struct dentry *parent)
+{
+ struct dentry *dentry = tracefs_start_creating(name, parent);
+ struct eventfs_inode *ei;
+ struct tracefs_inode *ti;
+ struct inode *inode;
+
+ if (security_locked_down(LOCKDOWN_TRACEFS))
+ return NULL;
+
+ if (IS_ERR(dentry))
+ return dentry;
+
+ ei = kzalloc(sizeof(*ei), GFP_KERNEL);
+ if (!ei)
+ return ERR_PTR(-ENOMEM);
+ inode = tracefs_get_inode(dentry->d_sb);
+ if (unlikely(!inode)) {
+ kfree(ei);
+ tracefs_failed_creating(dentry);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ INIT_LIST_HEAD(&ei->e_top_files);
+
+ ti = get_tracefs(inode);
+ ti->flags |= TRACEFS_EVENT_INODE | TRACEFS_EVENT_TOP_INODE;
+ ti->private = ei;
+
+ inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
+ inode->i_op = &eventfs_root_dir_inode_operations;
+ inode->i_fop = &eventfs_file_operations;
+
+ /* directory inodes start off with i_nlink == 2 (for "." entry) */
+ inc_nlink(inode);
+ d_instantiate(dentry, inode);
+ inc_nlink(dentry->d_parent->d_inode);
+ fsnotify_mkdir(dentry->d_parent->d_inode, dentry);
+ return tracefs_end_creating(dentry);
+}
+
+/**
+ * eventfs_add_subsystem_dir - add eventfs subsystem_dir to list to create later
+ * @name: the name of the file to create.
+ * @parent: parent dentry for this dir.
+ *
+ * This function adds eventfs subsystem dir to list.
+ * And all these dirs are created on the fly when they are looked up,
+ * and the dentry and inodes will be removed when they are done.
+ */
+struct eventfs_file *eventfs_add_subsystem_dir(const char *name,
+ struct dentry *parent)
+{
+ struct tracefs_inode *ti_parent;
+ struct eventfs_inode *ei_parent;
+ struct eventfs_file *ef;
+
+ if (security_locked_down(LOCKDOWN_TRACEFS))
+ return NULL;
+
+ if (!parent)
+ return ERR_PTR(-EINVAL);
+
+ ti_parent = get_tracefs(parent->d_inode);
+ ei_parent = ti_parent->private;
+
+ ef = eventfs_prepare_ef(name, S_IFDIR, NULL, NULL, NULL);
+ if (IS_ERR(ef))
+ return ef;
+
+ mutex_lock(&eventfs_mutex);
+ list_add_tail(&ef->list, &ei_parent->e_top_files);
+ ef->d_parent = parent;
+ mutex_unlock(&eventfs_mutex);
+ return ef;
+}
+
+/**
+ * eventfs_add_dir - add eventfs dir to list to create later
+ * @name: the name of the file to create.
+ * @ef_parent: parent eventfs_file for this dir.
+ *
+ * This function adds eventfs dir to list.
+ * And all these dirs are created on the fly when they are looked up,
+ * and the dentry and inodes will be removed when they are done.
+ */
+struct eventfs_file *eventfs_add_dir(const char *name,
+ struct eventfs_file *ef_parent)
+{
+ struct eventfs_file *ef;
+
+ if (security_locked_down(LOCKDOWN_TRACEFS))
+ return NULL;
+
+ if (!ef_parent)
+ return ERR_PTR(-EINVAL);
+
+ ef = eventfs_prepare_ef(name, S_IFDIR, NULL, NULL, NULL);
+ if (IS_ERR(ef))
+ return ef;
+
+ mutex_lock(&eventfs_mutex);
+ list_add_tail(&ef->list, &ef_parent->ei->e_top_files);
+ ef->d_parent = ef_parent->dentry;
+ mutex_unlock(&eventfs_mutex);
+ return ef;
+}
+
+/**
+ * eventfs_add_events_file - add the data needed to create a file for later reference
+ * @name: the name of the file to create.
+ * @mode: the permission that the file should have.
+ * @parent: parent dentry for this file.
+ * @data: something that the caller will want to get to later on.
+ * @fop: struct file_operations that should be used for this file.
+ *
+ * This function is used to add the information needed to create a
+ * dentry/inode within the top level events directory. The file created
+ * will have the @mode permissions. The @data will be used to fill the
+ * inode.i_private when the open() call is done. The dentry and inodes are
+ * all created when they are referenced, and removed when they are no
+ * longer referenced.
+ */
+int eventfs_add_events_file(const char *name, umode_t mode,
+ struct dentry *parent, void *data,
+ const struct file_operations *fop)
+{
+ struct tracefs_inode *ti;
+ struct eventfs_inode *ei;
+ struct eventfs_file *ef;
+
+ if (security_locked_down(LOCKDOWN_TRACEFS))
+ return -ENODEV;
+
+ if (!parent)
+ return -EINVAL;
+
+ if (!(mode & S_IFMT))
+ mode |= S_IFREG;
+
+ if (!parent->d_inode)
+ return -EINVAL;
+
+ ti = get_tracefs(parent->d_inode);
+ if (!(ti->flags & TRACEFS_EVENT_INODE))
+ return -EINVAL;
+
+ ei = ti->private;
+ ef = eventfs_prepare_ef(name, mode, fop, NULL, data);
+
+ if (IS_ERR(ef))
+ return -ENOMEM;
+
+ mutex_lock(&eventfs_mutex);
+ list_add_tail(&ef->list, &ei->e_top_files);
+ ef->d_parent = parent;
+ mutex_unlock(&eventfs_mutex);
+ return 0;
+}
+
+/**
+ * eventfs_add_file - add eventfs file to list to create later
+ * @name: the name of the file to create.
+ * @mode: the permission that the file should have.
+ * @ef_parent: parent eventfs_file for this file.
+ * @data: something that the caller will want to get to later on.
+ * @fop: struct file_operations that should be used for this file.
+ *
+ * This function is used to add the information needed to create a
+ * file within a subdirectory of the events directory. The file created
+ * will have the @mode permissions. The @data will be used to fill the
+ * inode.i_private when the open() call is done. The dentry and inodes are
+ * all created when they are referenced, and removed when they are no
+ * longer referenced.
+ */
+int eventfs_add_file(const char *name, umode_t mode,
+ struct eventfs_file *ef_parent,
+ void *data,
+ const struct file_operations *fop)
+{
+ struct eventfs_file *ef;
+
+ if (security_locked_down(LOCKDOWN_TRACEFS))
+ return -ENODEV;
+
+ if (!ef_parent)
+ return -EINVAL;
+
+ if (!(mode & S_IFMT))
+ mode |= S_IFREG;
+
+ ef = eventfs_prepare_ef(name, mode, fop, NULL, data);
+ if (IS_ERR(ef))
+ return -ENOMEM;
+
+ mutex_lock(&eventfs_mutex);
+ list_add_tail(&ef->list, &ef_parent->ei->e_top_files);
+ ef->d_parent = ef_parent->dentry;
+ mutex_unlock(&eventfs_mutex);
+ return 0;
+}
+
+static void free_ef(struct rcu_head *head)
+{
+ struct eventfs_file *ef = container_of(head, struct eventfs_file, rcu);
+
+ kfree(ef->name);
+ kfree(ef->ei);
+ kfree(ef);
+}
+
+/**
+ * eventfs_remove_rec - remove eventfs dir or file from list
+ * @ef: eventfs_file to be removed.
+ * @head: to create list of eventfs_file to be deleted
+ * @level: to check recursion depth
+ *
+ * The helper function eventfs_remove_rec() is used to clean up and free the
+ * associated data from eventfs for both of the added functions.
+ */
+static void eventfs_remove_rec(struct eventfs_file *ef, struct list_head *head, int level)
+{
+ struct eventfs_file *ef_child;
+
+ if (!ef)
+ return;
+ /*
+ * Check recursion depth. It should never be greater than 3:
+ * 0 - events/
+ * 1 - events/group/
+ * 2 - events/group/event/
+ * 3 - events/group/event/file
+ */
+ if (WARN_ON_ONCE(level > 3))
+ return;
+
+ if (ef->ei) {
+ /* search for nested folders or files */
+ list_for_each_entry_srcu(ef_child, &ef->ei->e_top_files, list,
+ lockdep_is_held(&eventfs_mutex)) {
+ eventfs_remove_rec(ef_child, head, level + 1);
+ }
+ }
+
+ list_del_rcu(&ef->list);
+ list_add_tail(&ef->del_list, head);
+}
+
+/**
+ * eventfs_remove - remove eventfs dir or file from list
+ * @ef: eventfs_file to be removed.
+ *
+ * This function acquire the eventfs_mutex lock and call eventfs_remove_rec()
+ */
+void eventfs_remove(struct eventfs_file *ef)
+{
+ struct eventfs_file *tmp;
+ LIST_HEAD(ef_del_list);
+ struct dentry *dentry_list = NULL;
+ struct dentry *dentry;
+
+ if (!ef)
+ return;
+
+ mutex_lock(&eventfs_mutex);
+ eventfs_remove_rec(ef, &ef_del_list, 0);
+ list_for_each_entry_safe(ef, tmp, &ef_del_list, del_list) {
+ if (ef->dentry) {
+ unsigned long ptr = (unsigned long)dentry_list;
+
+ /* Keep the dentry from being freed yet */
+ dget(ef->dentry);
+
+ /*
+ * Paranoid: The dget() above should prevent the dentry
+ * from being freed and calling eventfs_set_ef_status_free().
+ * But just in case, set the link list LSB pointer to 1
+ * and have eventfs_set_ef_status_free() check that to
+ * make sure that if it does happen, it will not think
+ * the d_fsdata is an event_file.
+ *
+ * For this to work, no event_file should be allocated
+ * on a odd space, as the ef should always be allocated
+ * to be at least word aligned. Check for that too.
+ */
+ WARN_ON_ONCE(ptr & 1);
+
+ ef->dentry->d_fsdata = (void *)(ptr | 1);
+ dentry_list = ef->dentry;
+ ef->dentry = NULL;
+ }
+ call_srcu(&eventfs_srcu, &ef->rcu, free_ef);
+ }
+ mutex_unlock(&eventfs_mutex);
+
+ while (dentry_list) {
+ unsigned long ptr;
+
+ dentry = dentry_list;
+ ptr = (unsigned long)dentry->d_fsdata & ~1UL;
+ dentry_list = (struct dentry *)ptr;
+ dentry->d_fsdata = NULL;
+ d_invalidate(dentry);
+ mutex_lock(&eventfs_mutex);
+ /* dentry should now have at least a single reference */
+ WARN_ONCE((int)d_count(dentry) < 1,
+ "dentry %p less than one reference (%d) after invalidate\n",
+ dentry, d_count(dentry));
+ mutex_unlock(&eventfs_mutex);
+ dput(dentry);
+ }
+}
+
+/**
+ * eventfs_remove_events_dir - remove eventfs dir or file from list
+ * @dentry: events's dentry to be removed.
+ *
+ * This function remove events main directory
+ */
+void eventfs_remove_events_dir(struct dentry *dentry)
+{
+ struct tracefs_inode *ti;
+
+ if (!dentry || !dentry->d_inode)
+ return;
+
+ ti = get_tracefs(dentry->d_inode);
+ if (!ti || !(ti->flags & TRACEFS_EVENT_INODE))
+ return;
+
+ d_invalidate(dentry);
+ dput(dentry);
+}
diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
index 57ac8aa4a724..891653ba9cf3 100644
--- a/fs/tracefs/inode.c
+++ b/fs/tracefs/inode.c
@@ -21,13 +21,33 @@
#include <linux/parser.h>
#include <linux/magic.h>
#include <linux/slab.h>
+#include "internal.h"
#define TRACEFS_DEFAULT_MODE 0700
+static struct kmem_cache *tracefs_inode_cachep __ro_after_init;
static struct vfsmount *tracefs_mount;
static int tracefs_mount_count;
static bool tracefs_registered;
+static struct inode *tracefs_alloc_inode(struct super_block *sb)
+{
+ struct tracefs_inode *ti;
+
+ ti = kmem_cache_alloc(tracefs_inode_cachep, GFP_KERNEL);
+ if (!ti)
+ return NULL;
+
+ ti->flags = 0;
+
+ return &ti->vfs_inode;
+}
+
+static void tracefs_free_inode(struct inode *inode)
+{
+ kmem_cache_free(tracefs_inode_cachep, get_tracefs(inode));
+}
+
static ssize_t default_read_file(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
{
@@ -127,12 +147,12 @@ static const struct inode_operations tracefs_dir_inode_operations = {
.rmdir = tracefs_syscall_rmdir,
};
-static struct inode *tracefs_get_inode(struct super_block *sb)
+struct inode *tracefs_get_inode(struct super_block *sb)
{
struct inode *inode = new_inode(sb);
if (inode) {
inode->i_ino = get_next_ino();
- inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);
}
return inode;
}
@@ -290,6 +310,7 @@ static int tracefs_apply_options(struct super_block *sb, bool remount)
struct tracefs_fs_info *fsi = sb->s_fs_info;
struct inode *inode = d_inode(sb->s_root);
struct tracefs_mount_opts *opts = &fsi->mount_opts;
+ umode_t tmp_mode;
/*
* On remount, only reset mode/uid/gid if they were provided as mount
@@ -297,8 +318,9 @@ static int tracefs_apply_options(struct super_block *sb, bool remount)
*/
if (!remount || opts->opts & BIT(Opt_mode)) {
- inode->i_mode &= ~S_IALLUGO;
- inode->i_mode |= opts->mode;
+ tmp_mode = READ_ONCE(inode->i_mode) & ~S_IALLUGO;
+ tmp_mode |= opts->mode;
+ WRITE_ONCE(inode->i_mode, tmp_mode);
}
if (!remount || opts->opts & BIT(Opt_uid))
@@ -346,11 +368,31 @@ static int tracefs_show_options(struct seq_file *m, struct dentry *root)
}
static const struct super_operations tracefs_super_operations = {
+ .alloc_inode = tracefs_alloc_inode,
+ .free_inode = tracefs_free_inode,
+ .drop_inode = generic_delete_inode,
.statfs = simple_statfs,
.remount_fs = tracefs_remount,
.show_options = tracefs_show_options,
};
+static void tracefs_dentry_iput(struct dentry *dentry, struct inode *inode)
+{
+ struct tracefs_inode *ti;
+
+ if (!dentry || !inode)
+ return;
+
+ ti = get_tracefs(inode);
+ if (ti && ti->flags & TRACEFS_EVENT_INODE)
+ eventfs_set_ef_status_free(ti, dentry);
+ iput(inode);
+}
+
+static const struct dentry_operations tracefs_dentry_operations = {
+ .d_iput = tracefs_dentry_iput,
+};
+
static int trace_fill_super(struct super_block *sb, void *data, int silent)
{
static const struct tree_descr trace_files[] = {{""}};
@@ -373,6 +415,7 @@ static int trace_fill_super(struct super_block *sb, void *data, int silent)
goto fail;
sb->s_op = &tracefs_super_operations;
+ sb->s_d_op = &tracefs_dentry_operations;
tracefs_apply_options(sb, false);
@@ -399,7 +442,7 @@ static struct file_system_type trace_fs_type = {
};
MODULE_ALIAS_FS("tracefs");
-static struct dentry *start_creating(const char *name, struct dentry *parent)
+struct dentry *tracefs_start_creating(const char *name, struct dentry *parent)
{
struct dentry *dentry;
int error;
@@ -437,7 +480,7 @@ static struct dentry *start_creating(const char *name, struct dentry *parent)
return dentry;
}
-static struct dentry *failed_creating(struct dentry *dentry)
+struct dentry *tracefs_failed_creating(struct dentry *dentry)
{
inode_unlock(d_inode(dentry->d_parent));
dput(dentry);
@@ -445,13 +488,87 @@ static struct dentry *failed_creating(struct dentry *dentry)
return NULL;
}
-static struct dentry *end_creating(struct dentry *dentry)
+struct dentry *tracefs_end_creating(struct dentry *dentry)
{
inode_unlock(d_inode(dentry->d_parent));
return dentry;
}
/**
+ * eventfs_start_creating - start the process of creating a dentry
+ * @name: Name of the file created for the dentry
+ * @parent: The parent dentry where this dentry will be created
+ *
+ * This is a simple helper function for the dynamically created eventfs
+ * files. When the directory of the eventfs files are accessed, their
+ * dentries are created on the fly. This function is used to start that
+ * process.
+ */
+struct dentry *eventfs_start_creating(const char *name, struct dentry *parent)
+{
+ struct dentry *dentry;
+ int error;
+
+ error = simple_pin_fs(&trace_fs_type, &tracefs_mount,
+ &tracefs_mount_count);
+ if (error)
+ return ERR_PTR(error);
+
+ /*
+ * If the parent is not specified, we create it in the root.
+ * We need the root dentry to do this, which is in the super
+ * block. A pointer to that is in the struct vfsmount that we
+ * have around.
+ */
+ if (!parent)
+ parent = tracefs_mount->mnt_root;
+
+ if (unlikely(IS_DEADDIR(parent->d_inode)))
+ dentry = ERR_PTR(-ENOENT);
+ else
+ dentry = lookup_one_len(name, parent, strlen(name));
+
+ if (!IS_ERR(dentry) && dentry->d_inode) {
+ dput(dentry);
+ dentry = ERR_PTR(-EEXIST);
+ }
+
+ if (IS_ERR(dentry))
+ simple_release_fs(&tracefs_mount, &tracefs_mount_count);
+
+ return dentry;
+}
+
+/**
+ * eventfs_failed_creating - clean up a failed eventfs dentry creation
+ * @dentry: The dentry to clean up
+ *
+ * If after calling eventfs_start_creating(), a failure is detected, the
+ * resources created by eventfs_start_creating() needs to be cleaned up. In
+ * that case, this function should be called to perform that clean up.
+ */
+struct dentry *eventfs_failed_creating(struct dentry *dentry)
+{
+ dput(dentry);
+ simple_release_fs(&tracefs_mount, &tracefs_mount_count);
+ return NULL;
+}
+
+/**
+ * eventfs_end_creating - Finish the process of creating a eventfs dentry
+ * @dentry: The dentry that has successfully been created.
+ *
+ * This function is currently just a place holder to match
+ * eventfs_start_creating(). In case any synchronization needs to be added,
+ * this function will be used to implement that without having to modify
+ * the callers of eventfs_start_creating().
+ */
+struct dentry *eventfs_end_creating(struct dentry *dentry)
+{
+ return dentry;
+}
+
+/**
* tracefs_create_file - create a file in the tracefs filesystem
* @name: a pointer to a string containing the name of the file to create.
* @mode: the permission that the file should have.
@@ -490,14 +607,14 @@ struct dentry *tracefs_create_file(const char *name, umode_t mode,
if (!(mode & S_IFMT))
mode |= S_IFREG;
BUG_ON(!S_ISREG(mode));
- dentry = start_creating(name, parent);
+ dentry = tracefs_start_creating(name, parent);
if (IS_ERR(dentry))
return NULL;
inode = tracefs_get_inode(dentry->d_sb);
if (unlikely(!inode))
- return failed_creating(dentry);
+ return tracefs_failed_creating(dentry);
inode->i_mode = mode;
inode->i_fop = fops ? fops : &tracefs_file_operations;
@@ -506,13 +623,13 @@ struct dentry *tracefs_create_file(const char *name, umode_t mode,
inode->i_gid = d_inode(dentry->d_parent)->i_gid;
d_instantiate(dentry, inode);
fsnotify_create(d_inode(dentry->d_parent), dentry);
- return end_creating(dentry);
+ return tracefs_end_creating(dentry);
}
static struct dentry *__create_dir(const char *name, struct dentry *parent,
const struct inode_operations *ops)
{
- struct dentry *dentry = start_creating(name, parent);
+ struct dentry *dentry = tracefs_start_creating(name, parent);
struct inode *inode;
if (IS_ERR(dentry))
@@ -520,7 +637,7 @@ static struct dentry *__create_dir(const char *name, struct dentry *parent,
inode = tracefs_get_inode(dentry->d_sb);
if (unlikely(!inode))
- return failed_creating(dentry);
+ return tracefs_failed_creating(dentry);
/* Do not set bits for OTH */
inode->i_mode = S_IFDIR | S_IRWXU | S_IRUSR| S_IRGRP | S_IXUSR | S_IXGRP;
@@ -534,7 +651,7 @@ static struct dentry *__create_dir(const char *name, struct dentry *parent,
d_instantiate(dentry, inode);
inc_nlink(d_inode(dentry->d_parent));
fsnotify_mkdir(d_inode(dentry->d_parent), dentry);
- return end_creating(dentry);
+ return tracefs_end_creating(dentry);
}
/**
@@ -556,6 +673,9 @@ static struct dentry *__create_dir(const char *name, struct dentry *parent,
*/
struct dentry *tracefs_create_dir(const char *name, struct dentry *parent)
{
+ if (security_locked_down(LOCKDOWN_TRACEFS))
+ return NULL;
+
return __create_dir(name, parent, &simple_dir_inode_operations);
}
@@ -628,10 +748,26 @@ bool tracefs_initialized(void)
return tracefs_registered;
}
+static void init_once(void *foo)
+{
+ struct tracefs_inode *ti = (struct tracefs_inode *) foo;
+
+ inode_init_once(&ti->vfs_inode);
+}
+
static int __init tracefs_init(void)
{
int retval;
+ tracefs_inode_cachep = kmem_cache_create("tracefs_inode_cache",
+ sizeof(struct tracefs_inode),
+ 0, (SLAB_RECLAIM_ACCOUNT|
+ SLAB_MEM_SPREAD|
+ SLAB_ACCOUNT),
+ init_once);
+ if (!tracefs_inode_cachep)
+ return -ENOMEM;
+
retval = sysfs_create_mount_point(kernel_kobj, "tracing");
if (retval)
return -EINVAL;
diff --git a/fs/tracefs/internal.h b/fs/tracefs/internal.h
new file mode 100644
index 000000000000..4f2e49e2197b
--- /dev/null
+++ b/fs/tracefs/internal.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _TRACEFS_INTERNAL_H
+#define _TRACEFS_INTERNAL_H
+
+enum {
+ TRACEFS_EVENT_INODE = BIT(1),
+ TRACEFS_EVENT_TOP_INODE = BIT(2),
+};
+
+struct tracefs_inode {
+ unsigned long flags;
+ void *private;
+ struct inode vfs_inode;
+};
+
+static inline struct tracefs_inode *get_tracefs(const struct inode *inode)
+{
+ return container_of(inode, struct tracefs_inode, vfs_inode);
+}
+
+struct dentry *tracefs_start_creating(const char *name, struct dentry *parent);
+struct dentry *tracefs_end_creating(struct dentry *dentry);
+struct dentry *tracefs_failed_creating(struct dentry *dentry);
+struct inode *tracefs_get_inode(struct super_block *sb);
+struct dentry *eventfs_start_creating(const char *name, struct dentry *parent);
+struct dentry *eventfs_failed_creating(struct dentry *dentry);
+struct dentry *eventfs_end_creating(struct dentry *dentry);
+void eventfs_set_ef_status_free(struct tracefs_inode *ti, struct dentry *dentry);
+
+#endif /* _TRACEFS_INTERNAL_H */
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 9c9d3f0e36a4..eef9e527d9ff 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -243,8 +243,8 @@ void ubifs_dump_inode(struct ubifs_info *c, const struct inode *inode)
(unsigned int)inode->i_mtime.tv_sec,
(unsigned int)inode->i_mtime.tv_nsec);
pr_err("\tctime %u.%u\n",
- (unsigned int)inode->i_ctime.tv_sec,
- (unsigned int)inode->i_ctime.tv_nsec);
+ (unsigned int) inode_get_ctime(inode).tv_sec,
+ (unsigned int) inode_get_ctime(inode).tv_nsec);
pr_err("\tcreat_sqnum %llu\n", ui->creat_sqnum);
pr_err("\txattr_size %u\n", ui->xattr_size);
pr_err("\txattr_cnt %u\n", ui->xattr_cnt);
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index ef0499edc248..2f48c58d47cd 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -96,8 +96,7 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir,
inode->i_flags |= S_NOCMTIME;
inode_init_owner(&nop_mnt_idmap, inode, dir, mode);
- inode->i_mtime = inode->i_atime = inode->i_ctime =
- current_time(inode);
+ inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
inode->i_mapping->nrpages = 0;
if (!is_xattr) {
@@ -325,7 +324,7 @@ static int ubifs_create(struct mnt_idmap *idmap, struct inode *dir,
mutex_lock(&dir_ui->ui_mutex);
dir->i_size += sz_change;
dir_ui->ui_size = dir->i_size;
- dir->i_mtime = dir->i_ctime = inode->i_ctime;
+ dir->i_mtime = inode_set_ctime_to_ts(dir, inode_get_ctime(inode));
err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0);
if (err)
goto out_cancel;
@@ -765,10 +764,10 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
inc_nlink(inode);
ihold(inode);
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
dir->i_size += sz_change;
dir_ui->ui_size = dir->i_size;
- dir->i_mtime = dir->i_ctime = inode->i_ctime;
+ dir->i_mtime = inode_set_ctime_to_ts(dir, inode_get_ctime(inode));
err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0);
if (err)
goto out_cancel;
@@ -838,11 +837,11 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry)
}
lock_2_inodes(dir, inode);
- inode->i_ctime = current_time(dir);
+ inode_set_ctime_current(inode);
drop_nlink(inode);
dir->i_size -= sz_change;
dir_ui->ui_size = dir->i_size;
- dir->i_mtime = dir->i_ctime = inode->i_ctime;
+ dir->i_mtime = inode_set_ctime_to_ts(dir, inode_get_ctime(inode));
err = ubifs_jnl_update(c, dir, &nm, inode, 1, 0);
if (err)
goto out_cancel;
@@ -940,12 +939,12 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry)
}
lock_2_inodes(dir, inode);
- inode->i_ctime = current_time(dir);
+ inode_set_ctime_current(inode);
clear_nlink(inode);
drop_nlink(dir);
dir->i_size -= sz_change;
dir_ui->ui_size = dir->i_size;
- dir->i_mtime = dir->i_ctime = inode->i_ctime;
+ dir->i_mtime = inode_set_ctime_to_ts(dir, inode_get_ctime(inode));
err = ubifs_jnl_update(c, dir, &nm, inode, 1, 0);
if (err)
goto out_cancel;
@@ -1019,7 +1018,7 @@ static int ubifs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
inc_nlink(dir);
dir->i_size += sz_change;
dir_ui->ui_size = dir->i_size;
- dir->i_mtime = dir->i_ctime = inode->i_ctime;
+ dir->i_mtime = inode_set_ctime_to_ts(dir, inode_get_ctime(inode));
err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0);
if (err) {
ubifs_err(c, "cannot create directory, error %d", err);
@@ -1110,7 +1109,7 @@ static int ubifs_mknod(struct mnt_idmap *idmap, struct inode *dir,
mutex_lock(&dir_ui->ui_mutex);
dir->i_size += sz_change;
dir_ui->ui_size = dir->i_size;
- dir->i_mtime = dir->i_ctime = inode->i_ctime;
+ dir->i_mtime = inode_set_ctime_to_ts(dir, inode_get_ctime(inode));
err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0);
if (err)
goto out_cancel;
@@ -1210,7 +1209,7 @@ static int ubifs_symlink(struct mnt_idmap *idmap, struct inode *dir,
mutex_lock(&dir_ui->ui_mutex);
dir->i_size += sz_change;
dir_ui->ui_size = dir->i_size;
- dir->i_mtime = dir->i_ctime = inode->i_ctime;
+ dir->i_mtime = inode_set_ctime_to_ts(dir, inode_get_ctime(inode));
err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0);
if (err)
goto out_cancel;
@@ -1298,7 +1297,6 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry,
struct ubifs_budget_req ino_req = { .dirtied_ino = 1,
.dirtied_ino_d = ALIGN(old_inode_ui->data_len, 8) };
struct ubifs_budget_req wht_req;
- struct timespec64 time;
unsigned int saved_nlink;
struct fscrypt_name old_nm, new_nm;
@@ -1414,8 +1412,7 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry,
* Like most other Unix systems, set the @i_ctime for inodes on a
* rename.
*/
- time = current_time(old_dir);
- old_inode->i_ctime = time;
+ simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
/* We must adjust parent link count when renaming directories */
if (is_dir) {
@@ -1444,13 +1441,11 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry,
old_dir->i_size -= old_sz;
ubifs_inode(old_dir)->ui_size = old_dir->i_size;
- old_dir->i_mtime = old_dir->i_ctime = time;
- new_dir->i_mtime = new_dir->i_ctime = time;
/*
* And finally, if we unlinked a direntry which happened to have the
* same name as the moved direntry, we have to decrement @i_nlink of
- * the unlinked inode and change its ctime.
+ * the unlinked inode.
*/
if (unlink) {
/*
@@ -1462,7 +1457,6 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry,
clear_nlink(new_inode);
else
drop_nlink(new_inode);
- new_inode->i_ctime = time;
} else {
new_dir->i_size += new_sz;
ubifs_inode(new_dir)->ui_size = new_dir->i_size;
@@ -1557,7 +1551,6 @@ static int ubifs_xrename(struct inode *old_dir, struct dentry *old_dentry,
int sync = IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir);
struct inode *fst_inode = d_inode(old_dentry);
struct inode *snd_inode = d_inode(new_dentry);
- struct timespec64 time;
int err;
struct fscrypt_name fst_nm, snd_nm;
@@ -1588,11 +1581,7 @@ static int ubifs_xrename(struct inode *old_dir, struct dentry *old_dentry,
lock_4_inodes(old_dir, new_dir, NULL, NULL);
- time = current_time(old_dir);
- fst_inode->i_ctime = time;
- snd_inode->i_ctime = time;
- old_dir->i_mtime = old_dir->i_ctime = time;
- new_dir->i_mtime = new_dir->i_ctime = time;
+ simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
if (old_dir != new_dir) {
if (S_ISDIR(fst_inode->i_mode) && !S_ISDIR(snd_inode->i_mode)) {
@@ -1665,7 +1654,7 @@ int ubifs_getattr(struct mnt_idmap *idmap, const struct path *path,
STATX_ATTR_ENCRYPTED |
STATX_ATTR_IMMUTABLE);
- generic_fillattr(&nop_mnt_idmap, inode, stat);
+ generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
stat->blksize = UBIFS_BLOCK_SIZE;
stat->size = ui->ui_size;
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 6738fe43040b..e5382f0b2587 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1092,7 +1092,7 @@ static void do_attr_changes(struct inode *inode, const struct iattr *attr)
if (attr->ia_valid & ATTR_MTIME)
inode->i_mtime = attr->ia_mtime;
if (attr->ia_valid & ATTR_CTIME)
- inode->i_ctime = attr->ia_ctime;
+ inode_set_ctime_to_ts(inode, attr->ia_ctime);
if (attr->ia_valid & ATTR_MODE) {
umode_t mode = attr->ia_mode;
@@ -1192,7 +1192,7 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode,
mutex_lock(&ui->ui_mutex);
ui->ui_size = inode->i_size;
/* Truncation changes inode [mc]time */
- inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
/* Other attributes may be changed at the same time as well */
do_attr_changes(inode, attr);
err = ubifs_jnl_truncate(c, inode, old_size, new_size);
@@ -1239,7 +1239,7 @@ static int do_setattr(struct ubifs_info *c, struct inode *inode,
mutex_lock(&ui->ui_mutex);
if (attr->ia_valid & ATTR_SIZE) {
/* Truncation changes inode [mc]time */
- inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
/* 'truncate_setsize()' changed @i_size, update @ui_size */
ui->ui_size = inode->i_size;
}
@@ -1364,8 +1364,10 @@ out:
static inline int mctime_update_needed(const struct inode *inode,
const struct timespec64 *now)
{
+ struct timespec64 ctime = inode_get_ctime(inode);
+
if (!timespec64_equal(&inode->i_mtime, now) ||
- !timespec64_equal(&inode->i_ctime, now))
+ !timespec64_equal(&ctime, now))
return 1;
return 0;
}
@@ -1376,8 +1378,7 @@ static inline int mctime_update_needed(const struct inode *inode,
*
* This function updates time of the inode.
*/
-int ubifs_update_time(struct inode *inode, struct timespec64 *time,
- int flags)
+int ubifs_update_time(struct inode *inode, int flags)
{
struct ubifs_inode *ui = ubifs_inode(inode);
struct ubifs_info *c = inode->i_sb->s_fs_info;
@@ -1385,21 +1386,17 @@ int ubifs_update_time(struct inode *inode, struct timespec64 *time,
.dirtied_ino_d = ALIGN(ui->data_len, 8) };
int err, release;
- if (!IS_ENABLED(CONFIG_UBIFS_ATIME_SUPPORT))
- return generic_update_time(inode, time, flags);
+ if (!IS_ENABLED(CONFIG_UBIFS_ATIME_SUPPORT)) {
+ generic_update_time(inode, flags);
+ return 0;
+ }
err = ubifs_budget_space(c, &req);
if (err)
return err;
mutex_lock(&ui->ui_mutex);
- if (flags & S_ATIME)
- inode->i_atime = *time;
- if (flags & S_CTIME)
- inode->i_ctime = *time;
- if (flags & S_MTIME)
- inode->i_mtime = *time;
-
+ inode_update_timestamps(inode, flags);
release = ui->dirty;
__mark_inode_dirty(inode, I_DIRTY_SYNC);
mutex_unlock(&ui->ui_mutex);
@@ -1432,7 +1429,7 @@ static int update_mctime(struct inode *inode)
return err;
mutex_lock(&ui->ui_mutex);
- inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
release = ui->dirty;
mark_inode_dirty_sync(inode);
mutex_unlock(&ui->ui_mutex);
@@ -1570,7 +1567,7 @@ static vm_fault_t ubifs_vm_page_mkwrite(struct vm_fault *vmf)
struct ubifs_inode *ui = ubifs_inode(inode);
mutex_lock(&ui->ui_mutex);
- inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
release = ui->dirty;
mark_inode_dirty_sync(inode);
mutex_unlock(&ui->ui_mutex);
diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c
index 67c5108abd89..d79cabe193c3 100644
--- a/fs/ubifs/ioctl.c
+++ b/fs/ubifs/ioctl.c
@@ -118,7 +118,7 @@ static int setflags(struct inode *inode, int flags)
ui->flags &= ~ioctl2ubifs(UBIFS_SETTABLE_IOCTL_FLAGS);
ui->flags |= ioctl2ubifs(flags);
ubifs_set_inode_flags(inode);
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
release = ui->dirty;
mark_inode_dirty_sync(inode);
mutex_unlock(&ui->ui_mutex);
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index dc52ac0f4a34..ffc9beee7be6 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -454,8 +454,8 @@ static void pack_inode(struct ubifs_info *c, struct ubifs_ino_node *ino,
ino->creat_sqnum = cpu_to_le64(ui->creat_sqnum);
ino->atime_sec = cpu_to_le64(inode->i_atime.tv_sec);
ino->atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
- ino->ctime_sec = cpu_to_le64(inode->i_ctime.tv_sec);
- ino->ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+ ino->ctime_sec = cpu_to_le64(inode_get_ctime(inode).tv_sec);
+ ino->ctime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec);
ino->mtime_sec = cpu_to_le64(inode->i_mtime.tv_sec);
ino->mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
ino->uid = cpu_to_le32(i_uid_read(inode));
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 32cb14759796..b08fb28d16b5 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -146,8 +146,8 @@ struct inode *ubifs_iget(struct super_block *sb, unsigned long inum)
inode->i_atime.tv_nsec = le32_to_cpu(ino->atime_nsec);
inode->i_mtime.tv_sec = (int64_t)le64_to_cpu(ino->mtime_sec);
inode->i_mtime.tv_nsec = le32_to_cpu(ino->mtime_nsec);
- inode->i_ctime.tv_sec = (int64_t)le64_to_cpu(ino->ctime_sec);
- inode->i_ctime.tv_nsec = le32_to_cpu(ino->ctime_nsec);
+ inode_set_ctime(inode, (int64_t)le64_to_cpu(ino->ctime_sec),
+ le32_to_cpu(ino->ctime_nsec));
inode->i_mode = le32_to_cpu(ino->mode);
inode->i_size = le64_to_cpu(ino->size);
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 4c36044140e7..ebb3ad6b5e7e 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -2027,7 +2027,7 @@ int ubifs_calc_dark(const struct ubifs_info *c, int spc);
int ubifs_fsync(struct file *file, loff_t start, loff_t end, int datasync);
int ubifs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
struct iattr *attr);
-int ubifs_update_time(struct inode *inode, struct timespec64 *time, int flags);
+int ubifs_update_time(struct inode *inode, int flags);
/* dir.c */
struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir,
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index 349228dd1191..406c82eab513 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -134,7 +134,7 @@ static int create_xattr(struct ubifs_info *c, struct inode *host,
ui->data_len = size;
mutex_lock(&host_ui->ui_mutex);
- host->i_ctime = current_time(host);
+ inode_set_ctime_current(host);
host_ui->xattr_cnt += 1;
host_ui->xattr_size += CALC_DENT_SIZE(fname_len(nm));
host_ui->xattr_size += CALC_XATTR_BYTES(size);
@@ -215,7 +215,7 @@ static int change_xattr(struct ubifs_info *c, struct inode *host,
ui->data_len = size;
mutex_lock(&host_ui->ui_mutex);
- host->i_ctime = current_time(host);
+ inode_set_ctime_current(host);
host_ui->xattr_size -= CALC_XATTR_BYTES(old_size);
host_ui->xattr_size += CALC_XATTR_BYTES(size);
@@ -474,7 +474,7 @@ static int remove_xattr(struct ubifs_info *c, struct inode *host,
return err;
mutex_lock(&host_ui->ui_mutex);
- host->i_ctime = current_time(host);
+ inode_set_ctime_current(host);
host_ui->xattr_cnt -= 1;
host_ui->xattr_size -= CALC_DENT_SIZE(fname_len(nm));
host_ui->xattr_size -= CALC_XATTR_BYTES(ui->data_len);
diff --git a/fs/udf/Kconfig b/fs/udf/Kconfig
index 82e8bfa2dfd9..8f7ce30d47fd 100644
--- a/fs/udf/Kconfig
+++ b/fs/udf/Kconfig
@@ -1,6 +1,7 @@
# SPDX-License-Identifier: GPL-2.0-only
config UDF_FS
tristate "UDF file system support"
+ select BUFFER_HEAD
select CRC_ITU_T
select NLS
select LEGACY_DIRECT_IO
diff --git a/fs/udf/directory.c b/fs/udf/directory.c
index 1c775e072b2f..93153665eb37 100644
--- a/fs/udf/directory.c
+++ b/fs/udf/directory.c
@@ -95,7 +95,7 @@ static int udf_copy_fi(struct udf_fileident_iter *iter)
}
off = iter->pos & (blksize - 1);
- len = min_t(int, sizeof(struct fileIdentDesc), blksize - off);
+ len = min_t(u32, sizeof(struct fileIdentDesc), blksize - off);
memcpy(&iter->fi, iter->bh[0]->b_data + off, len);
if (len < sizeof(struct fileIdentDesc))
memcpy((char *)(&iter->fi) + len, iter->bh[1]->b_data,
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 243840dc83ad..0ceac4b5937c 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -63,13 +63,13 @@ static vm_fault_t udf_page_mkwrite(struct vm_fault *vmf)
else
end = PAGE_SIZE;
err = __block_write_begin(page, 0, end, udf_get_block);
- if (!err)
- err = block_commit_write(page, 0, end);
- if (err < 0) {
+ if (err) {
unlock_page(page);
- ret = block_page_mkwrite_return(err);
+ ret = vmf_fs_error(err);
goto out_unlock;
}
+
+ block_commit_write(page, 0, end);
out_dirty:
set_page_dirty(page);
wait_for_stable_page(page);
diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index 5f7ac8c84798..6b558cbbeb6b 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -100,7 +100,7 @@ struct inode *udf_new_inode(struct inode *dir, umode_t mode)
iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT;
else
iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG;
- inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
iinfo->i_crtime = inode->i_mtime;
if (unlikely(insert_inode_locked(inode) < 0)) {
make_bad_inode(inode);
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 28cdfc57d946..a17a6184cc39 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -352,8 +352,6 @@ int udf_expand_file_adinicb(struct inode *inode)
iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT;
else
iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG;
- /* from now on we have normal address_space methods */
- inode->i_data.a_ops = &udf_aops;
up_write(&iinfo->i_data_sem);
mark_inode_dirty(inode);
return 0;
@@ -910,7 +908,7 @@ static int inode_getblk(struct inode *inode, struct udf_map_rq *map)
map->oflags = UDF_BLK_NEW | UDF_BLK_MAPPED;
iinfo->i_next_alloc_block = map->lblk + 1;
iinfo->i_next_alloc_goal = newblocknum + 1;
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
if (IS_SYNC(inode))
udf_sync_inode(inode);
@@ -1298,7 +1296,7 @@ set_size:
goto out_unlock;
}
update_time:
- inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
if (IS_SYNC(inode))
udf_sync_inode(inode);
else
@@ -1329,6 +1327,7 @@ static int udf_read_inode(struct inode *inode, bool hidden_inode)
int bs = inode->i_sb->s_blocksize;
int ret = -EIO;
uint32_t uid, gid;
+ struct timespec64 ctime;
reread:
if (iloc->partitionReferenceNum >= sbi->s_partitions) {
@@ -1507,7 +1506,8 @@ reread:
udf_disk_stamp_to_time(&inode->i_atime, fe->accessTime);
udf_disk_stamp_to_time(&inode->i_mtime, fe->modificationTime);
- udf_disk_stamp_to_time(&inode->i_ctime, fe->attrTime);
+ udf_disk_stamp_to_time(&ctime, fe->attrTime);
+ inode_set_ctime_to_ts(inode, ctime);
iinfo->i_unique = le64_to_cpu(fe->uniqueID);
iinfo->i_lenEAttr = le32_to_cpu(fe->lengthExtendedAttr);
@@ -1522,7 +1522,8 @@ reread:
udf_disk_stamp_to_time(&inode->i_atime, efe->accessTime);
udf_disk_stamp_to_time(&inode->i_mtime, efe->modificationTime);
udf_disk_stamp_to_time(&iinfo->i_crtime, efe->createTime);
- udf_disk_stamp_to_time(&inode->i_ctime, efe->attrTime);
+ udf_disk_stamp_to_time(&ctime, efe->attrTime);
+ inode_set_ctime_to_ts(inode, ctime);
iinfo->i_unique = le64_to_cpu(efe->uniqueID);
iinfo->i_lenEAttr = le32_to_cpu(efe->lengthExtendedAttr);
@@ -1799,7 +1800,7 @@ static int udf_update_inode(struct inode *inode, int do_sync)
udf_time_to_disk_stamp(&fe->accessTime, inode->i_atime);
udf_time_to_disk_stamp(&fe->modificationTime, inode->i_mtime);
- udf_time_to_disk_stamp(&fe->attrTime, inode->i_ctime);
+ udf_time_to_disk_stamp(&fe->attrTime, inode_get_ctime(inode));
memset(&(fe->impIdent), 0, sizeof(struct regid));
strcpy(fe->impIdent.ident, UDF_ID_DEVELOPER);
fe->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX;
@@ -1830,12 +1831,12 @@ static int udf_update_inode(struct inode *inode, int do_sync)
udf_adjust_time(iinfo, inode->i_atime);
udf_adjust_time(iinfo, inode->i_mtime);
- udf_adjust_time(iinfo, inode->i_ctime);
+ udf_adjust_time(iinfo, inode_get_ctime(inode));
udf_time_to_disk_stamp(&efe->accessTime, inode->i_atime);
udf_time_to_disk_stamp(&efe->modificationTime, inode->i_mtime);
udf_time_to_disk_stamp(&efe->createTime, iinfo->i_crtime);
- udf_time_to_disk_stamp(&efe->attrTime, inode->i_ctime);
+ udf_time_to_disk_stamp(&efe->attrTime, inode_get_ctime(inode));
memset(&(efe->impIdent), 0, sizeof(efe->impIdent));
strcpy(efe->impIdent.ident, UDF_ID_DEVELOPER);
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index a95579b043ab..ae55ab8859b6 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -365,7 +365,7 @@ static int udf_add_nondir(struct dentry *dentry, struct inode *inode)
*(__le32 *)((struct allocDescImpUse *)iter.fi.icb.impUse)->impUse =
cpu_to_le32(iinfo->i_unique & 0x00000000FFFFFFFFUL);
udf_fiiter_write_fi(&iter, NULL);
- dir->i_ctime = dir->i_mtime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
mark_inode_dirty(dir);
udf_fiiter_release(&iter);
udf_add_fid_counter(dir->i_sb, false, 1);
@@ -471,7 +471,7 @@ static int udf_mkdir(struct mnt_idmap *idmap, struct inode *dir,
udf_fiiter_release(&iter);
udf_add_fid_counter(dir->i_sb, true, 1);
inc_nlink(dir);
- dir->i_ctime = dir->i_mtime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
mark_inode_dirty(dir);
d_instantiate_new(dentry, inode);
@@ -523,8 +523,8 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry)
inode->i_size = 0;
inode_dec_link_count(dir);
udf_add_fid_counter(dir->i_sb, true, -1);
- inode->i_ctime = dir->i_ctime = dir->i_mtime =
- current_time(inode);
+ dir->i_mtime = inode_set_ctime_to_ts(dir,
+ inode_set_ctime_current(inode));
mark_inode_dirty(dir);
ret = 0;
end_rmdir:
@@ -555,11 +555,11 @@ static int udf_unlink(struct inode *dir, struct dentry *dentry)
set_nlink(inode, 1);
}
udf_fiiter_delete_entry(&iter);
- dir->i_ctime = dir->i_mtime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
mark_inode_dirty(dir);
inode_dec_link_count(inode);
udf_add_fid_counter(dir->i_sb, false, -1);
- inode->i_ctime = dir->i_ctime;
+ inode_set_ctime_to_ts(inode, inode_get_ctime(dir));
ret = 0;
end_unlink:
udf_fiiter_release(&iter);
@@ -746,9 +746,9 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir,
inc_nlink(inode);
udf_add_fid_counter(dir->i_sb, false, 1);
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
mark_inode_dirty(inode);
- dir->i_ctime = dir->i_mtime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
mark_inode_dirty(dir);
ihold(inode);
d_instantiate(dentry, inode);
@@ -833,7 +833,7 @@ static int udf_rename(struct mnt_idmap *idmap, struct inode *old_dir,
* Like most other Unix systems, set the ctime for inodes on a
* rename.
*/
- old_inode->i_ctime = current_time(old_inode);
+ inode_set_ctime_current(old_inode);
mark_inode_dirty(old_inode);
/*
@@ -861,13 +861,13 @@ static int udf_rename(struct mnt_idmap *idmap, struct inode *old_dir,
}
if (new_inode) {
- new_inode->i_ctime = current_time(new_inode);
+ inode_set_ctime_current(new_inode);
inode_dec_link_count(new_inode);
udf_add_fid_counter(old_dir->i_sb, S_ISDIR(new_inode->i_mode),
-1);
}
- old_dir->i_ctime = old_dir->i_mtime = current_time(old_dir);
- new_dir->i_ctime = new_dir->i_mtime = current_time(new_dir);
+ old_dir->i_mtime = inode_set_ctime_current(old_dir);
+ new_dir->i_mtime = inode_set_ctime_current(new_dir);
mark_inode_dirty(old_dir);
mark_inode_dirty(new_dir);
diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c
index 779b5c2c75f6..f7eaf7b14594 100644
--- a/fs/udf/symlink.c
+++ b/fs/udf/symlink.c
@@ -149,7 +149,7 @@ static int udf_symlink_getattr(struct mnt_idmap *idmap,
struct inode *inode = d_backing_inode(dentry);
struct page *page;
- generic_fillattr(&nop_mnt_idmap, inode, stat);
+ generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
page = read_mapping_page(inode->i_mapping, 0, NULL);
if (IS_ERR(page))
return PTR_ERR(page);
diff --git a/fs/ufs/Kconfig b/fs/ufs/Kconfig
index 6d30adb6b890..9301e7ecd092 100644
--- a/fs/ufs/Kconfig
+++ b/fs/ufs/Kconfig
@@ -2,6 +2,7 @@
config UFS_FS
tristate "UFS file system support (read only)"
depends on BLOCK
+ select BUFFER_HEAD
help
BSD and derivate versions of Unix (such as SunOS, FreeBSD, NetBSD,
OpenBSD and NeXTstep) use a file system called UFS. Some System V
diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c
index 379d75796a5c..fd57f03b6c93 100644
--- a/fs/ufs/dir.c
+++ b/fs/ufs/dir.c
@@ -107,7 +107,7 @@ void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de,
ufs_commit_chunk(page, pos, len);
ufs_put_page(page);
if (update_times)
- dir->i_mtime = dir->i_ctime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
mark_inode_dirty(dir);
ufs_handle_dirsync(dir);
}
@@ -397,7 +397,7 @@ got_it:
ufs_set_de_type(sb, de, inode->i_mode);
ufs_commit_chunk(page, pos, rec_len);
- dir->i_mtime = dir->i_ctime = current_time(dir);
+ dir->i_mtime = inode_set_ctime_current(dir);
mark_inode_dirty(dir);
err = ufs_handle_dirsync(dir);
@@ -539,7 +539,7 @@ int ufs_delete_entry(struct inode *inode, struct ufs_dir_entry *dir,
pde->d_reclen = cpu_to_fs16(sb, to - from);
dir->d_ino = 0;
ufs_commit_chunk(page, pos, to - from);
- inode->i_ctime = inode->i_mtime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
mark_inode_dirty(inode);
err = ufs_handle_dirsync(inode);
out:
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index 06bd84d555bd..a1e7bd9d1f98 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -292,7 +292,7 @@ cg_found:
inode_init_owner(&nop_mnt_idmap, inode, dir, mode);
inode->i_blocks = 0;
inode->i_generation = 0;
- inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
ufsi->i_flags = UFS_I(dir)->i_flags;
ufsi->i_lastfrag = 0;
ufsi->i_shadow = 0;
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index a4246c83a8cd..21a4779a2de5 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -296,7 +296,7 @@ ufs_inode_getfrag(struct inode *inode, unsigned index,
if (new)
*new = 1;
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
if (IS_SYNC(inode))
ufs_sync_inode (inode);
mark_inode_dirty(inode);
@@ -378,7 +378,7 @@ ufs_inode_getblock(struct inode *inode, u64 ind_block,
mark_buffer_dirty(bh);
if (IS_SYNC(inode))
sync_dirty_buffer(bh);
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
mark_inode_dirty(inode);
out:
brelse (bh);
@@ -580,11 +580,12 @@ static int ufs1_read_inode(struct inode *inode, struct ufs_inode *ufs_inode)
inode->i_size = fs64_to_cpu(sb, ufs_inode->ui_size);
inode->i_atime.tv_sec = (signed)fs32_to_cpu(sb, ufs_inode->ui_atime.tv_sec);
- inode->i_ctime.tv_sec = (signed)fs32_to_cpu(sb, ufs_inode->ui_ctime.tv_sec);
+ inode_set_ctime(inode,
+ (signed)fs32_to_cpu(sb, ufs_inode->ui_ctime.tv_sec),
+ 0);
inode->i_mtime.tv_sec = (signed)fs32_to_cpu(sb, ufs_inode->ui_mtime.tv_sec);
inode->i_mtime.tv_nsec = 0;
inode->i_atime.tv_nsec = 0;
- inode->i_ctime.tv_nsec = 0;
inode->i_blocks = fs32_to_cpu(sb, ufs_inode->ui_blocks);
inode->i_generation = fs32_to_cpu(sb, ufs_inode->ui_gen);
ufsi->i_flags = fs32_to_cpu(sb, ufs_inode->ui_flags);
@@ -626,10 +627,10 @@ static int ufs2_read_inode(struct inode *inode, struct ufs2_inode *ufs2_inode)
inode->i_size = fs64_to_cpu(sb, ufs2_inode->ui_size);
inode->i_atime.tv_sec = fs64_to_cpu(sb, ufs2_inode->ui_atime);
- inode->i_ctime.tv_sec = fs64_to_cpu(sb, ufs2_inode->ui_ctime);
+ inode_set_ctime(inode, fs64_to_cpu(sb, ufs2_inode->ui_ctime),
+ fs32_to_cpu(sb, ufs2_inode->ui_ctimensec));
inode->i_mtime.tv_sec = fs64_to_cpu(sb, ufs2_inode->ui_mtime);
inode->i_atime.tv_nsec = fs32_to_cpu(sb, ufs2_inode->ui_atimensec);
- inode->i_ctime.tv_nsec = fs32_to_cpu(sb, ufs2_inode->ui_ctimensec);
inode->i_mtime.tv_nsec = fs32_to_cpu(sb, ufs2_inode->ui_mtimensec);
inode->i_blocks = fs64_to_cpu(sb, ufs2_inode->ui_blocks);
inode->i_generation = fs32_to_cpu(sb, ufs2_inode->ui_gen);
@@ -726,7 +727,8 @@ static void ufs1_update_inode(struct inode *inode, struct ufs_inode *ufs_inode)
ufs_inode->ui_size = cpu_to_fs64(sb, inode->i_size);
ufs_inode->ui_atime.tv_sec = cpu_to_fs32(sb, inode->i_atime.tv_sec);
ufs_inode->ui_atime.tv_usec = 0;
- ufs_inode->ui_ctime.tv_sec = cpu_to_fs32(sb, inode->i_ctime.tv_sec);
+ ufs_inode->ui_ctime.tv_sec = cpu_to_fs32(sb,
+ inode_get_ctime(inode).tv_sec);
ufs_inode->ui_ctime.tv_usec = 0;
ufs_inode->ui_mtime.tv_sec = cpu_to_fs32(sb, inode->i_mtime.tv_sec);
ufs_inode->ui_mtime.tv_usec = 0;
@@ -770,8 +772,9 @@ static void ufs2_update_inode(struct inode *inode, struct ufs2_inode *ufs_inode)
ufs_inode->ui_size = cpu_to_fs64(sb, inode->i_size);
ufs_inode->ui_atime = cpu_to_fs64(sb, inode->i_atime.tv_sec);
ufs_inode->ui_atimensec = cpu_to_fs32(sb, inode->i_atime.tv_nsec);
- ufs_inode->ui_ctime = cpu_to_fs64(sb, inode->i_ctime.tv_sec);
- ufs_inode->ui_ctimensec = cpu_to_fs32(sb, inode->i_ctime.tv_nsec);
+ ufs_inode->ui_ctime = cpu_to_fs64(sb, inode_get_ctime(inode).tv_sec);
+ ufs_inode->ui_ctimensec = cpu_to_fs32(sb,
+ inode_get_ctime(inode).tv_nsec);
ufs_inode->ui_mtime = cpu_to_fs64(sb, inode->i_mtime.tv_sec);
ufs_inode->ui_mtimensec = cpu_to_fs32(sb, inode->i_mtime.tv_nsec);
@@ -1205,7 +1208,7 @@ static int ufs_truncate(struct inode *inode, loff_t size)
truncate_setsize(inode, size);
ufs_truncate_blocks(inode);
- inode->i_mtime = inode->i_ctime = current_time(inode);
+ inode->i_mtime = inode_set_ctime_current(inode);
mark_inode_dirty(inode);
out:
UFSD("EXIT: err %d\n", err);
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index 36154b5aca6d..9cad29463791 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -153,7 +153,7 @@ static int ufs_link (struct dentry * old_dentry, struct inode * dir,
struct inode *inode = d_inode(old_dentry);
int error;
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
inode_inc_link_count(inode);
ihold(inode);
@@ -220,7 +220,7 @@ static int ufs_unlink(struct inode *dir, struct dentry *dentry)
if (err)
goto out;
- inode->i_ctime = dir->i_ctime;
+ inode_set_ctime_to_ts(inode, inode_get_ctime(dir));
inode_dec_link_count(inode);
err = 0;
out:
@@ -282,7 +282,7 @@ static int ufs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
if (!new_de)
goto out_dir;
ufs_set_link(new_dir, new_de, new_page, old_inode, 1);
- new_inode->i_ctime = current_time(new_inode);
+ inode_set_ctime_current(new_inode);
if (dir_de)
drop_nlink(new_inode);
inode_dec_link_count(new_inode);
@@ -298,7 +298,7 @@ static int ufs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
* Like most other Unix systems, set the ctime for inodes on a
* rename.
*/
- old_inode->i_ctime = current_time(old_inode);
+ inode_set_ctime_current(old_inode);
ufs_delete_entry(old_dir, old_de, old_page);
mark_inode_dirty(old_inode);
diff --git a/fs/ufs/util.h b/fs/ufs/util.h
index 4931bec1a01c..89247193d96d 100644
--- a/fs/ufs/util.h
+++ b/fs/ufs/util.h
@@ -11,12 +11,6 @@
#include <linux/fs.h>
#include "swab.h"
-
-/*
- * some useful macros
- */
-#define in_range(b,first,len) ((b)>=(first)&&(b)<(first)+(len))
-
/*
* functions used for retyping
*/
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 7cecd49e078b..56eaae9dac1a 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -277,17 +277,16 @@ static inline struct uffd_msg userfault_msg(unsigned long address,
* hugepmd ranges.
*/
static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
- struct vm_area_struct *vma,
- unsigned long address,
- unsigned long flags,
- unsigned long reason)
+ struct vm_fault *vmf,
+ unsigned long reason)
{
+ struct vm_area_struct *vma = vmf->vma;
pte_t *ptep, pte;
bool ret = true;
- mmap_assert_locked(ctx->mm);
+ assert_fault_locked(vmf);
- ptep = hugetlb_walk(vma, address, vma_mmu_pagesize(vma));
+ ptep = hugetlb_walk(vma, vmf->address, vma_mmu_pagesize(vma));
if (!ptep)
goto out;
@@ -308,10 +307,8 @@ out:
}
#else
static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
- struct vm_area_struct *vma,
- unsigned long address,
- unsigned long flags,
- unsigned long reason)
+ struct vm_fault *vmf,
+ unsigned long reason)
{
return false; /* should never get here */
}
@@ -325,11 +322,11 @@ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
* threads.
*/
static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
- unsigned long address,
- unsigned long flags,
+ struct vm_fault *vmf,
unsigned long reason)
{
struct mm_struct *mm = ctx->mm;
+ unsigned long address = vmf->address;
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
@@ -338,7 +335,7 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
pte_t ptent;
bool ret = true;
- mmap_assert_locked(mm);
+ assert_fault_locked(vmf);
pgd = pgd_offset(mm, address);
if (!pgd_present(*pgd))
@@ -427,20 +424,16 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
*
* We also don't do userfault handling during
* coredumping. hugetlbfs has the special
- * follow_hugetlb_page() to skip missing pages in the
+ * hugetlb_follow_page_mask() to skip missing pages in the
* FOLL_DUMP case, anon memory also checks for FOLL_DUMP with
* the no_page_table() helper in follow_page_mask(), but the
* shmem_vm_ops->fault method is invoked even during
- * coredumping without mmap_lock and it ends up here.
+ * coredumping and it ends up here.
*/
if (current->flags & (PF_EXITING|PF_DUMPCORE))
goto out;
- /*
- * Coredumping runs without mmap_lock so we can only check that
- * the mmap_lock is held, if PF_DUMPCORE was not set.
- */
- mmap_assert_locked(mm);
+ assert_fault_locked(vmf);
ctx = vma->vm_userfaultfd_ctx.ctx;
if (!ctx)
@@ -556,15 +549,12 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
spin_unlock_irq(&ctx->fault_pending_wqh.lock);
if (!is_vm_hugetlb_page(vma))
- must_wait = userfaultfd_must_wait(ctx, vmf->address, vmf->flags,
- reason);
+ must_wait = userfaultfd_must_wait(ctx, vmf, reason);
else
- must_wait = userfaultfd_huge_must_wait(ctx, vma,
- vmf->address,
- vmf->flags, reason);
+ must_wait = userfaultfd_huge_must_wait(ctx, vmf, reason);
if (is_vm_hugetlb_page(vma))
hugetlb_vma_unlock_read(vma);
- mmap_read_unlock(mm);
+ release_fault_lock(vmf);
if (likely(must_wait && !READ_ONCE(ctx->released))) {
wake_up_poll(&ctx->fd_wqh, EPOLLIN);
@@ -667,6 +657,7 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
mmap_write_lock(mm);
for_each_vma(vmi, vma) {
if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) {
+ vma_start_write(vma);
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
userfaultfd_set_vm_flags(vma,
vma->vm_flags & ~__VM_UFFD_FLAGS);
@@ -702,6 +693,7 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
octx = vma->vm_userfaultfd_ctx.ctx;
if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) {
+ vma_start_write(vma);
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS);
return 0;
@@ -783,6 +775,7 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma,
atomic_inc(&ctx->mmap_changing);
} else {
/* Drop uffd context if remap feature not enabled */
+ vma_start_write(vma);
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS);
}
@@ -940,6 +933,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
prev = vma;
}
+ vma_start_write(vma);
userfaultfd_set_vm_flags(vma, new_flags);
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
}
@@ -1289,13 +1283,11 @@ static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx,
__wake_userfault(ctx, range);
}
-static __always_inline int validate_range(struct mm_struct *mm,
- __u64 start, __u64 len)
+static __always_inline int validate_unaligned_range(
+ struct mm_struct *mm, __u64 start, __u64 len)
{
__u64 task_size = mm->task_size;
- if (start & ~PAGE_MASK)
- return -EINVAL;
if (len & ~PAGE_MASK)
return -EINVAL;
if (!len)
@@ -1306,9 +1298,20 @@ static __always_inline int validate_range(struct mm_struct *mm,
return -EINVAL;
if (len > task_size - start)
return -EINVAL;
+ if (start + len <= start)
+ return -EINVAL;
return 0;
}
+static __always_inline int validate_range(struct mm_struct *mm,
+ __u64 start, __u64 len)
+{
+ if (start & ~PAGE_MASK)
+ return -EINVAL;
+
+ return validate_unaligned_range(mm, start, len);
+}
+
static int userfaultfd_register(struct userfaultfd_ctx *ctx,
unsigned long arg)
{
@@ -1502,6 +1505,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
* the next vma was merged into the current one and
* the current one has not been updated yet.
*/
+ vma_start_write(vma);
userfaultfd_set_vm_flags(vma, new_flags);
vma->vm_userfaultfd_ctx.ctx = ctx;
@@ -1685,6 +1689,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
* the next vma was merged into the current one and
* the current one has not been updated yet.
*/
+ vma_start_write(vma);
userfaultfd_set_vm_flags(vma, new_flags);
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
@@ -1757,17 +1762,15 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
sizeof(uffdio_copy)-sizeof(__s64)))
goto out;
+ ret = validate_unaligned_range(ctx->mm, uffdio_copy.src,
+ uffdio_copy.len);
+ if (ret)
+ goto out;
ret = validate_range(ctx->mm, uffdio_copy.dst, uffdio_copy.len);
if (ret)
goto out;
- /*
- * double check for wraparound just in case. copy_from_user()
- * will later check uffdio_copy.src + uffdio_copy.len to fit
- * in the userland range.
- */
+
ret = -EINVAL;
- if (uffdio_copy.src + uffdio_copy.len <= uffdio_copy.src)
- goto out;
if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE|UFFDIO_COPY_MODE_WP))
goto out;
if (uffdio_copy.mode & UFFDIO_COPY_MODE_WP)
@@ -1927,11 +1930,6 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
goto out;
ret = -EINVAL;
- /* double check for wraparound just in case. */
- if (uffdio_continue.range.start + uffdio_continue.range.len <=
- uffdio_continue.range.start) {
- goto out;
- }
if (uffdio_continue.mode & ~(UFFDIO_CONTINUE_MODE_DONTWAKE |
UFFDIO_CONTINUE_MODE_WP))
goto out;
@@ -1965,6 +1963,61 @@ out:
return ret;
}
+static inline int userfaultfd_poison(struct userfaultfd_ctx *ctx, unsigned long arg)
+{
+ __s64 ret;
+ struct uffdio_poison uffdio_poison;
+ struct uffdio_poison __user *user_uffdio_poison;
+ struct userfaultfd_wake_range range;
+
+ user_uffdio_poison = (struct uffdio_poison __user *)arg;
+
+ ret = -EAGAIN;
+ if (atomic_read(&ctx->mmap_changing))
+ goto out;
+
+ ret = -EFAULT;
+ if (copy_from_user(&uffdio_poison, user_uffdio_poison,
+ /* don't copy the output fields */
+ sizeof(uffdio_poison) - (sizeof(__s64))))
+ goto out;
+
+ ret = validate_range(ctx->mm, uffdio_poison.range.start,
+ uffdio_poison.range.len);
+ if (ret)
+ goto out;
+
+ ret = -EINVAL;
+ if (uffdio_poison.mode & ~UFFDIO_POISON_MODE_DONTWAKE)
+ goto out;
+
+ if (mmget_not_zero(ctx->mm)) {
+ ret = mfill_atomic_poison(ctx->mm, uffdio_poison.range.start,
+ uffdio_poison.range.len,
+ &ctx->mmap_changing, 0);
+ mmput(ctx->mm);
+ } else {
+ return -ESRCH;
+ }
+
+ if (unlikely(put_user(ret, &user_uffdio_poison->updated)))
+ return -EFAULT;
+ if (ret < 0)
+ goto out;
+
+ /* len == 0 would wake all */
+ BUG_ON(!ret);
+ range.len = ret;
+ if (!(uffdio_poison.mode & UFFDIO_POISON_MODE_DONTWAKE)) {
+ range.start = uffdio_poison.range.start;
+ wake_userfault(ctx, &range);
+ }
+ ret = range.len == uffdio_poison.range.len ? 0 : -EAGAIN;
+
+out:
+ return ret;
+}
+
static inline unsigned int uffd_ctx_features(__u64 user_features)
{
/*
@@ -2066,6 +2119,9 @@ static long userfaultfd_ioctl(struct file *file, unsigned cmd,
case UFFDIO_CONTINUE:
ret = userfaultfd_continue(ctx, arg);
break;
+ case UFFDIO_POISON:
+ ret = userfaultfd_poison(ctx, arg);
+ break;
}
return ret;
}
diff --git a/fs/vboxsf/utils.c b/fs/vboxsf/utils.c
index dd0ae1188e87..83f20dd15522 100644
--- a/fs/vboxsf/utils.c
+++ b/fs/vboxsf/utils.c
@@ -128,8 +128,8 @@ int vboxsf_init_inode(struct vboxsf_sbi *sbi, struct inode *inode,
inode->i_atime = ns_to_timespec64(
info->access_time.ns_relative_to_unix_epoch);
- inode->i_ctime = ns_to_timespec64(
- info->change_time.ns_relative_to_unix_epoch);
+ inode_set_ctime_to_ts(inode,
+ ns_to_timespec64(info->change_time.ns_relative_to_unix_epoch));
inode->i_mtime = ns_to_timespec64(
info->modification_time.ns_relative_to_unix_epoch);
return 0;
@@ -252,7 +252,7 @@ int vboxsf_getattr(struct mnt_idmap *idmap, const struct path *path,
if (err)
return err;
- generic_fillattr(&nop_mnt_idmap, d_inode(dentry), kstat);
+ generic_fillattr(&nop_mnt_idmap, request_mask, d_inode(dentry), kstat);
return 0;
}
diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h
index 49bf3a1eb2a0..d071a6e32581 100644
--- a/fs/verity/fsverity_private.h
+++ b/fs/verity/fsverity_private.h
@@ -118,16 +118,16 @@ void fsverity_free_info(struct fsverity_info *vi);
int fsverity_get_descriptor(struct inode *inode,
struct fsverity_descriptor **desc_ret);
-int __init fsverity_init_info_cache(void);
-void __init fsverity_exit_info_cache(void);
+void __init fsverity_init_info_cache(void);
/* signature.c */
#ifdef CONFIG_FS_VERITY_BUILTIN_SIGNATURES
+extern int fsverity_require_signatures;
int fsverity_verify_signature(const struct fsverity_info *vi,
const u8 *signature, size_t sig_size);
-int __init fsverity_init_signature(void);
+void __init fsverity_init_signature(void);
#else /* !CONFIG_FS_VERITY_BUILTIN_SIGNATURES */
static inline int
fsverity_verify_signature(const struct fsverity_info *vi,
@@ -136,15 +136,13 @@ fsverity_verify_signature(const struct fsverity_info *vi,
return 0;
}
-static inline int fsverity_init_signature(void)
+static inline void fsverity_init_signature(void)
{
- return 0;
}
#endif /* !CONFIG_FS_VERITY_BUILTIN_SIGNATURES */
/* verify.c */
-int __init fsverity_init_workqueue(void);
-void __init fsverity_exit_workqueue(void);
+void __init fsverity_init_workqueue(void);
#endif /* _FSVERITY_PRIVATE_H */
diff --git a/fs/verity/hash_algs.c b/fs/verity/hash_algs.c
index c598d2035476..6b08b1d9a7d7 100644
--- a/fs/verity/hash_algs.c
+++ b/fs/verity/hash_algs.c
@@ -226,6 +226,14 @@ void __init fsverity_check_hash_algs(void)
if (!alg->name)
continue;
+ /*
+ * 0 must never be allocated as an FS_VERITY_HASH_ALG_* value,
+ * as it is reserved for users that use 0 to mean unspecified or
+ * a default value. fs/verity/ itself doesn't care and doesn't
+ * have a default algorithm, but some users make use of this.
+ */
+ BUG_ON(i == 0);
+
BUG_ON(alg->digest_size > FS_VERITY_MAX_DIGEST_SIZE);
/*
diff --git a/fs/verity/init.c b/fs/verity/init.c
index 023905151035..a29f062f6047 100644
--- a/fs/verity/init.c
+++ b/fs/verity/init.c
@@ -9,6 +9,37 @@
#include <linux/ratelimit.h>
+#ifdef CONFIG_SYSCTL
+static struct ctl_table_header *fsverity_sysctl_header;
+
+static struct ctl_table fsverity_sysctl_table[] = {
+#ifdef CONFIG_FS_VERITY_BUILTIN_SIGNATURES
+ {
+ .procname = "require_signatures",
+ .data = &fsverity_require_signatures,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+#endif
+ { }
+};
+
+static void __init fsverity_init_sysctl(void)
+{
+ fsverity_sysctl_header = register_sysctl("fs/verity",
+ fsverity_sysctl_table);
+ if (!fsverity_sysctl_header)
+ panic("fsverity sysctl registration failed");
+}
+#else /* CONFIG_SYSCTL */
+static inline void fsverity_init_sysctl(void)
+{
+}
+#endif /* !CONFIG_SYSCTL */
+
void fsverity_msg(const struct inode *inode, const char *level,
const char *fmt, ...)
{
@@ -33,28 +64,11 @@ void fsverity_msg(const struct inode *inode, const char *level,
static int __init fsverity_init(void)
{
- int err;
-
fsverity_check_hash_algs();
-
- err = fsverity_init_info_cache();
- if (err)
- return err;
-
- err = fsverity_init_workqueue();
- if (err)
- goto err_exit_info_cache;
-
- err = fsverity_init_signature();
- if (err)
- goto err_exit_workqueue;
-
+ fsverity_init_info_cache();
+ fsverity_init_workqueue();
+ fsverity_init_sysctl();
+ fsverity_init_signature();
return 0;
-
-err_exit_workqueue:
- fsverity_exit_workqueue();
-err_exit_info_cache:
- fsverity_exit_info_cache();
- return err;
}
late_initcall(fsverity_init)
diff --git a/fs/verity/open.c b/fs/verity/open.c
index 1db5106a9c38..6c31a871b84b 100644
--- a/fs/verity/open.c
+++ b/fs/verity/open.c
@@ -408,18 +408,10 @@ void __fsverity_cleanup_inode(struct inode *inode)
}
EXPORT_SYMBOL_GPL(__fsverity_cleanup_inode);
-int __init fsverity_init_info_cache(void)
+void __init fsverity_init_info_cache(void)
{
- fsverity_info_cachep = KMEM_CACHE_USERCOPY(fsverity_info,
- SLAB_RECLAIM_ACCOUNT,
- file_digest);
- if (!fsverity_info_cachep)
- return -ENOMEM;
- return 0;
-}
-
-void __init fsverity_exit_info_cache(void)
-{
- kmem_cache_destroy(fsverity_info_cachep);
- fsverity_info_cachep = NULL;
+ fsverity_info_cachep = KMEM_CACHE_USERCOPY(
+ fsverity_info,
+ SLAB_RECLAIM_ACCOUNT | SLAB_PANIC,
+ file_digest);
}
diff --git a/fs/verity/signature.c b/fs/verity/signature.c
index 72034bc71c9d..90c07573dd77 100644
--- a/fs/verity/signature.c
+++ b/fs/verity/signature.c
@@ -24,7 +24,7 @@
* /proc/sys/fs/verity/require_signatures
* If 1, all verity files must have a valid builtin signature.
*/
-static int fsverity_require_signatures;
+int fsverity_require_signatures;
/*
* Keyring that contains the trusted X.509 certificates.
@@ -62,6 +62,22 @@ int fsverity_verify_signature(const struct fsverity_info *vi,
return 0;
}
+ if (fsverity_keyring->keys.nr_leaves_on_tree == 0) {
+ /*
+ * The ".fs-verity" keyring is empty, due to builtin signatures
+ * being supported by the kernel but not actually being used.
+ * In this case, verify_pkcs7_signature() would always return an
+ * error, usually ENOKEY. It could also be EBADMSG if the
+ * PKCS#7 is malformed, but that isn't very important to
+ * distinguish. So, just skip to ENOKEY to avoid the attack
+ * surface of the PKCS#7 parser, which would otherwise be
+ * reachable by any task able to execute FS_IOC_ENABLE_VERITY.
+ */
+ fsverity_err(inode,
+ "fs-verity keyring is empty, rejecting signed file!");
+ return -ENOKEY;
+ }
+
d = kzalloc(sizeof(*d) + hash_alg->digest_size, GFP_KERNEL);
if (!d)
return -ENOMEM;
@@ -93,59 +109,14 @@ int fsverity_verify_signature(const struct fsverity_info *vi,
return 0;
}
-#ifdef CONFIG_SYSCTL
-static struct ctl_table_header *fsverity_sysctl_header;
-
-static struct ctl_table fsverity_sysctl_table[] = {
- {
- .procname = "require_signatures",
- .data = &fsverity_require_signatures,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
- },
- { }
-};
-
-static int __init fsverity_sysctl_init(void)
-{
- fsverity_sysctl_header = register_sysctl("fs/verity", fsverity_sysctl_table);
- if (!fsverity_sysctl_header) {
- pr_err("sysctl registration failed!\n");
- return -ENOMEM;
- }
- return 0;
-}
-#else /* !CONFIG_SYSCTL */
-static inline int __init fsverity_sysctl_init(void)
+void __init fsverity_init_signature(void)
{
- return 0;
-}
-#endif /* !CONFIG_SYSCTL */
-
-int __init fsverity_init_signature(void)
-{
- struct key *ring;
- int err;
-
- ring = keyring_alloc(".fs-verity", KUIDT_INIT(0), KGIDT_INIT(0),
- current_cred(), KEY_POS_SEARCH |
+ fsverity_keyring =
+ keyring_alloc(".fs-verity", KUIDT_INIT(0), KGIDT_INIT(0),
+ current_cred(), KEY_POS_SEARCH |
KEY_USR_VIEW | KEY_USR_READ | KEY_USR_WRITE |
KEY_USR_SEARCH | KEY_USR_SETATTR,
- KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL);
- if (IS_ERR(ring))
- return PTR_ERR(ring);
-
- err = fsverity_sysctl_init();
- if (err)
- goto err_put_ring;
-
- fsverity_keyring = ring;
- return 0;
-
-err_put_ring:
- key_put(ring);
- return err;
+ KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL);
+ if (IS_ERR(fsverity_keyring))
+ panic("failed to allocate \".fs-verity\" keyring");
}
diff --git a/fs/verity/verify.c b/fs/verity/verify.c
index 433cef51f5f6..904ccd7e8e16 100644
--- a/fs/verity/verify.c
+++ b/fs/verity/verify.c
@@ -346,7 +346,7 @@ void fsverity_enqueue_verify_work(struct work_struct *work)
}
EXPORT_SYMBOL_GPL(fsverity_enqueue_verify_work);
-int __init fsverity_init_workqueue(void)
+void __init fsverity_init_workqueue(void)
{
/*
* Use a high-priority workqueue to prioritize verification work, which
@@ -360,12 +360,5 @@ int __init fsverity_init_workqueue(void)
WQ_HIGHPRI,
num_online_cpus());
if (!fsverity_read_workqueue)
- return -ENOMEM;
- return 0;
-}
-
-void __init fsverity_exit_workqueue(void)
-{
- destroy_workqueue(fsverity_read_workqueue);
- fsverity_read_workqueue = NULL;
+ panic("failed to allocate fsverity_read_queue");
}
diff --git a/fs/xattr.c b/fs/xattr.c
index e7bbb7f57557..efd4736bc94b 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -1040,12 +1040,32 @@ const char *xattr_full_name(const struct xattr_handler *handler,
EXPORT_SYMBOL(xattr_full_name);
/**
- * free_simple_xattr - free an xattr object
+ * simple_xattr_space - estimate the memory used by a simple xattr
+ * @name: the full name of the xattr
+ * @size: the size of its value
+ *
+ * This takes no account of how much larger the two slab objects actually are:
+ * that would depend on the slab implementation, when what is required is a
+ * deterministic number, which grows with name length and size and quantity.
+ *
+ * Return: The approximate number of bytes of memory used by such an xattr.
+ */
+size_t simple_xattr_space(const char *name, size_t size)
+{
+ /*
+ * Use "40" instead of sizeof(struct simple_xattr), to return the
+ * same result on 32-bit and 64-bit, and even if simple_xattr grows.
+ */
+ return 40 + size + strlen(name);
+}
+
+/**
+ * simple_xattr_free - free an xattr object
* @xattr: the xattr object
*
* Free the xattr object. Can handle @xattr being NULL.
*/
-static inline void free_simple_xattr(struct simple_xattr *xattr)
+void simple_xattr_free(struct simple_xattr *xattr)
{
if (xattr)
kfree(xattr->name);
@@ -1073,7 +1093,7 @@ struct simple_xattr *simple_xattr_alloc(const void *value, size_t size)
if (len < sizeof(*new_xattr))
return NULL;
- new_xattr = kvmalloc(len, GFP_KERNEL);
+ new_xattr = kvmalloc(len, GFP_KERNEL_ACCOUNT);
if (!new_xattr)
return NULL;
@@ -1164,7 +1184,6 @@ int simple_xattr_get(struct simple_xattrs *xattrs, const char *name,
* @value: the value to store along the xattr
* @size: the size of @value
* @flags: the flags determining how to set the xattr
- * @removed_size: the size of the removed xattr
*
* Set a new xattr object.
* If @value is passed a new xattr object will be allocated. If XATTR_REPLACE
@@ -1181,29 +1200,27 @@ int simple_xattr_get(struct simple_xattrs *xattrs, const char *name,
* nothing if XATTR_CREATE is specified in @flags or @flags is zero. For
* XATTR_REPLACE we fail as mentioned above.
*
- * Return: On success zero and on error a negative error code is returned.
+ * Return: On success, the removed or replaced xattr is returned, to be freed
+ * by the caller; or NULL if none. On failure a negative error code is returned.
*/
-int simple_xattr_set(struct simple_xattrs *xattrs, const char *name,
- const void *value, size_t size, int flags,
- ssize_t *removed_size)
+struct simple_xattr *simple_xattr_set(struct simple_xattrs *xattrs,
+ const char *name, const void *value,
+ size_t size, int flags)
{
- struct simple_xattr *xattr = NULL, *new_xattr = NULL;
+ struct simple_xattr *old_xattr = NULL, *new_xattr = NULL;
struct rb_node *parent = NULL, **rbp;
int err = 0, ret;
- if (removed_size)
- *removed_size = -1;
-
/* value == NULL means remove */
if (value) {
new_xattr = simple_xattr_alloc(value, size);
if (!new_xattr)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
- new_xattr->name = kstrdup(name, GFP_KERNEL);
+ new_xattr->name = kstrdup(name, GFP_KERNEL_ACCOUNT);
if (!new_xattr->name) {
- free_simple_xattr(new_xattr);
- return -ENOMEM;
+ simple_xattr_free(new_xattr);
+ return ERR_PTR(-ENOMEM);
}
}
@@ -1217,12 +1234,12 @@ int simple_xattr_set(struct simple_xattrs *xattrs, const char *name,
else if (ret > 0)
rbp = &(*rbp)->rb_right;
else
- xattr = rb_entry(*rbp, struct simple_xattr, rb_node);
- if (xattr)
+ old_xattr = rb_entry(*rbp, struct simple_xattr, rb_node);
+ if (old_xattr)
break;
}
- if (xattr) {
+ if (old_xattr) {
/* Fail if XATTR_CREATE is requested and the xattr exists. */
if (flags & XATTR_CREATE) {
err = -EEXIST;
@@ -1230,12 +1247,10 @@ int simple_xattr_set(struct simple_xattrs *xattrs, const char *name,
}
if (new_xattr)
- rb_replace_node(&xattr->rb_node, &new_xattr->rb_node,
- &xattrs->rb_root);
+ rb_replace_node(&old_xattr->rb_node,
+ &new_xattr->rb_node, &xattrs->rb_root);
else
- rb_erase(&xattr->rb_node, &xattrs->rb_root);
- if (!err && removed_size)
- *removed_size = xattr->size;
+ rb_erase(&old_xattr->rb_node, &xattrs->rb_root);
} else {
/* Fail if XATTR_REPLACE is requested but no xattr is found. */
if (flags & XATTR_REPLACE) {
@@ -1260,12 +1275,10 @@ int simple_xattr_set(struct simple_xattrs *xattrs, const char *name,
out_unlock:
write_unlock(&xattrs->lock);
- if (err)
- free_simple_xattr(new_xattr);
- else
- free_simple_xattr(xattr);
- return err;
-
+ if (!err)
+ return old_xattr;
+ simple_xattr_free(new_xattr);
+ return ERR_PTR(err);
}
static bool xattr_is_trusted(const char *name)
@@ -1370,14 +1383,17 @@ void simple_xattrs_init(struct simple_xattrs *xattrs)
/**
* simple_xattrs_free - free xattrs
* @xattrs: xattr header whose xattrs to destroy
+ * @freed_space: approximate number of bytes of memory freed from @xattrs
*
* Destroy all xattrs in @xattr. When this is called no one can hold a
* reference to any of the xattrs anymore.
*/
-void simple_xattrs_free(struct simple_xattrs *xattrs)
+void simple_xattrs_free(struct simple_xattrs *xattrs, size_t *freed_space)
{
struct rb_node *rbp;
+ if (freed_space)
+ *freed_space = 0;
rbp = rb_first(&xattrs->rb_root);
while (rbp) {
struct simple_xattr *xattr;
@@ -1386,7 +1402,10 @@ void simple_xattrs_free(struct simple_xattrs *xattrs)
rbp_next = rb_next(rbp);
xattr = rb_entry(rbp, struct simple_xattr, rb_node);
rb_erase(&xattr->rb_node, &xattrs->rb_root);
- free_simple_xattr(xattr);
+ if (freed_space)
+ *freed_space += simple_xattr_space(xattr->name,
+ xattr->size);
+ simple_xattr_free(xattr);
rbp = rbp_next;
}
}
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index 52e1823241fb..ed0bc8cbc703 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -128,6 +128,7 @@ config XFS_ONLINE_SCRUB
bool "XFS online metadata check support"
default n
depends on XFS_FS
+ depends on TMPFS && SHMEM
select XFS_DRAIN_INTENTS
help
If you say Y here you will be able to check metadata on a
@@ -142,6 +143,23 @@ config XFS_ONLINE_SCRUB
If unsure, say N.
+config XFS_ONLINE_SCRUB_STATS
+ bool "XFS online metadata check usage data collection"
+ default y
+ depends on XFS_ONLINE_SCRUB
+ select XFS_DEBUG
+ help
+ If you say Y here, the kernel will gather usage data about
+ the online metadata check subsystem. This includes the number
+ of invocations, the outcomes, and the results of repairs, if any.
+ This may slow down scrub slightly due to the use of high precision
+ timers and the need to merge per-invocation information into the
+ filesystem counters.
+
+ Usage data are collected in /sys/kernel/debug/xfs/scrub.
+
+ If unsure, say N.
+
config XFS_ONLINE_REPAIR
bool "XFS online metadata repair support"
default n
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 16e4eb431230..7762c01a85cf 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -164,15 +164,24 @@ xfs-y += $(addprefix scrub/, \
rmap.o \
scrub.o \
symlink.o \
+ xfarray.o \
+ xfile.o \
+ )
+
+xfs-$(CONFIG_XFS_ONLINE_SCRUB_STATS) += scrub/stats.o
+
+xfs-$(CONFIG_XFS_RT) += $(addprefix scrub/, \
+ rtbitmap.o \
+ rtsummary.o \
)
-xfs-$(CONFIG_XFS_RT) += scrub/rtbitmap.o
xfs-$(CONFIG_XFS_QUOTA) += scrub/quota.o
# online repair
ifeq ($(CONFIG_XFS_ONLINE_REPAIR),y)
xfs-y += $(addprefix scrub/, \
agheader_repair.o \
+ reap.o \
repair.o \
)
endif
diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
index e9cc481b4ddf..f9f4d694640d 100644
--- a/fs/xfs/libxfs/xfs_ag.c
+++ b/fs/xfs/libxfs/xfs_ag.c
@@ -1001,6 +1001,12 @@ xfs_ag_shrink_space(
error = -ENOSPC;
goto resv_init_out;
}
+
+ /* Update perag geometry */
+ pag->block_count -= delta;
+ __xfs_agino_range(pag->pag_mount, pag->block_count, &pag->agino_min,
+ &pag->agino_max);
+
xfs_ialloc_log_agi(*tpp, agibp, XFS_AGI_LENGTH);
xfs_alloc_log_agf(*tpp, agfbp, XFS_AGF_LENGTH);
return 0;
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 2cbf9ea39b8c..6360073865db 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -743,7 +743,11 @@ struct xfs_scrub_metadata {
*/
#define XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED (1u << 7)
-#define XFS_SCRUB_FLAGS_IN (XFS_SCRUB_IFLAG_REPAIR)
+/* i: Rebuild the data structure. */
+#define XFS_SCRUB_IFLAG_FORCE_REBUILD (1u << 8)
+
+#define XFS_SCRUB_FLAGS_IN (XFS_SCRUB_IFLAG_REPAIR | \
+ XFS_SCRUB_IFLAG_FORCE_REBUILD)
#define XFS_SCRUB_FLAGS_OUT (XFS_SCRUB_OFLAG_CORRUPT | \
XFS_SCRUB_OFLAG_PREEN | \
XFS_SCRUB_OFLAG_XFAIL | \
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index 758aacd8166b..a35781577cad 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -222,7 +222,8 @@ xfs_inode_from_disk(
*/
inode->i_atime = xfs_inode_from_disk_ts(from, from->di_atime);
inode->i_mtime = xfs_inode_from_disk_ts(from, from->di_mtime);
- inode->i_ctime = xfs_inode_from_disk_ts(from, from->di_ctime);
+ inode_set_ctime_to_ts(inode,
+ xfs_inode_from_disk_ts(from, from->di_ctime));
ip->i_disk_size = be64_to_cpu(from->di_size);
ip->i_nblocks = be64_to_cpu(from->di_nblocks);
@@ -316,7 +317,7 @@ xfs_inode_to_disk(
to->di_atime = xfs_inode_to_disk_ts(ip, inode->i_atime);
to->di_mtime = xfs_inode_to_disk_ts(ip, inode->i_mtime);
- to->di_ctime = xfs_inode_to_disk_ts(ip, inode->i_ctime);
+ to->di_ctime = xfs_inode_to_disk_ts(ip, inode_get_ctime(inode));
to->di_nlink = cpu_to_be32(inode->i_nlink);
to->di_gen = cpu_to_be32(inode->i_generation);
to->di_mode = cpu_to_be16(inode->i_mode);
diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h
index 2420865f3007..a5100a11faf9 100644
--- a/fs/xfs/libxfs/xfs_log_recover.h
+++ b/fs/xfs/libxfs/xfs_log_recover.h
@@ -131,4 +131,26 @@ void xlog_check_buf_cancel_table(struct xlog *log);
#define xlog_check_buf_cancel_table(log) do { } while (0)
#endif
+/*
+ * Transform a regular reservation into one suitable for recovery of a log
+ * intent item.
+ *
+ * Intent recovery only runs a single step of the transaction chain and defers
+ * the rest to a separate transaction. Therefore, we reduce logcount to 1 here
+ * to avoid livelocks if the log grant space is nearly exhausted due to the
+ * recovered intent pinning the tail. Keep the same logflags to avoid tripping
+ * asserts elsewhere. Struct copies abound below.
+ */
+static inline struct xfs_trans_res
+xlog_recover_resv(const struct xfs_trans_res *r)
+{
+ struct xfs_trans_res ret = {
+ .tr_logres = r->tr_logres,
+ .tr_logcount = 1,
+ .tr_logflags = r->tr_logflags,
+ };
+
+ return ret;
+}
+
#endif /* __XFS_LOG_RECOVER_H__ */
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 5e174685a77c..6264daaab37b 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -266,7 +266,8 @@ xfs_validate_sb_write(
return -EFSCORRUPTED;
}
- if (xfs_sb_has_ro_compat_feature(sbp, XFS_SB_FEAT_RO_COMPAT_UNKNOWN)) {
+ if (!xfs_is_readonly(mp) &&
+ xfs_sb_has_ro_compat_feature(sbp, XFS_SB_FEAT_RO_COMPAT_UNKNOWN)) {
xfs_alert(mp,
"Corruption detected in superblock read-only compatible features (0x%x)!",
(sbp->sb_features_ro_compat &
diff --git a/fs/xfs/libxfs/xfs_trans_inode.c b/fs/xfs/libxfs/xfs_trans_inode.c
index cb4796b6e693..6b2296ff248a 100644
--- a/fs/xfs/libxfs/xfs_trans_inode.c
+++ b/fs/xfs/libxfs/xfs_trans_inode.c
@@ -67,7 +67,7 @@ xfs_trans_ichgtime(
if (flags & XFS_ICHGTIME_MOD)
inode->i_mtime = tv;
if (flags & XFS_ICHGTIME_CHG)
- inode->i_ctime = tv;
+ inode_set_ctime_to_ts(inode, tv);
if (flags & XFS_ICHGTIME_CREATE)
ip->i_crtime = tv;
}
diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c
index bbaa65422c4f..876a2f41b063 100644
--- a/fs/xfs/scrub/agheader_repair.c
+++ b/fs/xfs/scrub/agheader_repair.c
@@ -26,6 +26,7 @@
#include "scrub/trace.h"
#include "scrub/repair.h"
#include "scrub/bitmap.h"
+#include "scrub/reap.h"
/* Superblock */
@@ -48,6 +49,10 @@ xrep_superblock(
if (error)
return error;
+ /* Last chance to abort before we start committing fixes. */
+ if (xchk_should_terminate(sc, &error))
+ return error;
+
/* Copy AG 0's superblock to this one. */
xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
xfs_sb_to_disk(bp->b_addr, &mp->m_sb);
@@ -423,6 +428,10 @@ xrep_agf(
if (error)
return error;
+ /* Last chance to abort before we start committing fixes. */
+ if (xchk_should_terminate(sc, &error))
+ return error;
+
/* Start rewriting the header and implant the btrees we found. */
xrep_agf_init_header(sc, agf_bp, &old_agf);
xrep_agf_set_roots(sc, agf, fab);
@@ -444,13 +453,13 @@ out_revert:
struct xrep_agfl {
/* Bitmap of alleged AGFL blocks that we're not going to add. */
- struct xbitmap crossed;
+ struct xagb_bitmap crossed;
/* Bitmap of other OWN_AG metadata blocks. */
- struct xbitmap agmetablocks;
+ struct xagb_bitmap agmetablocks;
/* Bitmap of free space. */
- struct xbitmap *freesp;
+ struct xagb_bitmap *freesp;
/* rmapbt cursor for finding crosslinked blocks */
struct xfs_btree_cur *rmap_cur;
@@ -466,7 +475,6 @@ xrep_agfl_walk_rmap(
void *priv)
{
struct xrep_agfl *ra = priv;
- xfs_fsblock_t fsb;
int error = 0;
if (xchk_should_terminate(ra->sc, &error))
@@ -474,14 +482,13 @@ xrep_agfl_walk_rmap(
/* Record all the OWN_AG blocks. */
if (rec->rm_owner == XFS_RMAP_OWN_AG) {
- fsb = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_ag.pag->pag_agno,
- rec->rm_startblock);
- error = xbitmap_set(ra->freesp, fsb, rec->rm_blockcount);
+ error = xagb_bitmap_set(ra->freesp, rec->rm_startblock,
+ rec->rm_blockcount);
if (error)
return error;
}
- return xbitmap_set_btcur_path(&ra->agmetablocks, cur);
+ return xagb_bitmap_set_btcur_path(&ra->agmetablocks, cur);
}
/* Strike out the blocks that are cross-linked according to the rmapbt. */
@@ -492,12 +499,10 @@ xrep_agfl_check_extent(
void *priv)
{
struct xrep_agfl *ra = priv;
- xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(ra->sc->mp, start);
+ xfs_agblock_t agbno = start;
xfs_agblock_t last_agbno = agbno + len - 1;
int error;
- ASSERT(XFS_FSB_TO_AGNO(ra->sc->mp, start) == ra->sc->sa.pag->pag_agno);
-
while (agbno <= last_agbno) {
bool other_owners;
@@ -507,7 +512,7 @@ xrep_agfl_check_extent(
return error;
if (other_owners) {
- error = xbitmap_set(&ra->crossed, agbno, 1);
+ error = xagb_bitmap_set(&ra->crossed, agbno, 1);
if (error)
return error;
}
@@ -533,7 +538,7 @@ STATIC int
xrep_agfl_collect_blocks(
struct xfs_scrub *sc,
struct xfs_buf *agf_bp,
- struct xbitmap *agfl_extents,
+ struct xagb_bitmap *agfl_extents,
xfs_agblock_t *flcount)
{
struct xrep_agfl ra;
@@ -543,8 +548,8 @@ xrep_agfl_collect_blocks(
ra.sc = sc;
ra.freesp = agfl_extents;
- xbitmap_init(&ra.agmetablocks);
- xbitmap_init(&ra.crossed);
+ xagb_bitmap_init(&ra.agmetablocks);
+ xagb_bitmap_init(&ra.crossed);
/* Find all space used by the free space btrees & rmapbt. */
cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag);
@@ -556,7 +561,7 @@ xrep_agfl_collect_blocks(
/* Find all blocks currently being used by the bnobt. */
cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp,
sc->sa.pag, XFS_BTNUM_BNO);
- error = xbitmap_set_btblocks(&ra.agmetablocks, cur);
+ error = xagb_bitmap_set_btblocks(&ra.agmetablocks, cur);
xfs_btree_del_cursor(cur, error);
if (error)
goto out_bmp;
@@ -564,7 +569,7 @@ xrep_agfl_collect_blocks(
/* Find all blocks currently being used by the cntbt. */
cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp,
sc->sa.pag, XFS_BTNUM_CNT);
- error = xbitmap_set_btblocks(&ra.agmetablocks, cur);
+ error = xagb_bitmap_set_btblocks(&ra.agmetablocks, cur);
xfs_btree_del_cursor(cur, error);
if (error)
goto out_bmp;
@@ -573,17 +578,17 @@ xrep_agfl_collect_blocks(
* Drop the freesp meta blocks that are in use by btrees.
* The remaining blocks /should/ be AGFL blocks.
*/
- error = xbitmap_disunion(agfl_extents, &ra.agmetablocks);
+ error = xagb_bitmap_disunion(agfl_extents, &ra.agmetablocks);
if (error)
goto out_bmp;
/* Strike out the blocks that are cross-linked. */
ra.rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag);
- error = xbitmap_walk(agfl_extents, xrep_agfl_check_extent, &ra);
+ error = xagb_bitmap_walk(agfl_extents, xrep_agfl_check_extent, &ra);
xfs_btree_del_cursor(ra.rmap_cur, error);
if (error)
goto out_bmp;
- error = xbitmap_disunion(agfl_extents, &ra.crossed);
+ error = xagb_bitmap_disunion(agfl_extents, &ra.crossed);
if (error)
goto out_bmp;
@@ -591,12 +596,12 @@ xrep_agfl_collect_blocks(
* Calculate the new AGFL size. If we found more blocks than fit in
* the AGFL we'll free them later.
*/
- *flcount = min_t(uint64_t, xbitmap_hweight(agfl_extents),
+ *flcount = min_t(uint64_t, xagb_bitmap_hweight(agfl_extents),
xfs_agfl_size(mp));
out_bmp:
- xbitmap_destroy(&ra.crossed);
- xbitmap_destroy(&ra.agmetablocks);
+ xagb_bitmap_destroy(&ra.crossed);
+ xagb_bitmap_destroy(&ra.agmetablocks);
return error;
}
@@ -615,18 +620,24 @@ xrep_agfl_update_agf(
xfs_force_summary_recalc(sc->mp);
/* Update the AGF counters. */
- if (xfs_perag_initialised_agf(sc->sa.pag))
+ if (xfs_perag_initialised_agf(sc->sa.pag)) {
sc->sa.pag->pagf_flcount = flcount;
+ clear_bit(XFS_AGSTATE_AGFL_NEEDS_RESET,
+ &sc->sa.pag->pag_opstate);
+ }
agf->agf_flfirst = cpu_to_be32(0);
agf->agf_flcount = cpu_to_be32(flcount);
- agf->agf_fllast = cpu_to_be32(flcount - 1);
+ if (flcount)
+ agf->agf_fllast = cpu_to_be32(flcount - 1);
+ else
+ agf->agf_fllast = cpu_to_be32(xfs_agfl_size(sc->mp) - 1);
xfs_alloc_log_agf(sc->tp, agf_bp,
XFS_AGF_FLFIRST | XFS_AGF_FLLAST | XFS_AGF_FLCOUNT);
}
struct xrep_agfl_fill {
- struct xbitmap used_extents;
+ struct xagb_bitmap used_extents;
struct xfs_scrub *sc;
__be32 *agfl_bno;
xfs_agblock_t flcount;
@@ -642,17 +653,15 @@ xrep_agfl_fill(
{
struct xrep_agfl_fill *af = priv;
struct xfs_scrub *sc = af->sc;
- xfs_fsblock_t fsbno = start;
+ xfs_agblock_t agbno = start;
int error;
- while (fsbno < start + len && af->fl_off < af->flcount)
- af->agfl_bno[af->fl_off++] =
- cpu_to_be32(XFS_FSB_TO_AGBNO(sc->mp, fsbno++));
+ trace_xrep_agfl_insert(sc->sa.pag, agbno, len);
- trace_xrep_agfl_insert(sc->mp, sc->sa.pag->pag_agno,
- XFS_FSB_TO_AGBNO(sc->mp, start), len);
+ while (agbno < start + len && af->fl_off < af->flcount)
+ af->agfl_bno[af->fl_off++] = cpu_to_be32(agbno++);
- error = xbitmap_set(&af->used_extents, start, fsbno - 1);
+ error = xagb_bitmap_set(&af->used_extents, start, agbno - 1);
if (error)
return error;
@@ -667,7 +676,7 @@ STATIC int
xrep_agfl_init_header(
struct xfs_scrub *sc,
struct xfs_buf *agfl_bp,
- struct xbitmap *agfl_extents,
+ struct xagb_bitmap *agfl_extents,
xfs_agblock_t flcount)
{
struct xrep_agfl_fill af = {
@@ -695,17 +704,17 @@ xrep_agfl_init_header(
* blocks than fit in the AGFL, they will be freed in a subsequent
* step.
*/
- xbitmap_init(&af.used_extents);
+ xagb_bitmap_init(&af.used_extents);
af.agfl_bno = xfs_buf_to_agfl_bno(agfl_bp),
- xbitmap_walk(agfl_extents, xrep_agfl_fill, &af);
- error = xbitmap_disunion(agfl_extents, &af.used_extents);
+ xagb_bitmap_walk(agfl_extents, xrep_agfl_fill, &af);
+ error = xagb_bitmap_disunion(agfl_extents, &af.used_extents);
if (error)
return error;
/* Write new AGFL to disk. */
xfs_trans_buf_set_type(sc->tp, agfl_bp, XFS_BLFT_AGFL_BUF);
xfs_trans_log_buf(sc->tp, agfl_bp, 0, BBTOB(agfl_bp->b_length) - 1);
- xbitmap_destroy(&af.used_extents);
+ xagb_bitmap_destroy(&af.used_extents);
return 0;
}
@@ -714,7 +723,7 @@ int
xrep_agfl(
struct xfs_scrub *sc)
{
- struct xbitmap agfl_extents;
+ struct xagb_bitmap agfl_extents;
struct xfs_mount *mp = sc->mp;
struct xfs_buf *agf_bp;
struct xfs_buf *agfl_bp;
@@ -725,7 +734,7 @@ xrep_agfl(
if (!xfs_has_rmapbt(mp))
return -EOPNOTSUPP;
- xbitmap_init(&agfl_extents);
+ xagb_bitmap_init(&agfl_extents);
/*
* Read the AGF so that we can query the rmapbt. We hope that there's
@@ -753,6 +762,10 @@ xrep_agfl(
if (error)
goto err;
+ /* Last chance to abort before we start committing fixes. */
+ if (xchk_should_terminate(sc, &error))
+ goto err;
+
/*
* Update AGF and AGFL. We reset the global free block counter when
* we adjust the AGF flcount (which can fail) so avoid updating any
@@ -774,10 +787,10 @@ xrep_agfl(
goto err;
/* Dump any AGFL overflow. */
- error = xrep_reap_extents(sc, &agfl_extents, &XFS_RMAP_OINFO_AG,
+ error = xrep_reap_agblocks(sc, &agfl_extents, &XFS_RMAP_OINFO_AG,
XFS_AG_RESV_AGFL);
err:
- xbitmap_destroy(&agfl_extents);
+ xagb_bitmap_destroy(&agfl_extents);
return error;
}
@@ -1000,6 +1013,10 @@ xrep_agi(
if (error)
return error;
+ /* Last chance to abort before we start committing fixes. */
+ if (xchk_should_terminate(sc, &error))
+ return error;
+
/* Start rewriting the header and implant the btrees we found. */
xrep_agi_init_header(sc, agi_bp, &old_agi);
xrep_agi_set_roots(sc, agi, fab);
diff --git a/fs/xfs/scrub/bitmap.c b/fs/xfs/scrub/bitmap.c
index 0c959be396ea..e0c89a9a0ca0 100644
--- a/fs/xfs/scrub/bitmap.c
+++ b/fs/xfs/scrub/bitmap.c
@@ -301,21 +301,15 @@ xagb_bitmap_set_btblocks(
* blocks going from the leaf towards the root.
*/
int
-xbitmap_set_btcur_path(
- struct xbitmap *bitmap,
+xagb_bitmap_set_btcur_path(
+ struct xagb_bitmap *bitmap,
struct xfs_btree_cur *cur)
{
- struct xfs_buf *bp;
- xfs_fsblock_t fsb;
int i;
int error;
for (i = 0; i < cur->bc_nlevels && cur->bc_levels[i].ptr == 1; i++) {
- xfs_btree_get_block(cur, i, &bp);
- if (!bp)
- continue;
- fsb = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp));
- error = xbitmap_set(bitmap, fsb, 1);
+ error = xagb_bitmap_visit_btblock(cur, i, bitmap);
if (error)
return error;
}
@@ -323,35 +317,6 @@ xbitmap_set_btcur_path(
return 0;
}
-/* Collect a btree's block in the bitmap. */
-STATIC int
-xbitmap_collect_btblock(
- struct xfs_btree_cur *cur,
- int level,
- void *priv)
-{
- struct xbitmap *bitmap = priv;
- struct xfs_buf *bp;
- xfs_fsblock_t fsbno;
-
- xfs_btree_get_block(cur, level, &bp);
- if (!bp)
- return 0;
-
- fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp));
- return xbitmap_set(bitmap, fsbno, 1);
-}
-
-/* Walk the btree and mark the bitmap wherever a btree block is found. */
-int
-xbitmap_set_btblocks(
- struct xbitmap *bitmap,
- struct xfs_btree_cur *cur)
-{
- return xfs_btree_visit_blocks(cur, xbitmap_collect_btblock,
- XFS_BTREE_VISIT_ALL, bitmap);
-}
-
/* How many bits are set in this bitmap? */
uint64_t
xbitmap_hweight(
@@ -385,43 +350,6 @@ xbitmap_walk(
return error;
}
-struct xbitmap_walk_bits {
- xbitmap_walk_bits_fn fn;
- void *priv;
-};
-
-/* Walk all the bits in a run. */
-static int
-xbitmap_walk_bits_in_run(
- uint64_t start,
- uint64_t len,
- void *priv)
-{
- struct xbitmap_walk_bits *wb = priv;
- uint64_t i;
- int error = 0;
-
- for (i = start; i < start + len; i++) {
- error = wb->fn(i, wb->priv);
- if (error)
- break;
- }
-
- return error;
-}
-
-/* Call a function for every set bit in this bitmap. */
-int
-xbitmap_walk_bits(
- struct xbitmap *bitmap,
- xbitmap_walk_bits_fn fn,
- void *priv)
-{
- struct xbitmap_walk_bits wb = {.fn = fn, .priv = priv};
-
- return xbitmap_walk(bitmap, xbitmap_walk_bits_in_run, &wb);
-}
-
/* Does this bitmap have no bits set at all? */
bool
xbitmap_empty(
diff --git a/fs/xfs/scrub/bitmap.h b/fs/xfs/scrub/bitmap.h
index 84981724ecaf..4fe58bad6734 100644
--- a/fs/xfs/scrub/bitmap.h
+++ b/fs/xfs/scrub/bitmap.h
@@ -16,10 +16,6 @@ void xbitmap_destroy(struct xbitmap *bitmap);
int xbitmap_clear(struct xbitmap *bitmap, uint64_t start, uint64_t len);
int xbitmap_set(struct xbitmap *bitmap, uint64_t start, uint64_t len);
int xbitmap_disunion(struct xbitmap *bitmap, struct xbitmap *sub);
-int xbitmap_set_btcur_path(struct xbitmap *bitmap,
- struct xfs_btree_cur *cur);
-int xbitmap_set_btblocks(struct xbitmap *bitmap,
- struct xfs_btree_cur *cur);
uint64_t xbitmap_hweight(struct xbitmap *bitmap);
/*
@@ -33,10 +29,6 @@ typedef int (*xbitmap_walk_fn)(uint64_t start, uint64_t len, void *priv);
int xbitmap_walk(struct xbitmap *bitmap, xbitmap_walk_fn fn,
void *priv);
-typedef int (*xbitmap_walk_bits_fn)(uint64_t bit, void *priv);
-int xbitmap_walk_bits(struct xbitmap *bitmap, xbitmap_walk_bits_fn fn,
- void *priv);
-
bool xbitmap_empty(struct xbitmap *bitmap);
bool xbitmap_test(struct xbitmap *bitmap, uint64_t start, uint64_t *len);
@@ -110,5 +102,7 @@ static inline int xagb_bitmap_walk(struct xagb_bitmap *bitmap,
int xagb_bitmap_set_btblocks(struct xagb_bitmap *bitmap,
struct xfs_btree_cur *cur);
+int xagb_bitmap_set_btcur_path(struct xagb_bitmap *bitmap,
+ struct xfs_btree_cur *cur);
#endif /* __XFS_SCRUB_BITMAP_H__ */
diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c
index 5bf4326e9783..75588915572e 100644
--- a/fs/xfs/scrub/bmap.c
+++ b/fs/xfs/scrub/bmap.c
@@ -38,8 +38,7 @@ xchk_setup_inode_bmap(
if (error)
goto out;
- sc->ilock_flags = XFS_IOLOCK_EXCL;
- xfs_ilock(sc->ip, XFS_IOLOCK_EXCL);
+ xchk_ilock(sc, XFS_IOLOCK_EXCL);
/*
* We don't want any ephemeral data/cow fork updates sitting around
@@ -50,8 +49,7 @@ xchk_setup_inode_bmap(
sc->sm->sm_type != XFS_SCRUB_TYPE_BMBTA) {
struct address_space *mapping = VFS_I(sc->ip)->i_mapping;
- sc->ilock_flags |= XFS_MMAPLOCK_EXCL;
- xfs_ilock(sc->ip, XFS_MMAPLOCK_EXCL);
+ xchk_ilock(sc, XFS_MMAPLOCK_EXCL);
inode_dio_wait(VFS_I(sc->ip));
@@ -79,9 +77,8 @@ xchk_setup_inode_bmap(
error = xchk_trans_alloc(sc, 0);
if (error)
goto out;
- sc->ilock_flags |= XFS_ILOCK_EXCL;
- xfs_ilock(sc->ip, XFS_ILOCK_EXCL);
+ xchk_ilock(sc, XFS_ILOCK_EXCL);
out:
/* scrub teardown will unlock and release the inode */
return error;
@@ -844,7 +841,7 @@ xchk_bmap(
/* Non-existent forks can be ignored. */
if (!ifp)
- goto out;
+ return -ENOENT;
info.is_rt = whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip);
info.whichfork = whichfork;
@@ -853,10 +850,10 @@ xchk_bmap(
switch (whichfork) {
case XFS_COW_FORK:
- /* No CoW forks on non-reflink inodes/filesystems. */
- if (!xfs_is_reflink_inode(ip)) {
+ /* No CoW forks on non-reflink filesystems. */
+ if (!xfs_has_reflink(mp)) {
xchk_ino_set_corrupt(sc, sc->ip->i_ino);
- goto out;
+ return 0;
}
break;
case XFS_ATTR_FORK:
@@ -876,31 +873,31 @@ xchk_bmap(
/* No mappings to check. */
if (whichfork == XFS_COW_FORK)
xchk_fblock_set_corrupt(sc, whichfork, 0);
- goto out;
+ return 0;
case XFS_DINODE_FMT_EXTENTS:
break;
case XFS_DINODE_FMT_BTREE:
if (whichfork == XFS_COW_FORK) {
xchk_fblock_set_corrupt(sc, whichfork, 0);
- goto out;
+ return 0;
}
error = xchk_bmap_btree(sc, whichfork, &info);
if (error)
- goto out;
+ return error;
break;
default:
xchk_fblock_set_corrupt(sc, whichfork, 0);
- goto out;
+ return 0;
}
if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
- goto out;
+ return 0;
/* Find the offset of the last extent in the mapping. */
error = xfs_bmap_last_offset(ip, &endoff, whichfork);
if (!xchk_fblock_process_error(sc, whichfork, 0, &error))
- goto out;
+ return error;
/*
* Scrub extent records. We use a special iterator function here that
@@ -913,12 +910,12 @@ xchk_bmap(
while (xchk_bmap_iext_iter(&info, &irec)) {
if (xchk_should_terminate(sc, &error) ||
(sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
- goto out;
+ return 0;
if (irec.br_startoff >= endoff) {
xchk_fblock_set_corrupt(sc, whichfork,
irec.br_startoff);
- goto out;
+ return 0;
}
if (isnullstartblock(irec.br_startblock))
@@ -931,10 +928,10 @@ xchk_bmap(
if (xchk_bmap_want_check_rmaps(&info)) {
error = xchk_bmap_check_rmaps(sc, whichfork);
if (!xchk_fblock_xref_process_error(sc, whichfork, 0, &error))
- goto out;
+ return error;
}
-out:
- return error;
+
+ return 0;
}
/* Scrub an inode's data fork. */
@@ -958,8 +955,5 @@ int
xchk_bmap_cow(
struct xfs_scrub *sc)
{
- if (!xfs_is_reflink_inode(sc->ip))
- return -ENOENT;
-
return xchk_bmap(sc, XFS_COW_FORK);
}
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
index 7a20256be969..de24532fe083 100644
--- a/fs/xfs/scrub/common.c
+++ b/fs/xfs/scrub/common.c
@@ -832,6 +832,25 @@ xchk_install_handle_inode(
}
/*
+ * Install an already-referenced inode for scrubbing. Get our own reference to
+ * the inode to make disposal simpler. The inode must not be in I_FREEING or
+ * I_WILL_FREE state!
+ */
+int
+xchk_install_live_inode(
+ struct xfs_scrub *sc,
+ struct xfs_inode *ip)
+{
+ if (!igrab(VFS_I(ip))) {
+ xchk_ino_set_corrupt(sc, ip->i_ino);
+ return -EFSCORRUPTED;
+ }
+
+ sc->ip = ip;
+ return 0;
+}
+
+/*
* In preparation to scrub metadata structures that hang off of an inode,
* grab either the inode referenced in the scrub control structure or the
* inode passed in. If the inumber does not reference an allocated inode
@@ -854,10 +873,8 @@ xchk_iget_for_scrubbing(
ASSERT(sc->tp == NULL);
/* We want to scan the inode we already had opened. */
- if (sc->sm->sm_ino == 0 || sc->sm->sm_ino == ip_in->i_ino) {
- sc->ip = ip_in;
- return 0;
- }
+ if (sc->sm->sm_ino == 0 || sc->sm->sm_ino == ip_in->i_ino)
+ return xchk_install_live_inode(sc, ip_in);
/* Reject internal metadata files and obviously bad inode numbers. */
if (xfs_internal_inum(mp, sc->sm->sm_ino))
@@ -1005,20 +1022,48 @@ xchk_setup_inode_contents(
return error;
/* Lock the inode so the VFS cannot touch this file. */
- sc->ilock_flags = XFS_IOLOCK_EXCL;
- xfs_ilock(sc->ip, sc->ilock_flags);
+ xchk_ilock(sc, XFS_IOLOCK_EXCL);
error = xchk_trans_alloc(sc, resblks);
if (error)
goto out;
- sc->ilock_flags |= XFS_ILOCK_EXCL;
- xfs_ilock(sc->ip, XFS_ILOCK_EXCL);
-
+ xchk_ilock(sc, XFS_ILOCK_EXCL);
out:
/* scrub teardown will unlock and release the inode for us */
return error;
}
+void
+xchk_ilock(
+ struct xfs_scrub *sc,
+ unsigned int ilock_flags)
+{
+ xfs_ilock(sc->ip, ilock_flags);
+ sc->ilock_flags |= ilock_flags;
+}
+
+bool
+xchk_ilock_nowait(
+ struct xfs_scrub *sc,
+ unsigned int ilock_flags)
+{
+ if (xfs_ilock_nowait(sc->ip, ilock_flags)) {
+ sc->ilock_flags |= ilock_flags;
+ return true;
+ }
+
+ return false;
+}
+
+void
+xchk_iunlock(
+ struct xfs_scrub *sc,
+ unsigned int ilock_flags)
+{
+ sc->ilock_flags &= ~ilock_flags;
+ xfs_iunlock(sc->ip, ilock_flags);
+}
+
/*
* Predicate that decides if we need to evaluate the cross-reference check.
* If there was an error accessing the cross-reference btree, just delete
@@ -1185,3 +1230,155 @@ xchk_fsgates_enable(
sc->flags |= scrub_fsgates;
}
+
+/*
+ * Decide if this is this a cached inode that's also allocated. The caller
+ * must hold a reference to an AG and the AGI buffer lock to prevent inodes
+ * from being allocated or freed.
+ *
+ * Look up an inode by number in the given file system. If the inode number
+ * is invalid, return -EINVAL. If the inode is not in cache, return -ENODATA.
+ * If the inode is being reclaimed, return -ENODATA because we know the inode
+ * cache cannot be updating the ondisk metadata.
+ *
+ * Otherwise, the incore inode is the one we want, and it is either live,
+ * somewhere in the inactivation machinery, or reclaimable. The inode is
+ * allocated if i_mode is nonzero. In all three cases, the cached inode will
+ * be more up to date than the ondisk inode buffer, so we must use the incore
+ * i_mode.
+ */
+int
+xchk_inode_is_allocated(
+ struct xfs_scrub *sc,
+ xfs_agino_t agino,
+ bool *inuse)
+{
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_perag *pag = sc->sa.pag;
+ xfs_ino_t ino;
+ struct xfs_inode *ip;
+ int error;
+
+ /* caller must hold perag reference */
+ if (pag == NULL) {
+ ASSERT(pag != NULL);
+ return -EINVAL;
+ }
+
+ /* caller must have AGI buffer */
+ if (sc->sa.agi_bp == NULL) {
+ ASSERT(sc->sa.agi_bp != NULL);
+ return -EINVAL;
+ }
+
+ /* reject inode numbers outside existing AGs */
+ ino = XFS_AGINO_TO_INO(sc->mp, pag->pag_agno, agino);
+ if (!xfs_verify_ino(mp, ino))
+ return -EINVAL;
+
+ error = -ENODATA;
+ rcu_read_lock();
+ ip = radix_tree_lookup(&pag->pag_ici_root, agino);
+ if (!ip) {
+ /* cache miss */
+ goto out_rcu;
+ }
+
+ /*
+ * If the inode number doesn't match, the incore inode got reused
+ * during an RCU grace period and the radix tree hasn't been updated.
+ * This isn't the inode we want.
+ */
+ spin_lock(&ip->i_flags_lock);
+ if (ip->i_ino != ino)
+ goto out_skip;
+
+ trace_xchk_inode_is_allocated(ip);
+
+ /*
+ * We have an incore inode that matches the inode we want, and the
+ * caller holds the perag structure and the AGI buffer. Let's check
+ * our assumptions below:
+ */
+
+#ifdef DEBUG
+ /*
+ * (1) If the incore inode is live (i.e. referenced from the dcache),
+ * it will not be INEW, nor will it be in the inactivation or reclaim
+ * machinery. The ondisk inode had better be allocated. This is the
+ * most trivial case.
+ */
+ if (!(ip->i_flags & (XFS_NEED_INACTIVE | XFS_INEW | XFS_IRECLAIMABLE |
+ XFS_INACTIVATING))) {
+ /* live inode */
+ ASSERT(VFS_I(ip)->i_mode != 0);
+ }
+
+ /*
+ * If the incore inode is INEW, there are several possibilities:
+ *
+ * (2) For a file that is being created, note that we allocate the
+ * ondisk inode before allocating, initializing, and adding the incore
+ * inode to the radix tree.
+ *
+ * (3) If the incore inode is being recycled, the inode has to be
+ * allocated because we don't allow freed inodes to be recycled.
+ * Recycling doesn't touch i_mode.
+ */
+ if (ip->i_flags & XFS_INEW) {
+ /* created on disk already or recycling */
+ ASSERT(VFS_I(ip)->i_mode != 0);
+ }
+
+ /*
+ * (4) If the inode is queued for inactivation (NEED_INACTIVE) but
+ * inactivation has not started (!INACTIVATING), it is still allocated.
+ */
+ if ((ip->i_flags & XFS_NEED_INACTIVE) &&
+ !(ip->i_flags & XFS_INACTIVATING)) {
+ /* definitely before difree */
+ ASSERT(VFS_I(ip)->i_mode != 0);
+ }
+#endif
+
+ /*
+ * If the incore inode is undergoing inactivation (INACTIVATING), there
+ * are two possibilities:
+ *
+ * (5) It is before the point where it would get freed ondisk, in which
+ * case i_mode is still nonzero.
+ *
+ * (6) It has already been freed, in which case i_mode is zero.
+ *
+ * We don't take the ILOCK here, but difree and dialloc update the AGI,
+ * and we've taken the AGI buffer lock, which prevents that from
+ * happening.
+ */
+
+ /*
+ * (7) Inodes undergoing inactivation (INACTIVATING) or queued for
+ * reclaim (IRECLAIMABLE) could be allocated or free. i_mode still
+ * reflects the ondisk state.
+ */
+
+ /*
+ * (8) If the inode is in IFLUSHING, it's safe to query i_mode because
+ * the flush code uses i_mode to format the ondisk inode.
+ */
+
+ /*
+ * (9) If the inode is in IRECLAIM and was reachable via the radix
+ * tree, it still has the same i_mode as it did before it entered
+ * reclaim. The inode object is still alive because we hold the RCU
+ * read lock.
+ */
+
+ *inuse = VFS_I(ip)->i_mode != 0;
+ error = 0;
+
+out_skip:
+ spin_unlock(&ip->i_flags_lock);
+out_rcu:
+ rcu_read_unlock();
+ return error;
+}
diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h
index 791235cd9b00..cabdc0e16838 100644
--- a/fs/xfs/scrub/common.h
+++ b/fs/xfs/scrub/common.h
@@ -88,10 +88,16 @@ int xchk_setup_xattr(struct xfs_scrub *sc);
int xchk_setup_symlink(struct xfs_scrub *sc);
int xchk_setup_parent(struct xfs_scrub *sc);
#ifdef CONFIG_XFS_RT
-int xchk_setup_rt(struct xfs_scrub *sc);
+int xchk_setup_rtbitmap(struct xfs_scrub *sc);
+int xchk_setup_rtsummary(struct xfs_scrub *sc);
#else
static inline int
-xchk_setup_rt(struct xfs_scrub *sc)
+xchk_setup_rtbitmap(struct xfs_scrub *sc)
+{
+ return -ENOENT;
+}
+static inline int
+xchk_setup_rtsummary(struct xfs_scrub *sc)
{
return -ENOENT;
}
@@ -137,6 +143,12 @@ int xchk_count_rmap_ownedby_ag(struct xfs_scrub *sc, struct xfs_btree_cur *cur,
int xchk_setup_ag_btree(struct xfs_scrub *sc, bool force_log);
int xchk_iget_for_scrubbing(struct xfs_scrub *sc);
int xchk_setup_inode_contents(struct xfs_scrub *sc, unsigned int resblks);
+int xchk_install_live_inode(struct xfs_scrub *sc, struct xfs_inode *ip);
+
+void xchk_ilock(struct xfs_scrub *sc, unsigned int ilock_flags);
+bool xchk_ilock_nowait(struct xfs_scrub *sc, unsigned int ilock_flags);
+void xchk_iunlock(struct xfs_scrub *sc, unsigned int ilock_flags);
+
void xchk_buffer_recheck(struct xfs_scrub *sc, struct xfs_buf *bp);
int xchk_iget(struct xfs_scrub *sc, xfs_ino_t inum, struct xfs_inode **ipp);
@@ -155,9 +167,29 @@ static inline bool xchk_skip_xref(struct xfs_scrub_metadata *sm)
XFS_SCRUB_OFLAG_XCORRUPT);
}
+#ifdef CONFIG_XFS_ONLINE_REPAIR
+/* Decide if a repair is required. */
+static inline bool xchk_needs_repair(const struct xfs_scrub_metadata *sm)
+{
+ return sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT |
+ XFS_SCRUB_OFLAG_XCORRUPT |
+ XFS_SCRUB_OFLAG_PREEN);
+}
+#else
+# define xchk_needs_repair(sc) (false)
+#endif /* CONFIG_XFS_ONLINE_REPAIR */
+
int xchk_metadata_inode_forks(struct xfs_scrub *sc);
/*
+ * Helper macros to allocate and format xfile description strings.
+ * Callers must kfree the pointer returned.
+ */
+#define xchk_xfile_descr(sc, fmt, ...) \
+ kasprintf(XCHK_GFP_FLAGS, "XFS (%s): " fmt, \
+ (sc)->mp->m_super->s_id, ##__VA_ARGS__)
+
+/*
* Setting up a hook to wait for intents to drain is costly -- we have to take
* the CPU hotplug lock and force an i-cache flush on all CPUs once to set it
* up, and again to tear it down. These costs add up quickly, so we only want
@@ -171,4 +203,7 @@ static inline bool xchk_need_intent_drain(struct xfs_scrub *sc)
void xchk_fsgates_enable(struct xfs_scrub *sc, unsigned int scrub_fshooks);
+int xchk_inode_is_allocated(struct xfs_scrub *sc, xfs_agino_t agino,
+ bool *inuse);
+
#endif /* __XFS_SCRUB_COMMON_H__ */
diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c
index e382a35e98d8..05be757668bb 100644
--- a/fs/xfs/scrub/fscounters.c
+++ b/fs/xfs/scrub/fscounters.c
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0+
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Copyright (C) 2019-2023 Oracle. All Rights Reserved.
* Author: Darrick J. Wong <djwong@kernel.org>
@@ -8,6 +8,8 @@
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
#include "xfs_mount.h"
#include "xfs_alloc.h"
#include "xfs_ialloc.h"
@@ -16,6 +18,7 @@
#include "xfs_ag.h"
#include "xfs_rtalloc.h"
#include "xfs_inode.h"
+#include "xfs_icache.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
@@ -53,6 +56,7 @@ struct xchk_fscounters {
uint64_t frextents;
unsigned long long icount_min;
unsigned long long icount_max;
+ bool frozen;
};
/*
@@ -123,6 +127,82 @@ xchk_fscount_warmup(
return error;
}
+static inline int
+xchk_fsfreeze(
+ struct xfs_scrub *sc)
+{
+ int error;
+
+ error = freeze_super(sc->mp->m_super, FREEZE_HOLDER_KERNEL);
+ trace_xchk_fsfreeze(sc, error);
+ return error;
+}
+
+static inline int
+xchk_fsthaw(
+ struct xfs_scrub *sc)
+{
+ int error;
+
+ /* This should always succeed, we have a kernel freeze */
+ error = thaw_super(sc->mp->m_super, FREEZE_HOLDER_KERNEL);
+ trace_xchk_fsthaw(sc, error);
+ return error;
+}
+
+/*
+ * We couldn't stabilize the filesystem long enough to sample all the variables
+ * that comprise the summary counters and compare them to the percpu counters.
+ * We need to disable all writer threads, which means taking the first two
+ * freeze levels to put userspace to sleep, and the third freeze level to
+ * prevent background threads from starting new transactions. Take one level
+ * more to prevent other callers from unfreezing the filesystem while we run.
+ */
+STATIC int
+xchk_fscounters_freeze(
+ struct xfs_scrub *sc)
+{
+ struct xchk_fscounters *fsc = sc->buf;
+ int error = 0;
+
+ if (sc->flags & XCHK_HAVE_FREEZE_PROT) {
+ sc->flags &= ~XCHK_HAVE_FREEZE_PROT;
+ mnt_drop_write_file(sc->file);
+ }
+
+ /* Try to grab a kernel freeze. */
+ while ((error = xchk_fsfreeze(sc)) == -EBUSY) {
+ if (xchk_should_terminate(sc, &error))
+ return error;
+
+ delay(HZ / 10);
+ }
+ if (error)
+ return error;
+
+ fsc->frozen = true;
+ return 0;
+}
+
+/* Thaw the filesystem after checking or repairing fscounters. */
+STATIC void
+xchk_fscounters_cleanup(
+ void *buf)
+{
+ struct xchk_fscounters *fsc = buf;
+ struct xfs_scrub *sc = fsc->sc;
+ int error;
+
+ if (!fsc->frozen)
+ return;
+
+ error = xchk_fsthaw(sc);
+ if (error)
+ xfs_emerg(sc->mp, "still frozen after scrub, err=%d", error);
+ else
+ fsc->frozen = false;
+}
+
int
xchk_setup_fscounters(
struct xfs_scrub *sc)
@@ -140,6 +220,7 @@ xchk_setup_fscounters(
sc->buf = kzalloc(sizeof(struct xchk_fscounters), XCHK_GFP_FLAGS);
if (!sc->buf)
return -ENOMEM;
+ sc->buf_cleanup = xchk_fscounters_cleanup;
fsc = sc->buf;
fsc->sc = sc;
@@ -150,7 +231,18 @@ xchk_setup_fscounters(
if (error)
return error;
- return xchk_trans_alloc(sc, 0);
+ /*
+ * Pause all writer activity in the filesystem while we're scrubbing to
+ * reduce the likelihood of background perturbations to the counters
+ * throwing off our calculations.
+ */
+ if (sc->flags & XCHK_TRY_HARDER) {
+ error = xchk_fscounters_freeze(sc);
+ if (error)
+ return error;
+ }
+
+ return xfs_trans_alloc_empty(sc->mp, &sc->tp);
}
/*
@@ -290,8 +382,7 @@ retry:
if (fsc->ifree > fsc->icount) {
if (tries--)
goto retry;
- xchk_set_incomplete(sc);
- return 0;
+ return -EDEADLOCK;
}
return 0;
@@ -367,6 +458,8 @@ xchk_fscount_count_frextents(
* Otherwise, we /might/ have a problem. If the change in the summations is
* more than we want to tolerate, the filesystem is probably busy and we should
* just send back INCOMPLETE and see if userspace will try again.
+ *
+ * If we're repairing then we require an exact match.
*/
static inline bool
xchk_fscount_within_range(
@@ -396,21 +489,7 @@ xchk_fscount_within_range(
if (expected >= min_value && expected <= max_value)
return true;
- /*
- * If the difference between the two summations is too large, the fs
- * might just be busy and so we'll mark the scrub incomplete. Return
- * true here so that we don't mark the counter corrupt.
- *
- * XXX: In the future when userspace can grant scrub permission to
- * quiesce the filesystem to solve the outsized variance problem, this
- * check should be moved up and the return code changed to signal to
- * userspace that we need quiesce permission.
- */
- if (max_value - min_value >= XCHK_FSCOUNT_MIN_VARIANCE) {
- xchk_set_incomplete(sc);
- return true;
- }
-
+ /* Everything else is bad. */
return false;
}
@@ -422,6 +501,7 @@ xchk_fscounters(
struct xfs_mount *mp = sc->mp;
struct xchk_fscounters *fsc = sc->buf;
int64_t icount, ifree, fdblocks, frextents;
+ bool try_again = false;
int error;
/* Snapshot the percpu counters. */
@@ -431,9 +511,26 @@ xchk_fscounters(
frextents = percpu_counter_sum(&mp->m_frextents);
/* No negative values, please! */
- if (icount < 0 || ifree < 0 || fdblocks < 0 || frextents < 0)
+ if (icount < 0 || ifree < 0)
xchk_set_corrupt(sc);
+ /*
+ * If the filesystem is not frozen, the counter summation calls above
+ * can race with xfs_mod_freecounter, which subtracts a requested space
+ * reservation from the counter and undoes the subtraction if that made
+ * the counter go negative. Therefore, it's possible to see negative
+ * values here, and we should only flag that as a corruption if we
+ * froze the fs. This is much more likely to happen with frextents
+ * since there are no reserved pools.
+ */
+ if (fdblocks < 0 || frextents < 0) {
+ if (!fsc->frozen)
+ return -EDEADLOCK;
+
+ xchk_set_corrupt(sc);
+ return 0;
+ }
+
/* See if icount is obviously wrong. */
if (icount < fsc->icount_min || icount > fsc->icount_max)
xchk_set_corrupt(sc);
@@ -447,12 +544,6 @@ xchk_fscounters(
xchk_set_corrupt(sc);
/*
- * XXX: We can't quiesce percpu counter updates, so exit early.
- * This can be re-enabled when we gain exclusive freeze functionality.
- */
- return 0;
-
- /*
* If ifree exceeds icount by more than the minimum variance then
* something's probably wrong with the counters.
*/
@@ -463,8 +554,6 @@ xchk_fscounters(
error = xchk_fscount_aggregate_agcounts(sc, fsc);
if (!xchk_process_error(sc, 0, XFS_SB_BLOCK(mp), &error))
return error;
- if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE)
- return 0;
/* Count the free extents counter for rt volumes. */
error = xchk_fscount_count_frextents(sc, fsc);
@@ -473,20 +562,45 @@ xchk_fscounters(
if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE)
return 0;
- /* Compare the in-core counters with whatever we counted. */
- if (!xchk_fscount_within_range(sc, icount, &mp->m_icount, fsc->icount))
- xchk_set_corrupt(sc);
+ /*
+ * Compare the in-core counters with whatever we counted. If the fs is
+ * frozen, we treat the discrepancy as a corruption because the freeze
+ * should have stabilized the counter values. Otherwise, we need
+ * userspace to call us back having granted us freeze permission.
+ */
+ if (!xchk_fscount_within_range(sc, icount, &mp->m_icount,
+ fsc->icount)) {
+ if (fsc->frozen)
+ xchk_set_corrupt(sc);
+ else
+ try_again = true;
+ }
- if (!xchk_fscount_within_range(sc, ifree, &mp->m_ifree, fsc->ifree))
- xchk_set_corrupt(sc);
+ if (!xchk_fscount_within_range(sc, ifree, &mp->m_ifree, fsc->ifree)) {
+ if (fsc->frozen)
+ xchk_set_corrupt(sc);
+ else
+ try_again = true;
+ }
if (!xchk_fscount_within_range(sc, fdblocks, &mp->m_fdblocks,
- fsc->fdblocks))
- xchk_set_corrupt(sc);
+ fsc->fdblocks)) {
+ if (fsc->frozen)
+ xchk_set_corrupt(sc);
+ else
+ try_again = true;
+ }
if (!xchk_fscount_within_range(sc, frextents, &mp->m_frextents,
- fsc->frextents))
- xchk_set_corrupt(sc);
+ fsc->frextents)) {
+ if (fsc->frozen)
+ xchk_set_corrupt(sc);
+ else
+ try_again = true;
+ }
+
+ if (try_again)
+ return -EDEADLOCK;
return 0;
}
diff --git a/fs/xfs/scrub/health.c b/fs/xfs/scrub/health.c
index d2b2a1cb6533..5e2b09ed6e29 100644
--- a/fs/xfs/scrub/health.c
+++ b/fs/xfs/scrub/health.c
@@ -226,6 +226,16 @@ xchk_ag_btree_healthy_enough(
return true;
}
+ /*
+ * If we just repaired some AG metadata, sc->sick_mask will reflect all
+ * the per-AG metadata types that were repaired. Exclude these from
+ * the filesystem health query because we have not yet updated the
+ * health status and we want everything to be scanned.
+ */
+ if ((sc->flags & XREP_ALREADY_FIXED) &&
+ type_to_health_flag[sc->sm->sm_type].group == XHG_AG)
+ mask &= ~sc->sick_mask;
+
if (xfs_ag_has_sickness(pag, mask)) {
sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XFAIL;
return false;
diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c
index 575f22a02ebe..fb7bbf47ae5d 100644
--- a/fs/xfs/scrub/ialloc.c
+++ b/fs/xfs/scrub/ialloc.c
@@ -328,8 +328,7 @@ xchk_iallocbt_check_cluster_ifree(
goto out;
}
- error = xfs_icache_inode_is_allocated(mp, bs->cur->bc_tp, fsino,
- &ino_inuse);
+ error = xchk_inode_is_allocated(bs->sc, agino, &ino_inuse);
if (error == -ENODATA) {
/* Not cached, just read the disk buffer */
freemask_ok = irec_free ^ !!(dip->di_mode);
diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c
index 3e1e02e340a6..59d7912fb75f 100644
--- a/fs/xfs/scrub/inode.c
+++ b/fs/xfs/scrub/inode.c
@@ -32,15 +32,13 @@ xchk_prepare_iscrub(
{
int error;
- sc->ilock_flags = XFS_IOLOCK_EXCL;
- xfs_ilock(sc->ip, sc->ilock_flags);
+ xchk_ilock(sc, XFS_IOLOCK_EXCL);
error = xchk_trans_alloc(sc, 0);
if (error)
return error;
- sc->ilock_flags |= XFS_ILOCK_EXCL;
- xfs_ilock(sc->ip, XFS_ILOCK_EXCL);
+ xchk_ilock(sc, XFS_ILOCK_EXCL);
return 0;
}
@@ -83,7 +81,10 @@ xchk_setup_inode(
/* We want to scan the opened inode, so lock it and exit. */
if (sc->sm->sm_ino == 0 || sc->sm->sm_ino == ip_in->i_ino) {
- sc->ip = ip_in;
+ error = xchk_install_live_inode(sc, ip_in);
+ if (error)
+ return error;
+
return xchk_prepare_iscrub(sc);
}
diff --git a/fs/xfs/scrub/parent.c b/fs/xfs/scrub/parent.c
index 58d5dfb7ea21..e6155d86f791 100644
--- a/fs/xfs/scrub/parent.c
+++ b/fs/xfs/scrub/parent.c
@@ -150,8 +150,8 @@ xchk_parent_validate(
lock_mode = xchk_parent_ilock_dir(dp);
if (!lock_mode) {
- xfs_iunlock(sc->ip, XFS_ILOCK_EXCL);
- xfs_ilock(sc->ip, XFS_ILOCK_EXCL);
+ xchk_iunlock(sc, XFS_ILOCK_EXCL);
+ xchk_ilock(sc, XFS_ILOCK_EXCL);
error = -EAGAIN;
goto out_rele;
}
diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c
index e6caa358cbda..5671c8153433 100644
--- a/fs/xfs/scrub/quota.c
+++ b/fs/xfs/scrub/quota.c
@@ -59,9 +59,12 @@ xchk_setup_quota(
error = xchk_setup_fs(sc);
if (error)
return error;
- sc->ip = xfs_quota_inode(sc->mp, dqtype);
- xfs_ilock(sc->ip, XFS_ILOCK_EXCL);
- sc->ilock_flags = XFS_ILOCK_EXCL;
+
+ error = xchk_install_live_inode(sc, xfs_quota_inode(sc->mp, dqtype));
+ if (error)
+ return error;
+
+ xchk_ilock(sc, XFS_ILOCK_EXCL);
return 0;
}
@@ -235,13 +238,11 @@ xchk_quota(
* data fork we have to drop ILOCK_EXCL to use the regular dquot
* functions.
*/
- xfs_iunlock(sc->ip, sc->ilock_flags);
- sc->ilock_flags = 0;
+ xchk_iunlock(sc, sc->ilock_flags);
sqi.sc = sc;
sqi.last_id = 0;
error = xfs_qm_dqiterate(mp, dqtype, xchk_quota_item, &sqi);
- sc->ilock_flags = XFS_ILOCK_EXCL;
- xfs_ilock(sc->ip, sc->ilock_flags);
+ xchk_ilock(sc, XFS_ILOCK_EXCL);
if (error == -ECANCELED)
error = 0;
if (!xchk_fblock_process_error(sc, XFS_DATA_FORK,
diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c
new file mode 100644
index 000000000000..86a62420e02c
--- /dev/null
+++ b/fs/xfs/scrub/reap.c
@@ -0,0 +1,498 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2022-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_btree.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_alloc.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_ialloc.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_rmap.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_refcount_btree.h"
+#include "xfs_extent_busy.h"
+#include "xfs_ag.h"
+#include "xfs_ag_resv.h"
+#include "xfs_quota.h"
+#include "xfs_qm.h"
+#include "xfs_bmap.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_attr.h"
+#include "xfs_attr_remote.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+#include "scrub/bitmap.h"
+#include "scrub/reap.h"
+
+/*
+ * Disposal of Blocks from Old Metadata
+ *
+ * Now that we've constructed a new btree to replace the damaged one, we want
+ * to dispose of the blocks that (we think) the old btree was using.
+ * Previously, we used the rmapbt to collect the extents (bitmap) with the
+ * rmap owner corresponding to the tree we rebuilt, collected extents for any
+ * blocks with the same rmap owner that are owned by another data structure
+ * (sublist), and subtracted sublist from bitmap. In theory the extents
+ * remaining in bitmap are the old btree's blocks.
+ *
+ * Unfortunately, it's possible that the btree was crosslinked with other
+ * blocks on disk. The rmap data can tell us if there are multiple owners, so
+ * if the rmapbt says there is an owner of this block other than @oinfo, then
+ * the block is crosslinked. Remove the reverse mapping and continue.
+ *
+ * If there is one rmap record, we can free the block, which removes the
+ * reverse mapping but doesn't add the block to the free space. Our repair
+ * strategy is to hope the other metadata objects crosslinked on this block
+ * will be rebuilt (atop different blocks), thereby removing all the cross
+ * links.
+ *
+ * If there are no rmap records at all, we also free the block. If the btree
+ * being rebuilt lives in the free space (bnobt/cntbt/rmapbt) then there isn't
+ * supposed to be a rmap record and everything is ok. For other btrees there
+ * had to have been an rmap entry for the block to have ended up on @bitmap,
+ * so if it's gone now there's something wrong and the fs will shut down.
+ *
+ * Note: If there are multiple rmap records with only the same rmap owner as
+ * the btree we're trying to rebuild and the block is indeed owned by another
+ * data structure with the same rmap owner, then the block will be in sublist
+ * and therefore doesn't need disposal. If there are multiple rmap records
+ * with only the same rmap owner but the block is not owned by something with
+ * the same rmap owner, the block will be freed.
+ *
+ * The caller is responsible for locking the AG headers for the entire rebuild
+ * operation so that nothing else can sneak in and change the AG state while
+ * we're not looking. We must also invalidate any buffers associated with
+ * @bitmap.
+ */
+
+/* Information about reaping extents after a repair. */
+struct xreap_state {
+ struct xfs_scrub *sc;
+
+ /* Reverse mapping owner and metadata reservation type. */
+ const struct xfs_owner_info *oinfo;
+ enum xfs_ag_resv_type resv;
+
+ /* If true, roll the transaction before reaping the next extent. */
+ bool force_roll;
+
+ /* Number of deferred reaps attached to the current transaction. */
+ unsigned int deferred;
+
+ /* Number of invalidated buffers logged to the current transaction. */
+ unsigned int invalidated;
+
+ /* Number of deferred reaps queued during the whole reap sequence. */
+ unsigned long long total_deferred;
+};
+
+/* Put a block back on the AGFL. */
+STATIC int
+xreap_put_freelist(
+ struct xfs_scrub *sc,
+ xfs_agblock_t agbno)
+{
+ struct xfs_buf *agfl_bp;
+ int error;
+
+ /* Make sure there's space on the freelist. */
+ error = xrep_fix_freelist(sc, true);
+ if (error)
+ return error;
+
+ /*
+ * Since we're "freeing" a lost block onto the AGFL, we have to
+ * create an rmap for the block prior to merging it or else other
+ * parts will break.
+ */
+ error = xfs_rmap_alloc(sc->tp, sc->sa.agf_bp, sc->sa.pag, agbno, 1,
+ &XFS_RMAP_OINFO_AG);
+ if (error)
+ return error;
+
+ /* Put the block on the AGFL. */
+ error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp);
+ if (error)
+ return error;
+
+ error = xfs_alloc_put_freelist(sc->sa.pag, sc->tp, sc->sa.agf_bp,
+ agfl_bp, agbno, 0);
+ if (error)
+ return error;
+ xfs_extent_busy_insert(sc->tp, sc->sa.pag, agbno, 1,
+ XFS_EXTENT_BUSY_SKIP_DISCARD);
+
+ return 0;
+}
+
+/* Are there any uncommitted reap operations? */
+static inline bool xreap_dirty(const struct xreap_state *rs)
+{
+ if (rs->force_roll)
+ return true;
+ if (rs->deferred)
+ return true;
+ if (rs->invalidated)
+ return true;
+ if (rs->total_deferred)
+ return true;
+ return false;
+}
+
+#define XREAP_MAX_BINVAL (2048)
+
+/*
+ * Decide if we want to roll the transaction after reaping an extent. We don't
+ * want to overrun the transaction reservation, so we prohibit more than
+ * 128 EFIs per transaction. For the same reason, we limit the number
+ * of buffer invalidations to 2048.
+ */
+static inline bool xreap_want_roll(const struct xreap_state *rs)
+{
+ if (rs->force_roll)
+ return true;
+ if (rs->deferred > XREP_MAX_ITRUNCATE_EFIS)
+ return true;
+ if (rs->invalidated > XREAP_MAX_BINVAL)
+ return true;
+ return false;
+}
+
+static inline void xreap_reset(struct xreap_state *rs)
+{
+ rs->total_deferred += rs->deferred;
+ rs->deferred = 0;
+ rs->invalidated = 0;
+ rs->force_roll = false;
+}
+
+#define XREAP_MAX_DEFER_CHAIN (2048)
+
+/*
+ * Decide if we want to finish the deferred ops that are attached to the scrub
+ * transaction. We don't want to queue huge chains of deferred ops because
+ * that can consume a lot of log space and kernel memory. Hence we trigger a
+ * xfs_defer_finish if there are more than 2048 deferred reap operations or the
+ * caller did some real work.
+ */
+static inline bool
+xreap_want_defer_finish(const struct xreap_state *rs)
+{
+ if (rs->force_roll)
+ return true;
+ if (rs->total_deferred > XREAP_MAX_DEFER_CHAIN)
+ return true;
+ return false;
+}
+
+static inline void xreap_defer_finish_reset(struct xreap_state *rs)
+{
+ rs->total_deferred = 0;
+ rs->deferred = 0;
+ rs->invalidated = 0;
+ rs->force_roll = false;
+}
+
+/* Try to invalidate the incore buffers for an extent that we're freeing. */
+STATIC void
+xreap_agextent_binval(
+ struct xreap_state *rs,
+ xfs_agblock_t agbno,
+ xfs_extlen_t *aglenp)
+{
+ struct xfs_scrub *sc = rs->sc;
+ struct xfs_perag *pag = sc->sa.pag;
+ struct xfs_mount *mp = sc->mp;
+ xfs_agnumber_t agno = sc->sa.pag->pag_agno;
+ xfs_agblock_t agbno_next = agbno + *aglenp;
+ xfs_agblock_t bno = agbno;
+
+ /*
+ * Avoid invalidating AG headers and post-EOFS blocks because we never
+ * own those.
+ */
+ if (!xfs_verify_agbno(pag, agbno) ||
+ !xfs_verify_agbno(pag, agbno_next - 1))
+ return;
+
+ /*
+ * If there are incore buffers for these blocks, invalidate them. We
+ * assume that the lack of any other known owners means that the buffer
+ * can be locked without risk of deadlocking. The buffer cache cannot
+ * detect aliasing, so employ nested loops to scan for incore buffers
+ * of any plausible size.
+ */
+ while (bno < agbno_next) {
+ xfs_agblock_t fsbcount;
+ xfs_agblock_t max_fsbs;
+
+ /*
+ * Max buffer size is the max remote xattr buffer size, which
+ * is one fs block larger than 64k.
+ */
+ max_fsbs = min_t(xfs_agblock_t, agbno_next - bno,
+ xfs_attr3_rmt_blocks(mp, XFS_XATTR_SIZE_MAX));
+
+ for (fsbcount = 1; fsbcount < max_fsbs; fsbcount++) {
+ struct xfs_buf *bp = NULL;
+ xfs_daddr_t daddr;
+ int error;
+
+ daddr = XFS_AGB_TO_DADDR(mp, agno, bno);
+ error = xfs_buf_incore(mp->m_ddev_targp, daddr,
+ XFS_FSB_TO_BB(mp, fsbcount),
+ XBF_LIVESCAN, &bp);
+ if (error)
+ continue;
+
+ xfs_trans_bjoin(sc->tp, bp);
+ xfs_trans_binval(sc->tp, bp);
+ rs->invalidated++;
+
+ /*
+ * Stop invalidating if we've hit the limit; we should
+ * still have enough reservation left to free however
+ * far we've gotten.
+ */
+ if (rs->invalidated > XREAP_MAX_BINVAL) {
+ *aglenp -= agbno_next - bno;
+ goto out;
+ }
+ }
+
+ bno++;
+ }
+
+out:
+ trace_xreap_agextent_binval(sc->sa.pag, agbno, *aglenp);
+}
+
+/*
+ * Figure out the longest run of blocks that we can dispose of with a single
+ * call. Cross-linked blocks should have their reverse mappings removed, but
+ * single-owner extents can be freed. AGFL blocks can only be put back one at
+ * a time.
+ */
+STATIC int
+xreap_agextent_select(
+ struct xreap_state *rs,
+ xfs_agblock_t agbno,
+ xfs_agblock_t agbno_next,
+ bool *crosslinked,
+ xfs_extlen_t *aglenp)
+{
+ struct xfs_scrub *sc = rs->sc;
+ struct xfs_btree_cur *cur;
+ xfs_agblock_t bno = agbno + 1;
+ xfs_extlen_t len = 1;
+ int error;
+
+ /*
+ * Determine if there are any other rmap records covering the first
+ * block of this extent. If so, the block is crosslinked.
+ */
+ cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
+ sc->sa.pag);
+ error = xfs_rmap_has_other_keys(cur, agbno, 1, rs->oinfo,
+ crosslinked);
+ if (error)
+ goto out_cur;
+
+ /* AGFL blocks can only be deal with one at a time. */
+ if (rs->resv == XFS_AG_RESV_AGFL)
+ goto out_found;
+
+ /*
+ * Figure out how many of the subsequent blocks have the same crosslink
+ * status.
+ */
+ while (bno < agbno_next) {
+ bool also_crosslinked;
+
+ error = xfs_rmap_has_other_keys(cur, bno, 1, rs->oinfo,
+ &also_crosslinked);
+ if (error)
+ goto out_cur;
+
+ if (*crosslinked != also_crosslinked)
+ break;
+
+ len++;
+ bno++;
+ }
+
+out_found:
+ *aglenp = len;
+ trace_xreap_agextent_select(sc->sa.pag, agbno, len, *crosslinked);
+out_cur:
+ xfs_btree_del_cursor(cur, error);
+ return error;
+}
+
+/*
+ * Dispose of as much of the beginning of this AG extent as possible. The
+ * number of blocks disposed of will be returned in @aglenp.
+ */
+STATIC int
+xreap_agextent_iter(
+ struct xreap_state *rs,
+ xfs_agblock_t agbno,
+ xfs_extlen_t *aglenp,
+ bool crosslinked)
+{
+ struct xfs_scrub *sc = rs->sc;
+ xfs_fsblock_t fsbno;
+ int error = 0;
+
+ fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno, agbno);
+
+ /*
+ * If there are other rmappings, this block is cross linked and must
+ * not be freed. Remove the reverse mapping and move on. Otherwise,
+ * we were the only owner of the block, so free the extent, which will
+ * also remove the rmap.
+ *
+ * XXX: XFS doesn't support detecting the case where a single block
+ * metadata structure is crosslinked with a multi-block structure
+ * because the buffer cache doesn't detect aliasing problems, so we
+ * can't fix 100% of crosslinking problems (yet). The verifiers will
+ * blow on writeout, the filesystem will shut down, and the admin gets
+ * to run xfs_repair.
+ */
+ if (crosslinked) {
+ trace_xreap_dispose_unmap_extent(sc->sa.pag, agbno, *aglenp);
+
+ rs->force_roll = true;
+ return xfs_rmap_free(sc->tp, sc->sa.agf_bp, sc->sa.pag, agbno,
+ *aglenp, rs->oinfo);
+ }
+
+ trace_xreap_dispose_free_extent(sc->sa.pag, agbno, *aglenp);
+
+ /*
+ * Invalidate as many buffers as we can, starting at agbno. If this
+ * function sets *aglenp to zero, the transaction is full of logged
+ * buffer invalidations, so we need to return early so that we can
+ * roll and retry.
+ */
+ xreap_agextent_binval(rs, agbno, aglenp);
+ if (*aglenp == 0) {
+ ASSERT(xreap_want_roll(rs));
+ return 0;
+ }
+
+ /* Put blocks back on the AGFL one at a time. */
+ if (rs->resv == XFS_AG_RESV_AGFL) {
+ ASSERT(*aglenp == 1);
+ error = xreap_put_freelist(sc, agbno);
+ if (error)
+ return error;
+
+ rs->force_roll = true;
+ return 0;
+ }
+
+ /*
+ * Use deferred frees to get rid of the old btree blocks to try to
+ * minimize the window in which we could crash and lose the old blocks.
+ */
+ error = __xfs_free_extent_later(sc->tp, fsbno, *aglenp, rs->oinfo,
+ rs->resv, true);
+ if (error)
+ return error;
+
+ rs->deferred++;
+ return 0;
+}
+
+/*
+ * Break an AG metadata extent into sub-extents by fate (crosslinked, not
+ * crosslinked), and dispose of each sub-extent separately.
+ */
+STATIC int
+xreap_agmeta_extent(
+ uint64_t fsbno,
+ uint64_t len,
+ void *priv)
+{
+ struct xreap_state *rs = priv;
+ struct xfs_scrub *sc = rs->sc;
+ xfs_agblock_t agbno = fsbno;
+ xfs_agblock_t agbno_next = agbno + len;
+ int error = 0;
+
+ ASSERT(len <= XFS_MAX_BMBT_EXTLEN);
+ ASSERT(sc->ip == NULL);
+
+ while (agbno < agbno_next) {
+ xfs_extlen_t aglen;
+ bool crosslinked;
+
+ error = xreap_agextent_select(rs, agbno, agbno_next,
+ &crosslinked, &aglen);
+ if (error)
+ return error;
+
+ error = xreap_agextent_iter(rs, agbno, &aglen, crosslinked);
+ if (error)
+ return error;
+
+ if (xreap_want_defer_finish(rs)) {
+ error = xrep_defer_finish(sc);
+ if (error)
+ return error;
+ xreap_defer_finish_reset(rs);
+ } else if (xreap_want_roll(rs)) {
+ error = xrep_roll_ag_trans(sc);
+ if (error)
+ return error;
+ xreap_reset(rs);
+ }
+
+ agbno += aglen;
+ }
+
+ return 0;
+}
+
+/* Dispose of every block of every AG metadata extent in the bitmap. */
+int
+xrep_reap_agblocks(
+ struct xfs_scrub *sc,
+ struct xagb_bitmap *bitmap,
+ const struct xfs_owner_info *oinfo,
+ enum xfs_ag_resv_type type)
+{
+ struct xreap_state rs = {
+ .sc = sc,
+ .oinfo = oinfo,
+ .resv = type,
+ };
+ int error;
+
+ ASSERT(xfs_has_rmapbt(sc->mp));
+ ASSERT(sc->ip == NULL);
+
+ error = xagb_bitmap_walk(bitmap, xreap_agmeta_extent, &rs);
+ if (error)
+ return error;
+
+ if (xreap_dirty(&rs))
+ return xrep_defer_finish(sc);
+
+ return 0;
+}
diff --git a/fs/xfs/scrub/reap.h b/fs/xfs/scrub/reap.h
new file mode 100644
index 000000000000..fe24626af164
--- /dev/null
+++ b/fs/xfs/scrub/reap.h
@@ -0,0 +1,12 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2022-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_REAP_H__
+#define __XFS_SCRUB_REAP_H__
+
+int xrep_reap_agblocks(struct xfs_scrub *sc, struct xagb_bitmap *bitmap,
+ const struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type);
+
+#endif /* __XFS_SCRUB_REAP_H__ */
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index ac6d8803e660..1b8b5439f2d7 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -26,11 +26,13 @@
#include "xfs_ag_resv.h"
#include "xfs_quota.h"
#include "xfs_qm.h"
+#include "xfs_defer.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
#include "scrub/repair.h"
#include "scrub/bitmap.h"
+#include "scrub/stats.h"
/*
* Attempt to repair some metadata, if the metadata is corrupt and userspace
@@ -39,8 +41,10 @@
*/
int
xrep_attempt(
- struct xfs_scrub *sc)
+ struct xfs_scrub *sc,
+ struct xchk_stats_run *run)
{
+ u64 repair_start;
int error = 0;
trace_xrep_attempt(XFS_I(file_inode(sc->file)), sc->sm, error);
@@ -49,8 +53,11 @@ xrep_attempt(
/* Repair whatever's broken. */
ASSERT(sc->ops->repair);
+ run->repair_attempted = true;
+ repair_start = xchk_stats_now();
error = sc->ops->repair(sc);
trace_xrep_done(XFS_I(file_inode(sc->file)), sc->sm, error);
+ run->repair_ns += xchk_stats_elapsed_ns(repair_start);
switch (error) {
case 0:
/*
@@ -59,14 +66,17 @@ xrep_attempt(
*/
sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT;
sc->flags |= XREP_ALREADY_FIXED;
+ run->repair_succeeded = true;
return -EAGAIN;
case -ECHRNG:
sc->flags |= XCHK_NEED_DRAIN;
+ run->retries++;
return -EAGAIN;
case -EDEADLOCK:
/* Tell the caller to try again having grabbed all the locks. */
if (!(sc->flags & XCHK_TRY_HARDER)) {
sc->flags |= XCHK_TRY_HARDER;
+ run->retries++;
return -EAGAIN;
}
/*
@@ -166,6 +176,56 @@ xrep_roll_ag_trans(
return 0;
}
+/* Finish all deferred work attached to the repair transaction. */
+int
+xrep_defer_finish(
+ struct xfs_scrub *sc)
+{
+ int error;
+
+ /*
+ * Keep the AG header buffers locked while we complete deferred work
+ * items. Ensure that both AG buffers are dirty and held when we roll
+ * the transaction so that they move forward in the log without losing
+ * the bli (and hence the bli type) when the transaction commits.
+ *
+ * Normal code would never hold clean buffers across a roll, but repair
+ * needs both buffers to maintain a total lock on the AG.
+ */
+ if (sc->sa.agi_bp) {
+ xfs_ialloc_log_agi(sc->tp, sc->sa.agi_bp, XFS_AGI_MAGICNUM);
+ xfs_trans_bhold(sc->tp, sc->sa.agi_bp);
+ }
+
+ if (sc->sa.agf_bp) {
+ xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, XFS_AGF_MAGICNUM);
+ xfs_trans_bhold(sc->tp, sc->sa.agf_bp);
+ }
+
+ /*
+ * Finish all deferred work items. We still hold the AG header buffers
+ * locked regardless of whether or not that succeeds. On failure, the
+ * buffers will be released during teardown on our way out of the
+ * kernel. If successful, join the buffers to the new transaction
+ * and move on.
+ */
+ error = xfs_defer_finish(&sc->tp);
+ if (error)
+ return error;
+
+ /*
+ * Release the hold that we set above because defer_finish won't do
+ * that for us. The defer roll code redirties held buffers after each
+ * roll, so the AG header buffers should be ready for logging.
+ */
+ if (sc->sa.agi_bp)
+ xfs_trans_bhold_release(sc->tp, sc->sa.agi_bp);
+ if (sc->sa.agf_bp)
+ xfs_trans_bhold_release(sc->tp, sc->sa.agf_bp);
+
+ return 0;
+}
+
/*
* Does the given AG have enough space to rebuild a btree? Neither AG
* reservation can be critical, and we must have enough space (factoring
@@ -297,89 +357,6 @@ xrep_calc_ag_resblks(
return max(max(bnobt_sz, inobt_sz), max(rmapbt_sz, refcbt_sz));
}
-/* Allocate a block in an AG. */
-int
-xrep_alloc_ag_block(
- struct xfs_scrub *sc,
- const struct xfs_owner_info *oinfo,
- xfs_fsblock_t *fsbno,
- enum xfs_ag_resv_type resv)
-{
- struct xfs_alloc_arg args = {0};
- xfs_agblock_t bno;
- int error;
-
- switch (resv) {
- case XFS_AG_RESV_AGFL:
- case XFS_AG_RESV_RMAPBT:
- error = xfs_alloc_get_freelist(sc->sa.pag, sc->tp,
- sc->sa.agf_bp, &bno, 1);
- if (error)
- return error;
- if (bno == NULLAGBLOCK)
- return -ENOSPC;
- xfs_extent_busy_reuse(sc->mp, sc->sa.pag, bno, 1, false);
- *fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno, bno);
- if (resv == XFS_AG_RESV_RMAPBT)
- xfs_ag_resv_rmapbt_alloc(sc->mp, sc->sa.pag->pag_agno);
- return 0;
- default:
- break;
- }
-
- args.tp = sc->tp;
- args.mp = sc->mp;
- args.pag = sc->sa.pag;
- args.oinfo = *oinfo;
- args.minlen = 1;
- args.maxlen = 1;
- args.prod = 1;
- args.resv = resv;
-
- error = xfs_alloc_vextent_this_ag(&args, sc->sa.pag->pag_agno);
- if (error)
- return error;
- if (args.fsbno == NULLFSBLOCK)
- return -ENOSPC;
- ASSERT(args.len == 1);
- *fsbno = args.fsbno;
-
- return 0;
-}
-
-/* Initialize a new AG btree root block with zero entries. */
-int
-xrep_init_btblock(
- struct xfs_scrub *sc,
- xfs_fsblock_t fsb,
- struct xfs_buf **bpp,
- xfs_btnum_t btnum,
- const struct xfs_buf_ops *ops)
-{
- struct xfs_trans *tp = sc->tp;
- struct xfs_mount *mp = sc->mp;
- struct xfs_buf *bp;
- int error;
-
- trace_xrep_init_btblock(mp, XFS_FSB_TO_AGNO(mp, fsb),
- XFS_FSB_TO_AGBNO(mp, fsb), btnum);
-
- ASSERT(XFS_FSB_TO_AGNO(mp, fsb) == sc->sa.pag->pag_agno);
- error = xfs_trans_get_buf(tp, mp->m_ddev_targp,
- XFS_FSB_TO_DADDR(mp, fsb), XFS_FSB_TO_BB(mp, 1), 0,
- &bp);
- if (error)
- return error;
- xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
- xfs_btree_init_block(mp, bp, btnum, 0, 0, sc->sa.pag->pag_agno);
- xfs_trans_buf_set_type(tp, bp, XFS_BLFT_BTREE_BUF);
- xfs_trans_log_buf(tp, bp, 0, BBTOB(bp->b_length) - 1);
- bp->b_ops = ops;
- *bpp = bp;
-
- return 0;
-}
-
/*
* Reconstructing per-AG Btrees
*
@@ -404,91 +381,8 @@ xrep_init_btblock(
* sublist. As with the other btrees we subtract sublist from bitmap, and the
* result (since the rmapbt lives in the free space) are the blocks from the
* old rmapbt.
- *
- * Disposal of Blocks from Old per-AG Btrees
- *
- * Now that we've constructed a new btree to replace the damaged one, we want
- * to dispose of the blocks that (we think) the old btree was using.
- * Previously, we used the rmapbt to collect the extents (bitmap) with the
- * rmap owner corresponding to the tree we rebuilt, collected extents for any
- * blocks with the same rmap owner that are owned by another data structure
- * (sublist), and subtracted sublist from bitmap. In theory the extents
- * remaining in bitmap are the old btree's blocks.
- *
- * Unfortunately, it's possible that the btree was crosslinked with other
- * blocks on disk. The rmap data can tell us if there are multiple owners, so
- * if the rmapbt says there is an owner of this block other than @oinfo, then
- * the block is crosslinked. Remove the reverse mapping and continue.
- *
- * If there is one rmap record, we can free the block, which removes the
- * reverse mapping but doesn't add the block to the free space. Our repair
- * strategy is to hope the other metadata objects crosslinked on this block
- * will be rebuilt (atop different blocks), thereby removing all the cross
- * links.
- *
- * If there are no rmap records at all, we also free the block. If the btree
- * being rebuilt lives in the free space (bnobt/cntbt/rmapbt) then there isn't
- * supposed to be a rmap record and everything is ok. For other btrees there
- * had to have been an rmap entry for the block to have ended up on @bitmap,
- * so if it's gone now there's something wrong and the fs will shut down.
- *
- * Note: If there are multiple rmap records with only the same rmap owner as
- * the btree we're trying to rebuild and the block is indeed owned by another
- * data structure with the same rmap owner, then the block will be in sublist
- * and therefore doesn't need disposal. If there are multiple rmap records
- * with only the same rmap owner but the block is not owned by something with
- * the same rmap owner, the block will be freed.
- *
- * The caller is responsible for locking the AG headers for the entire rebuild
- * operation so that nothing else can sneak in and change the AG state while
- * we're not looking. We also assume that the caller already invalidated any
- * buffers associated with @bitmap.
*/
-static int
-xrep_invalidate_block(
- uint64_t fsbno,
- void *priv)
-{
- struct xfs_scrub *sc = priv;
- struct xfs_buf *bp;
- int error;
-
- /* Skip AG headers and post-EOFS blocks */
- if (!xfs_verify_fsbno(sc->mp, fsbno))
- return 0;
-
- error = xfs_buf_incore(sc->mp->m_ddev_targp,
- XFS_FSB_TO_DADDR(sc->mp, fsbno),
- XFS_FSB_TO_BB(sc->mp, 1), XBF_TRYLOCK, &bp);
- if (error)
- return 0;
-
- xfs_trans_bjoin(sc->tp, bp);
- xfs_trans_binval(sc->tp, bp);
- return 0;
-}
-
-/*
- * Invalidate buffers for per-AG btree blocks we're dumping. This function
- * is not intended for use with file data repairs; we have bunmapi for that.
- */
-int
-xrep_invalidate_blocks(
- struct xfs_scrub *sc,
- struct xbitmap *bitmap)
-{
- /*
- * For each block in each extent, see if there's an incore buffer for
- * exactly that block; if so, invalidate it. The buffer cache only
- * lets us look for one buffer at a time, so we have to look one block
- * at a time. Avoid invalidating AG headers and post-EOFS blocks
- * because we never own those; and if we can't TRYLOCK the buffer we
- * assume it's owned by someone else.
- */
- return xbitmap_walk_bits(bitmap, xrep_invalidate_block, sc);
-}
-
/* Ensure the freelist is the correct size. */
int
xrep_fix_freelist(
@@ -507,155 +401,6 @@ xrep_fix_freelist(
can_shrink ? 0 : XFS_ALLOC_FLAG_NOSHRINK);
}
-/* Information about reaping extents after a repair. */
-struct xrep_reap_state {
- struct xfs_scrub *sc;
-
- /* Reverse mapping owner and metadata reservation type. */
- const struct xfs_owner_info *oinfo;
- enum xfs_ag_resv_type resv;
-};
-
-/*
- * Put a block back on the AGFL.
- */
-STATIC int
-xrep_put_freelist(
- struct xfs_scrub *sc,
- xfs_agblock_t agbno)
-{
- struct xfs_buf *agfl_bp;
- int error;
-
- /* Make sure there's space on the freelist. */
- error = xrep_fix_freelist(sc, true);
- if (error)
- return error;
-
- /*
- * Since we're "freeing" a lost block onto the AGFL, we have to
- * create an rmap for the block prior to merging it or else other
- * parts will break.
- */
- error = xfs_rmap_alloc(sc->tp, sc->sa.agf_bp, sc->sa.pag, agbno, 1,
- &XFS_RMAP_OINFO_AG);
- if (error)
- return error;
-
- /* Put the block on the AGFL. */
- error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp);
- if (error)
- return error;
-
- error = xfs_alloc_put_freelist(sc->sa.pag, sc->tp, sc->sa.agf_bp,
- agfl_bp, agbno, 0);
- if (error)
- return error;
- xfs_extent_busy_insert(sc->tp, sc->sa.pag, agbno, 1,
- XFS_EXTENT_BUSY_SKIP_DISCARD);
-
- return 0;
-}
-
-/* Dispose of a single block. */
-STATIC int
-xrep_reap_block(
- uint64_t fsbno,
- void *priv)
-{
- struct xrep_reap_state *rs = priv;
- struct xfs_scrub *sc = rs->sc;
- struct xfs_btree_cur *cur;
- struct xfs_buf *agf_bp = NULL;
- xfs_agblock_t agbno;
- bool has_other_rmap;
- int error;
-
- ASSERT(sc->ip != NULL ||
- XFS_FSB_TO_AGNO(sc->mp, fsbno) == sc->sa.pag->pag_agno);
- trace_xrep_dispose_btree_extent(sc->mp,
- XFS_FSB_TO_AGNO(sc->mp, fsbno),
- XFS_FSB_TO_AGBNO(sc->mp, fsbno), 1);
-
- agbno = XFS_FSB_TO_AGBNO(sc->mp, fsbno);
- ASSERT(XFS_FSB_TO_AGNO(sc->mp, fsbno) == sc->sa.pag->pag_agno);
-
- /*
- * If we are repairing per-inode metadata, we need to read in the AGF
- * buffer. Otherwise, we're repairing a per-AG structure, so reuse
- * the AGF buffer that the setup functions already grabbed.
- */
- if (sc->ip) {
- error = xfs_alloc_read_agf(sc->sa.pag, sc->tp, 0, &agf_bp);
- if (error)
- return error;
- } else {
- agf_bp = sc->sa.agf_bp;
- }
- cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, agf_bp, sc->sa.pag);
-
- /* Can we find any other rmappings? */
- error = xfs_rmap_has_other_keys(cur, agbno, 1, rs->oinfo,
- &has_other_rmap);
- xfs_btree_del_cursor(cur, error);
- if (error)
- goto out_free;
-
- /*
- * If there are other rmappings, this block is cross linked and must
- * not be freed. Remove the reverse mapping and move on. Otherwise,
- * we were the only owner of the block, so free the extent, which will
- * also remove the rmap.
- *
- * XXX: XFS doesn't support detecting the case where a single block
- * metadata structure is crosslinked with a multi-block structure
- * because the buffer cache doesn't detect aliasing problems, so we
- * can't fix 100% of crosslinking problems (yet). The verifiers will
- * blow on writeout, the filesystem will shut down, and the admin gets
- * to run xfs_repair.
- */
- if (has_other_rmap)
- error = xfs_rmap_free(sc->tp, agf_bp, sc->sa.pag, agbno,
- 1, rs->oinfo);
- else if (rs->resv == XFS_AG_RESV_AGFL)
- error = xrep_put_freelist(sc, agbno);
- else
- error = xfs_free_extent(sc->tp, sc->sa.pag, agbno, 1, rs->oinfo,
- rs->resv);
- if (agf_bp != sc->sa.agf_bp)
- xfs_trans_brelse(sc->tp, agf_bp);
- if (error)
- return error;
-
- if (sc->ip)
- return xfs_trans_roll_inode(&sc->tp, sc->ip);
- return xrep_roll_ag_trans(sc);
-
-out_free:
- if (agf_bp != sc->sa.agf_bp)
- xfs_trans_brelse(sc->tp, agf_bp);
- return error;
-}
-
-/* Dispose of every block of every extent in the bitmap. */
-int
-xrep_reap_extents(
- struct xfs_scrub *sc,
- struct xbitmap *bitmap,
- const struct xfs_owner_info *oinfo,
- enum xfs_ag_resv_type type)
-{
- struct xrep_reap_state rs = {
- .sc = sc,
- .oinfo = oinfo,
- .resv = type,
- };
-
- ASSERT(xfs_has_rmapbt(sc->mp));
-
- return xbitmap_walk_bits(bitmap, xrep_reap_block, &rs);
-}
-
/*
* Finding per-AG Btree Roots for AGF/AGI Reconstruction
*
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index dce791c679ee..60d2a9ae5f2e 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -8,6 +8,8 @@
#include "xfs_quota_defs.h"
+struct xchk_stats_run;
+
static inline int xrep_notsupported(struct xfs_scrub *sc)
{
return -EOPNOTSUPP;
@@ -15,28 +17,28 @@ static inline int xrep_notsupported(struct xfs_scrub *sc)
#ifdef CONFIG_XFS_ONLINE_REPAIR
+/*
+ * This is the maximum number of deferred extent freeing item extents (EFIs)
+ * that we'll attach to a transaction without rolling the transaction to avoid
+ * overrunning a tr_itruncate reservation.
+ */
+#define XREP_MAX_ITRUNCATE_EFIS (128)
+
+
/* Repair helpers */
-int xrep_attempt(struct xfs_scrub *sc);
+int xrep_attempt(struct xfs_scrub *sc, struct xchk_stats_run *run);
void xrep_failure(struct xfs_mount *mp);
int xrep_roll_ag_trans(struct xfs_scrub *sc);
+int xrep_defer_finish(struct xfs_scrub *sc);
bool xrep_ag_has_space(struct xfs_perag *pag, xfs_extlen_t nr_blocks,
enum xfs_ag_resv_type type);
xfs_extlen_t xrep_calc_ag_resblks(struct xfs_scrub *sc);
-int xrep_alloc_ag_block(struct xfs_scrub *sc,
- const struct xfs_owner_info *oinfo, xfs_fsblock_t *fsbno,
- enum xfs_ag_resv_type resv);
-int xrep_init_btblock(struct xfs_scrub *sc, xfs_fsblock_t fsb,
- struct xfs_buf **bpp, xfs_btnum_t btnum,
- const struct xfs_buf_ops *ops);
struct xbitmap;
struct xagb_bitmap;
int xrep_fix_freelist(struct xfs_scrub *sc, bool can_shrink);
-int xrep_invalidate_blocks(struct xfs_scrub *sc, struct xbitmap *btlist);
-int xrep_reap_extents(struct xfs_scrub *sc, struct xbitmap *exlist,
- const struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type);
struct xrep_find_ag_btree {
/* in: rmap owner of the btree we're looking for */
@@ -70,7 +72,8 @@ int xrep_agi(struct xfs_scrub *sc);
static inline int
xrep_attempt(
- struct xfs_scrub *sc)
+ struct xfs_scrub *sc,
+ struct xchk_stats_run *run)
{
return -EOPNOTSUPP;
}
diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c
index e7dace7b4be8..008ddb599e13 100644
--- a/fs/xfs/scrub/rtbitmap.c
+++ b/fs/xfs/scrub/rtbitmap.c
@@ -19,19 +19,20 @@
/* Set us up with the realtime metadata locked. */
int
-xchk_setup_rt(
+xchk_setup_rtbitmap(
struct xfs_scrub *sc)
{
int error;
- error = xchk_setup_fs(sc);
+ error = xchk_trans_alloc(sc, 0);
if (error)
return error;
- sc->ilock_flags = XFS_ILOCK_EXCL | XFS_ILOCK_RTBITMAP;
- sc->ip = sc->mp->m_rbmip;
- xfs_ilock(sc->ip, sc->ilock_flags);
+ error = xchk_install_live_inode(sc, sc->mp->m_rbmip);
+ if (error)
+ return error;
+ xchk_ilock(sc, XFS_ILOCK_EXCL | XFS_ILOCK_RTBITMAP);
return 0;
}
@@ -123,43 +124,6 @@ out:
return error;
}
-/* Scrub the realtime summary. */
-int
-xchk_rtsummary(
- struct xfs_scrub *sc)
-{
- struct xfs_inode *rsumip = sc->mp->m_rsumip;
- struct xfs_inode *old_ip = sc->ip;
- uint old_ilock_flags = sc->ilock_flags;
- int error = 0;
-
- /*
- * We ILOCK'd the rt bitmap ip in the setup routine, now lock the
- * rt summary ip in compliance with the rt inode locking rules.
- *
- * Since we switch sc->ip to rsumip we have to save the old ilock
- * flags so that we don't mix up the inode state that @sc tracks.
- */
- sc->ip = rsumip;
- sc->ilock_flags = XFS_ILOCK_EXCL | XFS_ILOCK_RTSUM;
- xfs_ilock(sc->ip, sc->ilock_flags);
-
- /* Invoke the fork scrubber. */
- error = xchk_metadata_inode_forks(sc);
- if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
- goto out;
-
- /* XXX: implement this some day */
- xchk_set_incomplete(sc);
-out:
- /* Switch back to the rtbitmap inode and lock flags. */
- xfs_iunlock(sc->ip, sc->ilock_flags);
- sc->ilock_flags = old_ilock_flags;
- sc->ip = old_ip;
- return error;
-}
-
-
/* xref check that the extent is not free in the rtbitmap */
void
xchk_xref_is_used_rt_space(
diff --git a/fs/xfs/scrub/rtsummary.c b/fs/xfs/scrub/rtsummary.c
new file mode 100644
index 000000000000..437ed9acbb27
--- /dev/null
+++ b/fs/xfs/scrub/rtsummary.c
@@ -0,0 +1,264 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2017-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_btree.h"
+#include "xfs_inode.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_rtalloc.h"
+#include "xfs_bit.h"
+#include "xfs_bmap.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+#include "scrub/xfile.h"
+
+/*
+ * Realtime Summary
+ * ================
+ *
+ * We check the realtime summary by scanning the realtime bitmap file to create
+ * a new summary file incore, and then we compare the computed version against
+ * the ondisk version. We use the 'xfile' functionality to store this
+ * (potentially large) amount of data in pageable memory.
+ */
+
+/* Set us up to check the rtsummary file. */
+int
+xchk_setup_rtsummary(
+ struct xfs_scrub *sc)
+{
+ struct xfs_mount *mp = sc->mp;
+ char *descr;
+ int error;
+
+ /*
+ * Create an xfile to construct a new rtsummary file. The xfile allows
+ * us to avoid pinning kernel memory for this purpose.
+ */
+ descr = xchk_xfile_descr(sc, "realtime summary file");
+ error = xfile_create(descr, mp->m_rsumsize, &sc->xfile);
+ kfree(descr);
+ if (error)
+ return error;
+
+ error = xchk_trans_alloc(sc, 0);
+ if (error)
+ return error;
+
+ /* Allocate a memory buffer for the summary comparison. */
+ sc->buf = kvmalloc(mp->m_sb.sb_blocksize, XCHK_GFP_FLAGS);
+ if (!sc->buf)
+ return -ENOMEM;
+
+ error = xchk_install_live_inode(sc, mp->m_rsumip);
+ if (error)
+ return error;
+
+ /*
+ * Locking order requires us to take the rtbitmap first. We must be
+ * careful to unlock it ourselves when we are done with the rtbitmap
+ * file since the scrub infrastructure won't do that for us. Only
+ * then we can lock the rtsummary inode.
+ */
+ xfs_ilock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
+ xchk_ilock(sc, XFS_ILOCK_EXCL | XFS_ILOCK_RTSUM);
+ return 0;
+}
+
+/* Helper functions to record suminfo words in an xfile. */
+
+typedef unsigned int xchk_rtsumoff_t;
+
+static inline int
+xfsum_load(
+ struct xfs_scrub *sc,
+ xchk_rtsumoff_t sumoff,
+ xfs_suminfo_t *info)
+{
+ return xfile_obj_load(sc->xfile, info, sizeof(xfs_suminfo_t),
+ sumoff << XFS_WORDLOG);
+}
+
+static inline int
+xfsum_store(
+ struct xfs_scrub *sc,
+ xchk_rtsumoff_t sumoff,
+ const xfs_suminfo_t info)
+{
+ return xfile_obj_store(sc->xfile, &info, sizeof(xfs_suminfo_t),
+ sumoff << XFS_WORDLOG);
+}
+
+static inline int
+xfsum_copyout(
+ struct xfs_scrub *sc,
+ xchk_rtsumoff_t sumoff,
+ xfs_suminfo_t *info,
+ unsigned int nr_words)
+{
+ return xfile_obj_load(sc->xfile, info, nr_words << XFS_WORDLOG,
+ sumoff << XFS_WORDLOG);
+}
+
+/* Update the summary file to reflect the free extent that we've accumulated. */
+STATIC int
+xchk_rtsum_record_free(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ const struct xfs_rtalloc_rec *rec,
+ void *priv)
+{
+ struct xfs_scrub *sc = priv;
+ xfs_fileoff_t rbmoff;
+ xfs_rtblock_t rtbno;
+ xfs_filblks_t rtlen;
+ xchk_rtsumoff_t offs;
+ unsigned int lenlog;
+ xfs_suminfo_t v = 0;
+ int error = 0;
+
+ if (xchk_should_terminate(sc, &error))
+ return error;
+
+ /* Compute the relevant location in the rtsum file. */
+ rbmoff = XFS_BITTOBLOCK(mp, rec->ar_startext);
+ lenlog = XFS_RTBLOCKLOG(rec->ar_extcount);
+ offs = XFS_SUMOFFS(mp, lenlog, rbmoff);
+
+ rtbno = rec->ar_startext * mp->m_sb.sb_rextsize;
+ rtlen = rec->ar_extcount * mp->m_sb.sb_rextsize;
+
+ if (!xfs_verify_rtext(mp, rtbno, rtlen)) {
+ xchk_ino_xref_set_corrupt(sc, mp->m_rbmip->i_ino);
+ return -EFSCORRUPTED;
+ }
+
+ /* Bump the summary count. */
+ error = xfsum_load(sc, offs, &v);
+ if (error)
+ return error;
+
+ v++;
+ trace_xchk_rtsum_record_free(mp, rec->ar_startext, rec->ar_extcount,
+ lenlog, offs, v);
+
+ return xfsum_store(sc, offs, v);
+}
+
+/* Compute the realtime summary from the realtime bitmap. */
+STATIC int
+xchk_rtsum_compute(
+ struct xfs_scrub *sc)
+{
+ struct xfs_mount *mp = sc->mp;
+ unsigned long long rtbmp_bytes;
+
+ /* If the bitmap size doesn't match the computed size, bail. */
+ rtbmp_bytes = howmany_64(mp->m_sb.sb_rextents, NBBY);
+ if (roundup_64(rtbmp_bytes, mp->m_sb.sb_blocksize) !=
+ mp->m_rbmip->i_disk_size)
+ return -EFSCORRUPTED;
+
+ return xfs_rtalloc_query_all(sc->mp, sc->tp, xchk_rtsum_record_free,
+ sc);
+}
+
+/* Compare the rtsummary file against the one we computed. */
+STATIC int
+xchk_rtsum_compare(
+ struct xfs_scrub *sc)
+{
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_buf *bp;
+ struct xfs_bmbt_irec map;
+ xfs_fileoff_t off;
+ xchk_rtsumoff_t sumoff = 0;
+ int nmap;
+
+ for (off = 0; off < XFS_B_TO_FSB(mp, mp->m_rsumsize); off++) {
+ int error = 0;
+
+ if (xchk_should_terminate(sc, &error))
+ return error;
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return 0;
+
+ /* Make sure we have a written extent. */
+ nmap = 1;
+ error = xfs_bmapi_read(mp->m_rsumip, off, 1, &map, &nmap,
+ XFS_DATA_FORK);
+ if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, off, &error))
+ return error;
+
+ if (nmap != 1 || !xfs_bmap_is_written_extent(&map)) {
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, off);
+ return 0;
+ }
+
+ /* Read a block's worth of ondisk rtsummary file. */
+ error = xfs_rtbuf_get(mp, sc->tp, off, 1, &bp);
+ if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, off, &error))
+ return error;
+
+ /* Read a block's worth of computed rtsummary file. */
+ error = xfsum_copyout(sc, sumoff, sc->buf, mp->m_blockwsize);
+ if (error) {
+ xfs_trans_brelse(sc->tp, bp);
+ return error;
+ }
+
+ if (memcmp(bp->b_addr, sc->buf,
+ mp->m_blockwsize << XFS_WORDLOG) != 0)
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, off);
+
+ xfs_trans_brelse(sc->tp, bp);
+ sumoff += mp->m_blockwsize;
+ }
+
+ return 0;
+}
+
+/* Scrub the realtime summary. */
+int
+xchk_rtsummary(
+ struct xfs_scrub *sc)
+{
+ struct xfs_mount *mp = sc->mp;
+ int error = 0;
+
+ /* Invoke the fork scrubber. */
+ error = xchk_metadata_inode_forks(sc);
+ if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
+ goto out_rbm;
+
+ /* Construct the new summary file from the rtbitmap. */
+ error = xchk_rtsum_compute(sc);
+ if (error == -EFSCORRUPTED) {
+ /*
+ * EFSCORRUPTED means the rtbitmap is corrupt, which is an xref
+ * error since we're checking the summary file.
+ */
+ xchk_ino_xref_set_corrupt(sc, mp->m_rbmip->i_ino);
+ error = 0;
+ goto out_rbm;
+ }
+ if (error)
+ goto out_rbm;
+
+ /* Does the computed summary file match the actual rtsummary file? */
+ error = xchk_rtsum_compare(sc);
+
+out_rbm:
+ /* Unlock the rtbitmap since we're done with it. */
+ xfs_iunlock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
+ return error;
+}
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index 3d98f604765e..4849efcaa33a 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -22,6 +22,8 @@
#include "scrub/trace.h"
#include "scrub/repair.h"
#include "scrub/health.h"
+#include "scrub/stats.h"
+#include "scrub/xfile.h"
/*
* Online Scrub and Repair
@@ -166,8 +168,6 @@ xchk_teardown(
struct xfs_scrub *sc,
int error)
{
- struct xfs_inode *ip_in = XFS_I(file_inode(sc->file));
-
xchk_ag_free(sc, &sc->sa);
if (sc->tp) {
if (error == 0 && (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR))
@@ -178,14 +178,18 @@ xchk_teardown(
}
if (sc->ip) {
if (sc->ilock_flags)
- xfs_iunlock(sc->ip, sc->ilock_flags);
- if (sc->ip != ip_in &&
- !xfs_internal_inum(sc->mp, sc->ip->i_ino))
- xchk_irele(sc, sc->ip);
+ xchk_iunlock(sc, sc->ilock_flags);
+ xchk_irele(sc, sc->ip);
sc->ip = NULL;
}
- if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)
+ if (sc->flags & XCHK_HAVE_FREEZE_PROT) {
+ sc->flags &= ~XCHK_HAVE_FREEZE_PROT;
mnt_drop_write_file(sc->file);
+ }
+ if (sc->xfile) {
+ xfile_destroy(sc->xfile);
+ sc->xfile = NULL;
+ }
if (sc->buf) {
if (sc->buf_cleanup)
sc->buf_cleanup(sc->buf);
@@ -320,14 +324,14 @@ static const struct xchk_meta_ops meta_scrub_ops[] = {
},
[XFS_SCRUB_TYPE_RTBITMAP] = { /* realtime bitmap */
.type = ST_FS,
- .setup = xchk_setup_rt,
+ .setup = xchk_setup_rtbitmap,
.scrub = xchk_rtbitmap,
.has = xfs_has_realtime,
.repair = xrep_notsupported,
},
[XFS_SCRUB_TYPE_RTSUM] = { /* realtime summary */
.type = ST_FS,
- .setup = xchk_setup_rt,
+ .setup = xchk_setup_rtsummary,
.scrub = xchk_rtsummary,
.has = xfs_has_realtime,
.repair = xrep_notsupported,
@@ -407,6 +411,11 @@ xchk_validate_inputs(
goto out;
}
+ /* No rebuild without repair. */
+ if ((sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD) &&
+ !(sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR))
+ return -EINVAL;
+
/*
* We only want to repair read-write v5+ filesystems. Defer the check
* for ops->repair until after our scrub confirms that we need to
@@ -461,8 +470,10 @@ xfs_scrub_metadata(
struct file *file,
struct xfs_scrub_metadata *sm)
{
+ struct xchk_stats_run run = { };
struct xfs_scrub *sc;
struct xfs_mount *mp = XFS_I(file_inode(file))->i_mount;
+ u64 check_start;
int error = 0;
BUILD_BUG_ON(sizeof(meta_scrub_ops) !=
@@ -505,6 +516,8 @@ retry_op:
error = mnt_want_write_file(sc->file);
if (error)
goto out_sc;
+
+ sc->flags |= XCHK_HAVE_FREEZE_PROT;
}
/* Set up for the operation. */
@@ -517,7 +530,9 @@ retry_op:
goto out_teardown;
/* Scrub for errors. */
+ check_start = xchk_stats_now();
error = sc->ops->scrub(sc);
+ run.scrub_ns += xchk_stats_elapsed_ns(check_start);
if (error == -EDEADLOCK && !(sc->flags & XCHK_TRY_HARDER))
goto try_harder;
if (error == -ECHRNG && !(sc->flags & XCHK_NEED_DRAIN))
@@ -529,15 +544,16 @@ retry_op:
if ((sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) &&
!(sc->flags & XREP_ALREADY_FIXED)) {
- bool needs_fix;
+ bool needs_fix = xchk_needs_repair(sc->sm);
+
+ /* Userspace asked us to rebuild the structure regardless. */
+ if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD)
+ needs_fix = true;
/* Let debug users force us into the repair routines. */
- if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR))
- sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
+ if (XFS_TEST_ERROR(needs_fix, mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR))
+ needs_fix = true;
- needs_fix = (sc->sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT |
- XFS_SCRUB_OFLAG_XCORRUPT |
- XFS_SCRUB_OFLAG_PREEN));
/*
* If userspace asked for a repair but it wasn't necessary,
* report that back to userspace.
@@ -551,7 +567,7 @@ retry_op:
* If it's broken, userspace wants us to fix it, and we haven't
* already tried to fix it, then attempt a repair.
*/
- error = xrep_attempt(sc);
+ error = xrep_attempt(sc, &run);
if (error == -EAGAIN) {
/*
* Either the repair function succeeded or it couldn't
@@ -572,6 +588,8 @@ out_nofix:
out_teardown:
error = xchk_teardown(sc, error);
out_sc:
+ if (error != -ENOENT)
+ xchk_stats_merge(mp, sm, &run);
kfree(sc);
out:
trace_xchk_done(XFS_I(file_inode(file)), sm, error);
@@ -585,6 +603,7 @@ need_drain:
if (error)
goto out_sc;
sc->flags |= XCHK_NEED_DRAIN;
+ run.retries++;
goto retry_op;
try_harder:
/*
@@ -596,5 +615,6 @@ try_harder:
if (error)
goto out_sc;
sc->flags |= XCHK_TRY_HARDER;
+ run.retries++;
goto retry_op;
}
diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h
index e113f2f5c254..1ef9c6b4842a 100644
--- a/fs/xfs/scrub/scrub.h
+++ b/fs/xfs/scrub/scrub.h
@@ -88,6 +88,10 @@ struct xfs_scrub {
*/
void (*buf_cleanup)(void *buf);
+ /* xfile used by the scrubbers; freed at teardown. */
+ struct xfile *xfile;
+
+ /* Lock flags for @ip. */
uint ilock_flags;
/* See the XCHK/XREP state flags below. */
@@ -106,6 +110,7 @@ struct xfs_scrub {
/* XCHK state flags grow up from zero, XREP state flags grown down from 2^31 */
#define XCHK_TRY_HARDER (1U << 0) /* can't get resources, try again */
+#define XCHK_HAVE_FREEZE_PROT (1U << 1) /* do we have freeze protection? */
#define XCHK_FSGATES_DRAIN (1U << 2) /* defer ops draining enabled */
#define XCHK_NEED_DRAIN (1U << 3) /* scrub needs to drain defer ops */
#define XREP_ALREADY_FIXED (1U << 31) /* checking our repair work */
diff --git a/fs/xfs/scrub/stats.c b/fs/xfs/scrub/stats.c
new file mode 100644
index 000000000000..cd91db4a5548
--- /dev/null
+++ b/fs/xfs/scrub/stats.c
@@ -0,0 +1,408 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_sysfs.h"
+#include "xfs_btree.h"
+#include "xfs_super.h"
+#include "scrub/scrub.h"
+#include "scrub/stats.h"
+#include "scrub/trace.h"
+
+struct xchk_scrub_stats {
+ /* all 32-bit counters here */
+
+ /* checking stats */
+ uint32_t invocations;
+ uint32_t clean;
+ uint32_t corrupt;
+ uint32_t preen;
+ uint32_t xfail;
+ uint32_t xcorrupt;
+ uint32_t incomplete;
+ uint32_t warning;
+ uint32_t retries;
+
+ /* repair stats */
+ uint32_t repair_invocations;
+ uint32_t repair_success;
+
+ /* all 64-bit items here */
+
+ /* runtimes */
+ uint64_t checktime_us;
+ uint64_t repairtime_us;
+
+ /* non-counter state must go at the end for clearall */
+ spinlock_t css_lock;
+};
+
+struct xchk_stats {
+ struct dentry *cs_debugfs;
+ struct xchk_scrub_stats cs_stats[XFS_SCRUB_TYPE_NR];
+};
+
+
+static struct xchk_stats global_stats;
+
+static const char *name_map[XFS_SCRUB_TYPE_NR] = {
+ [XFS_SCRUB_TYPE_SB] = "sb",
+ [XFS_SCRUB_TYPE_AGF] = "agf",
+ [XFS_SCRUB_TYPE_AGFL] = "agfl",
+ [XFS_SCRUB_TYPE_AGI] = "agi",
+ [XFS_SCRUB_TYPE_BNOBT] = "bnobt",
+ [XFS_SCRUB_TYPE_CNTBT] = "cntbt",
+ [XFS_SCRUB_TYPE_INOBT] = "inobt",
+ [XFS_SCRUB_TYPE_FINOBT] = "finobt",
+ [XFS_SCRUB_TYPE_RMAPBT] = "rmapbt",
+ [XFS_SCRUB_TYPE_REFCNTBT] = "refcountbt",
+ [XFS_SCRUB_TYPE_INODE] = "inode",
+ [XFS_SCRUB_TYPE_BMBTD] = "bmapbtd",
+ [XFS_SCRUB_TYPE_BMBTA] = "bmapbta",
+ [XFS_SCRUB_TYPE_BMBTC] = "bmapbtc",
+ [XFS_SCRUB_TYPE_DIR] = "directory",
+ [XFS_SCRUB_TYPE_XATTR] = "xattr",
+ [XFS_SCRUB_TYPE_SYMLINK] = "symlink",
+ [XFS_SCRUB_TYPE_PARENT] = "parent",
+ [XFS_SCRUB_TYPE_RTBITMAP] = "rtbitmap",
+ [XFS_SCRUB_TYPE_RTSUM] = "rtsummary",
+ [XFS_SCRUB_TYPE_UQUOTA] = "usrquota",
+ [XFS_SCRUB_TYPE_GQUOTA] = "grpquota",
+ [XFS_SCRUB_TYPE_PQUOTA] = "prjquota",
+ [XFS_SCRUB_TYPE_FSCOUNTERS] = "fscounters",
+};
+
+/* Format the scrub stats into a text buffer, similar to pcp style. */
+STATIC ssize_t
+xchk_stats_format(
+ struct xchk_stats *cs,
+ char *buf,
+ size_t remaining)
+{
+ struct xchk_scrub_stats *css = &cs->cs_stats[0];
+ unsigned int i;
+ ssize_t copied = 0;
+ int ret = 0;
+
+ for (i = 0; i < XFS_SCRUB_TYPE_NR; i++, css++) {
+ if (!name_map[i])
+ continue;
+
+ ret = scnprintf(buf, remaining,
+ "%s %u %u %u %u %u %u %u %u %u %llu %u %u %llu\n",
+ name_map[i],
+ (unsigned int)css->invocations,
+ (unsigned int)css->clean,
+ (unsigned int)css->corrupt,
+ (unsigned int)css->preen,
+ (unsigned int)css->xfail,
+ (unsigned int)css->xcorrupt,
+ (unsigned int)css->incomplete,
+ (unsigned int)css->warning,
+ (unsigned int)css->retries,
+ (unsigned long long)css->checktime_us,
+ (unsigned int)css->repair_invocations,
+ (unsigned int)css->repair_success,
+ (unsigned long long)css->repairtime_us);
+ if (ret <= 0)
+ break;
+
+ remaining -= ret;
+ copied += ret;
+ buf += ret;
+ }
+
+ return copied > 0 ? copied : ret;
+}
+
+/* Estimate the worst case buffer size required to hold the whole report. */
+STATIC size_t
+xchk_stats_estimate_bufsize(
+ struct xchk_stats *cs)
+{
+ struct xchk_scrub_stats *css = &cs->cs_stats[0];
+ unsigned int i;
+ size_t field_width;
+ size_t ret = 0;
+
+ /* 4294967296 plus one space for each u32 field */
+ field_width = 11 * (offsetof(struct xchk_scrub_stats, checktime_us) /
+ sizeof(uint32_t));
+
+ /* 18446744073709551615 plus one space for each u64 field */
+ field_width += 21 * ((offsetof(struct xchk_scrub_stats, css_lock) -
+ offsetof(struct xchk_scrub_stats, checktime_us)) /
+ sizeof(uint64_t));
+
+ for (i = 0; i < XFS_SCRUB_TYPE_NR; i++, css++) {
+ if (!name_map[i])
+ continue;
+
+ /* name plus one space */
+ ret += 1 + strlen(name_map[i]);
+
+ /* all fields, plus newline */
+ ret += field_width + 1;
+ }
+
+ return ret;
+}
+
+/* Clear all counters. */
+STATIC void
+xchk_stats_clearall(
+ struct xchk_stats *cs)
+{
+ struct xchk_scrub_stats *css = &cs->cs_stats[0];
+ unsigned int i;
+
+ for (i = 0; i < XFS_SCRUB_TYPE_NR; i++, css++) {
+ spin_lock(&css->css_lock);
+ memset(css, 0, offsetof(struct xchk_scrub_stats, css_lock));
+ spin_unlock(&css->css_lock);
+ }
+}
+
+#define XFS_SCRUB_OFLAG_UNCLEAN (XFS_SCRUB_OFLAG_CORRUPT | \
+ XFS_SCRUB_OFLAG_PREEN | \
+ XFS_SCRUB_OFLAG_XFAIL | \
+ XFS_SCRUB_OFLAG_XCORRUPT | \
+ XFS_SCRUB_OFLAG_INCOMPLETE | \
+ XFS_SCRUB_OFLAG_WARNING)
+
+STATIC void
+xchk_stats_merge_one(
+ struct xchk_stats *cs,
+ const struct xfs_scrub_metadata *sm,
+ const struct xchk_stats_run *run)
+{
+ struct xchk_scrub_stats *css;
+
+ if (sm->sm_type >= XFS_SCRUB_TYPE_NR) {
+ ASSERT(sm->sm_type < XFS_SCRUB_TYPE_NR);
+ return;
+ }
+
+ css = &cs->cs_stats[sm->sm_type];
+ spin_lock(&css->css_lock);
+ css->invocations++;
+ if (!(sm->sm_flags & XFS_SCRUB_OFLAG_UNCLEAN))
+ css->clean++;
+ if (sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ css->corrupt++;
+ if (sm->sm_flags & XFS_SCRUB_OFLAG_PREEN)
+ css->preen++;
+ if (sm->sm_flags & XFS_SCRUB_OFLAG_XFAIL)
+ css->xfail++;
+ if (sm->sm_flags & XFS_SCRUB_OFLAG_XCORRUPT)
+ css->xcorrupt++;
+ if (sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE)
+ css->incomplete++;
+ if (sm->sm_flags & XFS_SCRUB_OFLAG_WARNING)
+ css->warning++;
+ css->retries += run->retries;
+ css->checktime_us += howmany_64(run->scrub_ns, NSEC_PER_USEC);
+
+ if (run->repair_attempted)
+ css->repair_invocations++;
+ if (run->repair_succeeded)
+ css->repair_success++;
+ css->repairtime_us += howmany_64(run->repair_ns, NSEC_PER_USEC);
+ spin_unlock(&css->css_lock);
+}
+
+/* Merge these scrub-run stats into the global and mount stat data. */
+void
+xchk_stats_merge(
+ struct xfs_mount *mp,
+ const struct xfs_scrub_metadata *sm,
+ const struct xchk_stats_run *run)
+{
+ xchk_stats_merge_one(&global_stats, sm, run);
+ xchk_stats_merge_one(mp->m_scrub_stats, sm, run);
+}
+
+/* debugfs boilerplate */
+
+static ssize_t
+xchk_scrub_stats_read(
+ struct file *file,
+ char __user *ubuf,
+ size_t count,
+ loff_t *ppos)
+{
+ struct xchk_stats *cs = file->private_data;
+ char *buf;
+ size_t bufsize;
+ ssize_t avail, ret;
+
+ /*
+ * This generates stringly snapshot of all the scrub counters, so we
+ * do not want userspace to receive garbled text from multiple calls.
+ * If the file position is greater than 0, return a short read.
+ */
+ if (*ppos > 0)
+ return 0;
+
+ bufsize = xchk_stats_estimate_bufsize(cs);
+
+ buf = kvmalloc(bufsize, XCHK_GFP_FLAGS);
+ if (!buf)
+ return -ENOMEM;
+
+ avail = xchk_stats_format(cs, buf, bufsize);
+ if (avail < 0) {
+ ret = avail;
+ goto out;
+ }
+
+ ret = simple_read_from_buffer(ubuf, count, ppos, buf, avail);
+out:
+ kvfree(buf);
+ return ret;
+}
+
+static const struct file_operations scrub_stats_fops = {
+ .open = simple_open,
+ .read = xchk_scrub_stats_read,
+};
+
+static ssize_t
+xchk_clear_scrub_stats_write(
+ struct file *file,
+ const char __user *ubuf,
+ size_t count,
+ loff_t *ppos)
+{
+ struct xchk_stats *cs = file->private_data;
+ unsigned int val;
+ int ret;
+
+ ret = kstrtouint_from_user(ubuf, count, 0, &val);
+ if (ret)
+ return ret;
+
+ if (val != 1)
+ return -EINVAL;
+
+ xchk_stats_clearall(cs);
+ return count;
+}
+
+static const struct file_operations clear_scrub_stats_fops = {
+ .open = simple_open,
+ .write = xchk_clear_scrub_stats_write,
+};
+
+/* Initialize the stats object. */
+STATIC int
+xchk_stats_init(
+ struct xchk_stats *cs,
+ struct xfs_mount *mp)
+{
+ struct xchk_scrub_stats *css = &cs->cs_stats[0];
+ unsigned int i;
+
+ for (i = 0; i < XFS_SCRUB_TYPE_NR; i++, css++)
+ spin_lock_init(&css->css_lock);
+
+ return 0;
+}
+
+/* Connect the stats object to debugfs. */
+void
+xchk_stats_register(
+ struct xchk_stats *cs,
+ struct dentry *parent)
+{
+ if (!parent)
+ return;
+
+ cs->cs_debugfs = xfs_debugfs_mkdir("scrub", parent);
+ if (!cs->cs_debugfs)
+ return;
+
+ debugfs_create_file("stats", 0644, cs->cs_debugfs, cs,
+ &scrub_stats_fops);
+ debugfs_create_file("clear_stats", 0400, cs->cs_debugfs, cs,
+ &clear_scrub_stats_fops);
+}
+
+/* Free all resources related to the stats object. */
+STATIC int
+xchk_stats_teardown(
+ struct xchk_stats *cs)
+{
+ return 0;
+}
+
+/* Disconnect the stats object from debugfs. */
+void
+xchk_stats_unregister(
+ struct xchk_stats *cs)
+{
+ debugfs_remove(cs->cs_debugfs);
+}
+
+/* Initialize global stats and register them */
+int __init
+xchk_global_stats_setup(
+ struct dentry *parent)
+{
+ int error;
+
+ error = xchk_stats_init(&global_stats, NULL);
+ if (error)
+ return error;
+
+ xchk_stats_register(&global_stats, parent);
+ return 0;
+}
+
+/* Unregister global stats and tear them down */
+void
+xchk_global_stats_teardown(void)
+{
+ xchk_stats_unregister(&global_stats);
+ xchk_stats_teardown(&global_stats);
+}
+
+/* Allocate per-mount stats */
+int
+xchk_mount_stats_alloc(
+ struct xfs_mount *mp)
+{
+ struct xchk_stats *cs;
+ int error;
+
+ cs = kvzalloc(sizeof(struct xchk_stats), GFP_KERNEL);
+ if (!cs)
+ return -ENOMEM;
+
+ error = xchk_stats_init(cs, mp);
+ if (error)
+ goto out_free;
+
+ mp->m_scrub_stats = cs;
+ return 0;
+out_free:
+ kvfree(cs);
+ return error;
+}
+
+/* Free per-mount stats */
+void
+xchk_mount_stats_free(
+ struct xfs_mount *mp)
+{
+ xchk_stats_teardown(mp->m_scrub_stats);
+ kvfree(mp->m_scrub_stats);
+ mp->m_scrub_stats = NULL;
+}
diff --git a/fs/xfs/scrub/stats.h b/fs/xfs/scrub/stats.h
new file mode 100644
index 000000000000..b358ad8d8b90
--- /dev/null
+++ b/fs/xfs/scrub/stats.h
@@ -0,0 +1,59 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_STATS_H__
+#define __XFS_SCRUB_STATS_H__
+
+struct xchk_stats_run {
+ u64 scrub_ns;
+ u64 repair_ns;
+ unsigned int retries;
+ bool repair_attempted;
+ bool repair_succeeded;
+};
+
+#ifdef CONFIG_XFS_ONLINE_SCRUB_STATS
+struct xchk_stats;
+
+int __init xchk_global_stats_setup(struct dentry *parent);
+void xchk_global_stats_teardown(void);
+
+int xchk_mount_stats_alloc(struct xfs_mount *mp);
+void xchk_mount_stats_free(struct xfs_mount *mp);
+
+void xchk_stats_register(struct xchk_stats *cs, struct dentry *parent);
+void xchk_stats_unregister(struct xchk_stats *cs);
+
+void xchk_stats_merge(struct xfs_mount *mp, const struct xfs_scrub_metadata *sm,
+ const struct xchk_stats_run *run);
+
+static inline u64 xchk_stats_now(void) { return ktime_get_ns(); }
+static inline u64 xchk_stats_elapsed_ns(u64 since)
+{
+ u64 now = xchk_stats_now();
+
+ /*
+ * If the system doesn't have a high enough resolution clock, charge at
+ * least one nanosecond so that our stats don't report instantaneous
+ * runtimes.
+ */
+ if (now == since)
+ return 1;
+
+ return now - since;
+}
+#else
+# define xchk_global_stats_setup(parent) (0)
+# define xchk_global_stats_teardown() ((void)0)
+# define xchk_mount_stats_alloc(mp) (0)
+# define xchk_mount_stats_free(mp) ((void)0)
+# define xchk_stats_register(cs, parent) ((void)0)
+# define xchk_stats_unregister(cs) ((void)0)
+# define xchk_stats_now() (0)
+# define xchk_stats_elapsed_ns(x) (0 * (x))
+# define xchk_stats_merge(mp, sm, run) ((void)0)
+#endif /* CONFIG_XFS_ONLINE_SCRUB_STATS */
+
+#endif /* __XFS_SCRUB_STATS_H__ */
diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c
index 0a975439d2b6..46249e7b17e0 100644
--- a/fs/xfs/scrub/trace.c
+++ b/fs/xfs/scrub/trace.c
@@ -12,8 +12,10 @@
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
-#include "scrub/scrub.h"
#include "xfs_ag.h"
+#include "scrub/scrub.h"
+#include "scrub/xfile.h"
+#include "scrub/xfarray.h"
/* Figure out which block the btree cursor was pointing to. */
static inline xfs_fsblock_t
diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
index b3894daeb86a..cbd4d01e253c 100644
--- a/fs/xfs/scrub/trace.h
+++ b/fs/xfs/scrub/trace.h
@@ -16,6 +16,10 @@
#include <linux/tracepoint.h>
#include "xfs_bit.h"
+struct xfile;
+struct xfarray;
+struct xfarray_sortinfo;
+
/*
* ftrace's __print_symbolic requires that all enum values be wrapped in the
* TRACE_DEFINE_ENUM macro so that the enum value can be encoded in the ftrace
@@ -94,10 +98,12 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_FSCOUNTERS);
{ XFS_SCRUB_OFLAG_XCORRUPT, "xcorrupt" }, \
{ XFS_SCRUB_OFLAG_INCOMPLETE, "incomplete" }, \
{ XFS_SCRUB_OFLAG_WARNING, "warning" }, \
- { XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED, "norepair" }
+ { XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED, "norepair" }, \
+ { XFS_SCRUB_IFLAG_FORCE_REBUILD, "rebuild" }
#define XFS_SCRUB_STATE_STRINGS \
{ XCHK_TRY_HARDER, "try_harder" }, \
+ { XCHK_HAVE_FREEZE_PROT, "nofreeze" }, \
{ XCHK_FSGATES_DRAIN, "fsgates_drain" }, \
{ XCHK_NEED_DRAIN, "need_drain" }, \
{ XREP_ALREADY_FIXED, "already_fixed" }
@@ -635,6 +641,28 @@ TRACE_EVENT(xchk_iallocbt_check_cluster,
__entry->cluster_ino)
)
+TRACE_EVENT(xchk_inode_is_allocated,
+ TP_PROTO(struct xfs_inode *ip),
+ TP_ARGS(ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(unsigned long, iflags)
+ __field(umode_t, mode)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(ip)->i_sb->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->iflags = ip->i_flags;
+ __entry->mode = VFS_I(ip)->i_mode;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx iflags 0x%lx mode 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->iflags,
+ __entry->mode)
+);
+
TRACE_EVENT(xchk_fscounters_calc,
TP_PROTO(struct xfs_mount *mp, uint64_t icount, uint64_t ifree,
uint64_t fdblocks, uint64_t delalloc),
@@ -693,6 +721,31 @@ TRACE_EVENT(xchk_fscounters_within_range,
__entry->old_value)
)
+DECLARE_EVENT_CLASS(xchk_fsfreeze_class,
+ TP_PROTO(struct xfs_scrub *sc, int error),
+ TP_ARGS(sc, error),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned int, type)
+ __field(int, error)
+ ),
+ TP_fast_assign(
+ __entry->dev = sc->mp->m_super->s_dev;
+ __entry->type = sc->sm->sm_type;
+ __entry->error = error;
+ ),
+ TP_printk("dev %d:%d type %s error %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
+ __entry->error)
+);
+#define DEFINE_XCHK_FSFREEZE_EVENT(name) \
+DEFINE_EVENT(xchk_fsfreeze_class, name, \
+ TP_PROTO(struct xfs_scrub *sc, int error), \
+ TP_ARGS(sc, error))
+DEFINE_XCHK_FSFREEZE_EVENT(xchk_fsfreeze);
+DEFINE_XCHK_FSFREEZE_EVENT(xchk_fsthaw);
+
TRACE_EVENT(xchk_refcount_incorrect,
TP_PROTO(struct xfs_perag *pag, const struct xfs_refcount_irec *irec,
xfs_nlink_t seen),
@@ -725,13 +778,302 @@ TRACE_EVENT(xchk_refcount_incorrect,
__entry->seen)
)
+TRACE_EVENT(xfile_create,
+ TP_PROTO(struct xfile *xf),
+ TP_ARGS(xf),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned long, ino)
+ __array(char, pathname, 256)
+ ),
+ TP_fast_assign(
+ char pathname[257];
+ char *path;
+
+ __entry->ino = file_inode(xf->file)->i_ino;
+ memset(pathname, 0, sizeof(pathname));
+ path = file_path(xf->file, pathname, sizeof(pathname) - 1);
+ if (IS_ERR(path))
+ path = "(unknown)";
+ strncpy(__entry->pathname, path, sizeof(__entry->pathname));
+ ),
+ TP_printk("xfino 0x%lx path '%s'",
+ __entry->ino,
+ __entry->pathname)
+);
+
+TRACE_EVENT(xfile_destroy,
+ TP_PROTO(struct xfile *xf),
+ TP_ARGS(xf),
+ TP_STRUCT__entry(
+ __field(unsigned long, ino)
+ __field(unsigned long long, bytes)
+ __field(loff_t, size)
+ ),
+ TP_fast_assign(
+ struct xfile_stat statbuf;
+ int ret;
+
+ ret = xfile_stat(xf, &statbuf);
+ if (!ret) {
+ __entry->bytes = statbuf.bytes;
+ __entry->size = statbuf.size;
+ } else {
+ __entry->bytes = -1;
+ __entry->size = -1;
+ }
+ __entry->ino = file_inode(xf->file)->i_ino;
+ ),
+ TP_printk("xfino 0x%lx mem_bytes 0x%llx isize 0x%llx",
+ __entry->ino,
+ __entry->bytes,
+ __entry->size)
+);
+
+DECLARE_EVENT_CLASS(xfile_class,
+ TP_PROTO(struct xfile *xf, loff_t pos, unsigned long long bytecount),
+ TP_ARGS(xf, pos, bytecount),
+ TP_STRUCT__entry(
+ __field(unsigned long, ino)
+ __field(unsigned long long, bytes_used)
+ __field(loff_t, pos)
+ __field(loff_t, size)
+ __field(unsigned long long, bytecount)
+ ),
+ TP_fast_assign(
+ struct xfile_stat statbuf;
+ int ret;
+
+ ret = xfile_stat(xf, &statbuf);
+ if (!ret) {
+ __entry->bytes_used = statbuf.bytes;
+ __entry->size = statbuf.size;
+ } else {
+ __entry->bytes_used = -1;
+ __entry->size = -1;
+ }
+ __entry->ino = file_inode(xf->file)->i_ino;
+ __entry->pos = pos;
+ __entry->bytecount = bytecount;
+ ),
+ TP_printk("xfino 0x%lx mem_bytes 0x%llx pos 0x%llx bytecount 0x%llx isize 0x%llx",
+ __entry->ino,
+ __entry->bytes_used,
+ __entry->pos,
+ __entry->bytecount,
+ __entry->size)
+);
+#define DEFINE_XFILE_EVENT(name) \
+DEFINE_EVENT(xfile_class, name, \
+ TP_PROTO(struct xfile *xf, loff_t pos, unsigned long long bytecount), \
+ TP_ARGS(xf, pos, bytecount))
+DEFINE_XFILE_EVENT(xfile_pread);
+DEFINE_XFILE_EVENT(xfile_pwrite);
+DEFINE_XFILE_EVENT(xfile_seek_data);
+DEFINE_XFILE_EVENT(xfile_get_page);
+DEFINE_XFILE_EVENT(xfile_put_page);
+
+TRACE_EVENT(xfarray_create,
+ TP_PROTO(struct xfarray *xfa, unsigned long long required_capacity),
+ TP_ARGS(xfa, required_capacity),
+ TP_STRUCT__entry(
+ __field(unsigned long, ino)
+ __field(uint64_t, max_nr)
+ __field(size_t, obj_size)
+ __field(int, obj_size_log)
+ __field(unsigned long long, required_capacity)
+ ),
+ TP_fast_assign(
+ __entry->max_nr = xfa->max_nr;
+ __entry->obj_size = xfa->obj_size;
+ __entry->obj_size_log = xfa->obj_size_log;
+ __entry->ino = file_inode(xfa->xfile->file)->i_ino;
+ __entry->required_capacity = required_capacity;
+ ),
+ TP_printk("xfino 0x%lx max_nr %llu reqd_nr %llu objsz %zu objszlog %d",
+ __entry->ino,
+ __entry->max_nr,
+ __entry->required_capacity,
+ __entry->obj_size,
+ __entry->obj_size_log)
+);
+
+TRACE_EVENT(xfarray_isort,
+ TP_PROTO(struct xfarray_sortinfo *si, uint64_t lo, uint64_t hi),
+ TP_ARGS(si, lo, hi),
+ TP_STRUCT__entry(
+ __field(unsigned long, ino)
+ __field(unsigned long long, lo)
+ __field(unsigned long long, hi)
+ ),
+ TP_fast_assign(
+ __entry->ino = file_inode(si->array->xfile->file)->i_ino;
+ __entry->lo = lo;
+ __entry->hi = hi;
+ ),
+ TP_printk("xfino 0x%lx lo %llu hi %llu elts %llu",
+ __entry->ino,
+ __entry->lo,
+ __entry->hi,
+ __entry->hi - __entry->lo)
+);
+
+TRACE_EVENT(xfarray_pagesort,
+ TP_PROTO(struct xfarray_sortinfo *si, uint64_t lo, uint64_t hi),
+ TP_ARGS(si, lo, hi),
+ TP_STRUCT__entry(
+ __field(unsigned long, ino)
+ __field(unsigned long long, lo)
+ __field(unsigned long long, hi)
+ ),
+ TP_fast_assign(
+ __entry->ino = file_inode(si->array->xfile->file)->i_ino;
+ __entry->lo = lo;
+ __entry->hi = hi;
+ ),
+ TP_printk("xfino 0x%lx lo %llu hi %llu elts %llu",
+ __entry->ino,
+ __entry->lo,
+ __entry->hi,
+ __entry->hi - __entry->lo)
+);
+
+TRACE_EVENT(xfarray_qsort,
+ TP_PROTO(struct xfarray_sortinfo *si, uint64_t lo, uint64_t hi),
+ TP_ARGS(si, lo, hi),
+ TP_STRUCT__entry(
+ __field(unsigned long, ino)
+ __field(unsigned long long, lo)
+ __field(unsigned long long, hi)
+ __field(int, stack_depth)
+ __field(int, max_stack_depth)
+ ),
+ TP_fast_assign(
+ __entry->ino = file_inode(si->array->xfile->file)->i_ino;
+ __entry->lo = lo;
+ __entry->hi = hi;
+ __entry->stack_depth = si->stack_depth;
+ __entry->max_stack_depth = si->max_stack_depth;
+ ),
+ TP_printk("xfino 0x%lx lo %llu hi %llu elts %llu stack %d/%d",
+ __entry->ino,
+ __entry->lo,
+ __entry->hi,
+ __entry->hi - __entry->lo,
+ __entry->stack_depth,
+ __entry->max_stack_depth)
+);
+
+TRACE_EVENT(xfarray_sort,
+ TP_PROTO(struct xfarray_sortinfo *si, size_t bytes),
+ TP_ARGS(si, bytes),
+ TP_STRUCT__entry(
+ __field(unsigned long, ino)
+ __field(unsigned long long, nr)
+ __field(size_t, obj_size)
+ __field(size_t, bytes)
+ __field(unsigned int, max_stack_depth)
+ ),
+ TP_fast_assign(
+ __entry->nr = si->array->nr;
+ __entry->obj_size = si->array->obj_size;
+ __entry->ino = file_inode(si->array->xfile->file)->i_ino;
+ __entry->bytes = bytes;
+ __entry->max_stack_depth = si->max_stack_depth;
+ ),
+ TP_printk("xfino 0x%lx nr %llu objsz %zu stack %u bytes %zu",
+ __entry->ino,
+ __entry->nr,
+ __entry->obj_size,
+ __entry->max_stack_depth,
+ __entry->bytes)
+);
+
+TRACE_EVENT(xfarray_sort_stats,
+ TP_PROTO(struct xfarray_sortinfo *si, int error),
+ TP_ARGS(si, error),
+ TP_STRUCT__entry(
+ __field(unsigned long, ino)
+#ifdef DEBUG
+ __field(unsigned long long, loads)
+ __field(unsigned long long, stores)
+ __field(unsigned long long, compares)
+ __field(unsigned long long, heapsorts)
+#endif
+ __field(unsigned int, max_stack_depth)
+ __field(unsigned int, max_stack_used)
+ __field(int, error)
+ ),
+ TP_fast_assign(
+ __entry->ino = file_inode(si->array->xfile->file)->i_ino;
+#ifdef DEBUG
+ __entry->loads = si->loads;
+ __entry->stores = si->stores;
+ __entry->compares = si->compares;
+ __entry->heapsorts = si->heapsorts;
+#endif
+ __entry->max_stack_depth = si->max_stack_depth;
+ __entry->max_stack_used = si->max_stack_used;
+ __entry->error = error;
+ ),
+ TP_printk(
+#ifdef DEBUG
+ "xfino 0x%lx loads %llu stores %llu compares %llu heapsorts %llu stack_depth %u/%u error %d",
+#else
+ "xfino 0x%lx stack_depth %u/%u error %d",
+#endif
+ __entry->ino,
+#ifdef DEBUG
+ __entry->loads,
+ __entry->stores,
+ __entry->compares,
+ __entry->heapsorts,
+#endif
+ __entry->max_stack_used,
+ __entry->max_stack_depth,
+ __entry->error)
+);
+
+#ifdef CONFIG_XFS_RT
+TRACE_EVENT(xchk_rtsum_record_free,
+ TP_PROTO(struct xfs_mount *mp, xfs_rtblock_t start,
+ uint64_t len, unsigned int log, loff_t pos, xfs_suminfo_t v),
+ TP_ARGS(mp, start, len, log, pos, v),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(dev_t, rtdev)
+ __field(xfs_rtblock_t, start)
+ __field(unsigned long long, len)
+ __field(unsigned int, log)
+ __field(loff_t, pos)
+ __field(xfs_suminfo_t, v)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->rtdev = mp->m_rtdev_targp->bt_dev;
+ __entry->start = start;
+ __entry->len = len;
+ __entry->log = log;
+ __entry->pos = pos;
+ __entry->v = v;
+ ),
+ TP_printk("dev %d:%d rtdev %d:%d rtx 0x%llx rtxcount 0x%llx log %u rsumpos 0x%llx sumcount %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ MAJOR(__entry->rtdev), MINOR(__entry->rtdev),
+ __entry->start,
+ __entry->len,
+ __entry->log,
+ __entry->pos,
+ __entry->v)
+);
+#endif /* CONFIG_XFS_RT */
+
/* repair tracepoints */
#if IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR)
DECLARE_EVENT_CLASS(xrep_extent_class,
- TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
- xfs_agblock_t agbno, xfs_extlen_t len),
- TP_ARGS(mp, agno, agbno, len),
+ TP_PROTO(struct xfs_perag *pag, xfs_agblock_t agbno, xfs_extlen_t len),
+ TP_ARGS(pag, agbno, len),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_agnumber_t, agno)
@@ -739,8 +1081,8 @@ DECLARE_EVENT_CLASS(xrep_extent_class,
__field(xfs_extlen_t, len)
),
TP_fast_assign(
- __entry->dev = mp->m_super->s_dev;
- __entry->agno = agno;
+ __entry->dev = pag->pag_mount->m_super->s_dev;
+ __entry->agno = pag->pag_agno;
__entry->agbno = agbno;
__entry->len = len;
),
@@ -752,12 +1094,45 @@ DECLARE_EVENT_CLASS(xrep_extent_class,
);
#define DEFINE_REPAIR_EXTENT_EVENT(name) \
DEFINE_EVENT(xrep_extent_class, name, \
- TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
- xfs_agblock_t agbno, xfs_extlen_t len), \
- TP_ARGS(mp, agno, agbno, len))
-DEFINE_REPAIR_EXTENT_EVENT(xrep_dispose_btree_extent);
+ TP_PROTO(struct xfs_perag *pag, xfs_agblock_t agbno, xfs_extlen_t len), \
+ TP_ARGS(pag, agbno, len))
+DEFINE_REPAIR_EXTENT_EVENT(xreap_dispose_unmap_extent);
+DEFINE_REPAIR_EXTENT_EVENT(xreap_dispose_free_extent);
+DEFINE_REPAIR_EXTENT_EVENT(xreap_agextent_binval);
DEFINE_REPAIR_EXTENT_EVENT(xrep_agfl_insert);
+DECLARE_EVENT_CLASS(xrep_reap_find_class,
+ TP_PROTO(struct xfs_perag *pag, xfs_agblock_t agbno, xfs_extlen_t len,
+ bool crosslinked),
+ TP_ARGS(pag, agbno, len, crosslinked),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, agbno)
+ __field(xfs_extlen_t, len)
+ __field(bool, crosslinked)
+ ),
+ TP_fast_assign(
+ __entry->dev = pag->pag_mount->m_super->s_dev;
+ __entry->agno = pag->pag_agno;
+ __entry->agbno = agbno;
+ __entry->len = len;
+ __entry->crosslinked = crosslinked;
+ ),
+ TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x crosslinked %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->agbno,
+ __entry->len,
+ __entry->crosslinked ? 1 : 0)
+);
+#define DEFINE_REPAIR_REAP_FIND_EVENT(name) \
+DEFINE_EVENT(xrep_reap_find_class, name, \
+ TP_PROTO(struct xfs_perag *pag, xfs_agblock_t agbno, xfs_extlen_t len, \
+ bool crosslinked), \
+ TP_ARGS(pag, agbno, len, crosslinked))
+DEFINE_REPAIR_REAP_FIND_EVENT(xreap_agextent_select);
+
DECLARE_EVENT_CLASS(xrep_rmap_class,
TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
xfs_agblock_t agbno, xfs_extlen_t len,
@@ -827,28 +1202,6 @@ TRACE_EVENT(xrep_refcount_extent_fn,
__entry->refcount)
)
-TRACE_EVENT(xrep_init_btblock,
- TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno,
- xfs_btnum_t btnum),
- TP_ARGS(mp, agno, agbno, btnum),
- TP_STRUCT__entry(
- __field(dev_t, dev)
- __field(xfs_agnumber_t, agno)
- __field(xfs_agblock_t, agbno)
- __field(uint32_t, btnum)
- ),
- TP_fast_assign(
- __entry->dev = mp->m_super->s_dev;
- __entry->agno = agno;
- __entry->agbno = agbno;
- __entry->btnum = btnum;
- ),
- TP_printk("dev %d:%d agno 0x%x agbno 0x%x btree %s",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->agno,
- __entry->agbno,
- __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS))
-)
TRACE_EVENT(xrep_findroot_block,
TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno,
uint32_t magic, uint16_t level),
diff --git a/fs/xfs/scrub/xfarray.c b/fs/xfs/scrub/xfarray.c
new file mode 100644
index 000000000000..f0f532c10a5a
--- /dev/null
+++ b/fs/xfs/scrub/xfarray.c
@@ -0,0 +1,1083 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2021-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "scrub/xfile.h"
+#include "scrub/xfarray.h"
+#include "scrub/scrub.h"
+#include "scrub/trace.h"
+
+/*
+ * Large Arrays of Fixed-Size Records
+ * ==================================
+ *
+ * This memory array uses an xfile (which itself is a memfd "file") to store
+ * large numbers of fixed-size records in memory that can be paged out. This
+ * puts less stress on the memory reclaim algorithms during an online repair
+ * because we don't have to pin so much memory. However, array access is less
+ * direct than would be in a regular memory array. Access to the array is
+ * performed via indexed load and store methods, and an append method is
+ * provided for convenience. Array elements can be unset, which sets them to
+ * all zeroes. Unset entries are skipped during iteration, though direct loads
+ * will return a zeroed buffer. Callers are responsible for concurrency
+ * control.
+ */
+
+/*
+ * Pointer to scratch space. Because we can't access the xfile data directly,
+ * we allocate a small amount of memory on the end of the xfarray structure to
+ * buffer array items when we need space to store values temporarily.
+ */
+static inline void *xfarray_scratch(struct xfarray *array)
+{
+ return (array + 1);
+}
+
+/* Compute array index given an xfile offset. */
+static xfarray_idx_t
+xfarray_idx(
+ struct xfarray *array,
+ loff_t pos)
+{
+ if (array->obj_size_log >= 0)
+ return (xfarray_idx_t)pos >> array->obj_size_log;
+
+ return div_u64((xfarray_idx_t)pos, array->obj_size);
+}
+
+/* Compute xfile offset of array element. */
+static inline loff_t xfarray_pos(struct xfarray *array, xfarray_idx_t idx)
+{
+ if (array->obj_size_log >= 0)
+ return idx << array->obj_size_log;
+
+ return idx * array->obj_size;
+}
+
+/*
+ * Initialize a big memory array. Array records cannot be larger than a
+ * page, and the array cannot span more bytes than the page cache supports.
+ * If @required_capacity is nonzero, the maximum array size will be set to this
+ * quantity and the array creation will fail if the underlying storage cannot
+ * support that many records.
+ */
+int
+xfarray_create(
+ const char *description,
+ unsigned long long required_capacity,
+ size_t obj_size,
+ struct xfarray **arrayp)
+{
+ struct xfarray *array;
+ struct xfile *xfile;
+ int error;
+
+ ASSERT(obj_size < PAGE_SIZE);
+
+ error = xfile_create(description, 0, &xfile);
+ if (error)
+ return error;
+
+ error = -ENOMEM;
+ array = kzalloc(sizeof(struct xfarray) + obj_size, XCHK_GFP_FLAGS);
+ if (!array)
+ goto out_xfile;
+
+ array->xfile = xfile;
+ array->obj_size = obj_size;
+
+ if (is_power_of_2(obj_size))
+ array->obj_size_log = ilog2(obj_size);
+ else
+ array->obj_size_log = -1;
+
+ array->max_nr = xfarray_idx(array, MAX_LFS_FILESIZE);
+ trace_xfarray_create(array, required_capacity);
+
+ if (required_capacity > 0) {
+ if (array->max_nr < required_capacity) {
+ error = -ENOMEM;
+ goto out_xfarray;
+ }
+ array->max_nr = required_capacity;
+ }
+
+ *arrayp = array;
+ return 0;
+
+out_xfarray:
+ kfree(array);
+out_xfile:
+ xfile_destroy(xfile);
+ return error;
+}
+
+/* Destroy the array. */
+void
+xfarray_destroy(
+ struct xfarray *array)
+{
+ xfile_destroy(array->xfile);
+ kfree(array);
+}
+
+/* Load an element from the array. */
+int
+xfarray_load(
+ struct xfarray *array,
+ xfarray_idx_t idx,
+ void *ptr)
+{
+ if (idx >= array->nr)
+ return -ENODATA;
+
+ return xfile_obj_load(array->xfile, ptr, array->obj_size,
+ xfarray_pos(array, idx));
+}
+
+/* Is this array element potentially unset? */
+static inline bool
+xfarray_is_unset(
+ struct xfarray *array,
+ loff_t pos)
+{
+ void *temp = xfarray_scratch(array);
+ int error;
+
+ if (array->unset_slots == 0)
+ return false;
+
+ error = xfile_obj_load(array->xfile, temp, array->obj_size, pos);
+ if (!error && xfarray_element_is_null(array, temp))
+ return true;
+
+ return false;
+}
+
+/*
+ * Unset an array element. If @idx is the last element in the array, the
+ * array will be truncated. Otherwise, the entry will be zeroed.
+ */
+int
+xfarray_unset(
+ struct xfarray *array,
+ xfarray_idx_t idx)
+{
+ void *temp = xfarray_scratch(array);
+ loff_t pos = xfarray_pos(array, idx);
+ int error;
+
+ if (idx >= array->nr)
+ return -ENODATA;
+
+ if (idx == array->nr - 1) {
+ array->nr--;
+ return 0;
+ }
+
+ if (xfarray_is_unset(array, pos))
+ return 0;
+
+ memset(temp, 0, array->obj_size);
+ error = xfile_obj_store(array->xfile, temp, array->obj_size, pos);
+ if (error)
+ return error;
+
+ array->unset_slots++;
+ return 0;
+}
+
+/*
+ * Store an element in the array. The element must not be completely zeroed,
+ * because those are considered unset sparse elements.
+ */
+int
+xfarray_store(
+ struct xfarray *array,
+ xfarray_idx_t idx,
+ const void *ptr)
+{
+ int ret;
+
+ if (idx >= array->max_nr)
+ return -EFBIG;
+
+ ASSERT(!xfarray_element_is_null(array, ptr));
+
+ ret = xfile_obj_store(array->xfile, ptr, array->obj_size,
+ xfarray_pos(array, idx));
+ if (ret)
+ return ret;
+
+ array->nr = max(array->nr, idx + 1);
+ return 0;
+}
+
+/* Is this array element NULL? */
+bool
+xfarray_element_is_null(
+ struct xfarray *array,
+ const void *ptr)
+{
+ return !memchr_inv(ptr, 0, array->obj_size);
+}
+
+/*
+ * Store an element anywhere in the array that is unset. If there are no
+ * unset slots, append the element to the array.
+ */
+int
+xfarray_store_anywhere(
+ struct xfarray *array,
+ const void *ptr)
+{
+ void *temp = xfarray_scratch(array);
+ loff_t endpos = xfarray_pos(array, array->nr);
+ loff_t pos;
+ int error;
+
+ /* Find an unset slot to put it in. */
+ for (pos = 0;
+ pos < endpos && array->unset_slots > 0;
+ pos += array->obj_size) {
+ error = xfile_obj_load(array->xfile, temp, array->obj_size,
+ pos);
+ if (error || !xfarray_element_is_null(array, temp))
+ continue;
+
+ error = xfile_obj_store(array->xfile, ptr, array->obj_size,
+ pos);
+ if (error)
+ return error;
+
+ array->unset_slots--;
+ return 0;
+ }
+
+ /* No unset slots found; attach it on the end. */
+ array->unset_slots = 0;
+ return xfarray_append(array, ptr);
+}
+
+/* Return length of array. */
+uint64_t
+xfarray_length(
+ struct xfarray *array)
+{
+ return array->nr;
+}
+
+/*
+ * Decide which array item we're going to read as part of an _iter_get.
+ * @cur is the array index, and @pos is the file offset of that array index in
+ * the backing xfile. Returns ENODATA if we reach the end of the records.
+ *
+ * Reading from a hole in a sparse xfile causes page instantiation, so for
+ * iterating a (possibly sparse) array we need to figure out if the cursor is
+ * pointing at a totally uninitialized hole and move the cursor up if
+ * necessary.
+ */
+static inline int
+xfarray_find_data(
+ struct xfarray *array,
+ xfarray_idx_t *cur,
+ loff_t *pos)
+{
+ unsigned int pgoff = offset_in_page(*pos);
+ loff_t end_pos = *pos + array->obj_size - 1;
+ loff_t new_pos;
+
+ /*
+ * If the current array record is not adjacent to a page boundary, we
+ * are in the middle of the page. We do not need to move the cursor.
+ */
+ if (pgoff != 0 && pgoff + array->obj_size - 1 < PAGE_SIZE)
+ return 0;
+
+ /*
+ * Call SEEK_DATA on the last byte in the record we're about to read.
+ * If the record ends at (or crosses) the end of a page then we know
+ * that the first byte of the record is backed by pages and don't need
+ * to query it. If instead the record begins at the start of the page
+ * then we know that querying the last byte is just as good as querying
+ * the first byte, since records cannot be larger than a page.
+ *
+ * If the call returns the same file offset, we know this record is
+ * backed by real pages. We do not need to move the cursor.
+ */
+ new_pos = xfile_seek_data(array->xfile, end_pos);
+ if (new_pos == -ENXIO)
+ return -ENODATA;
+ if (new_pos < 0)
+ return new_pos;
+ if (new_pos == end_pos)
+ return 0;
+
+ /*
+ * Otherwise, SEEK_DATA told us how far up to move the file pointer to
+ * find more data. Move the array index to the first record past the
+ * byte offset we were given.
+ */
+ new_pos = roundup_64(new_pos, array->obj_size);
+ *cur = xfarray_idx(array, new_pos);
+ *pos = xfarray_pos(array, *cur);
+ return 0;
+}
+
+/*
+ * Starting at *idx, fetch the next non-null array entry and advance the index
+ * to set up the next _load_next call. Returns ENODATA if we reach the end of
+ * the array. Callers must set @*idx to XFARRAY_CURSOR_INIT before the first
+ * call to this function.
+ */
+int
+xfarray_load_next(
+ struct xfarray *array,
+ xfarray_idx_t *idx,
+ void *rec)
+{
+ xfarray_idx_t cur = *idx;
+ loff_t pos = xfarray_pos(array, cur);
+ int error;
+
+ do {
+ if (cur >= array->nr)
+ return -ENODATA;
+
+ /*
+ * Ask the backing store for the location of next possible
+ * written record, then retrieve that record.
+ */
+ error = xfarray_find_data(array, &cur, &pos);
+ if (error)
+ return error;
+ error = xfarray_load(array, cur, rec);
+ if (error)
+ return error;
+
+ cur++;
+ pos += array->obj_size;
+ } while (xfarray_element_is_null(array, rec));
+
+ *idx = cur;
+ return 0;
+}
+
+/* Sorting functions */
+
+#ifdef DEBUG
+# define xfarray_sort_bump_loads(si) do { (si)->loads++; } while (0)
+# define xfarray_sort_bump_stores(si) do { (si)->stores++; } while (0)
+# define xfarray_sort_bump_compares(si) do { (si)->compares++; } while (0)
+# define xfarray_sort_bump_heapsorts(si) do { (si)->heapsorts++; } while (0)
+#else
+# define xfarray_sort_bump_loads(si)
+# define xfarray_sort_bump_stores(si)
+# define xfarray_sort_bump_compares(si)
+# define xfarray_sort_bump_heapsorts(si)
+#endif /* DEBUG */
+
+/* Load an array element for sorting. */
+static inline int
+xfarray_sort_load(
+ struct xfarray_sortinfo *si,
+ xfarray_idx_t idx,
+ void *ptr)
+{
+ xfarray_sort_bump_loads(si);
+ return xfarray_load(si->array, idx, ptr);
+}
+
+/* Store an array element for sorting. */
+static inline int
+xfarray_sort_store(
+ struct xfarray_sortinfo *si,
+ xfarray_idx_t idx,
+ void *ptr)
+{
+ xfarray_sort_bump_stores(si);
+ return xfarray_store(si->array, idx, ptr);
+}
+
+/* Compare an array element for sorting. */
+static inline int
+xfarray_sort_cmp(
+ struct xfarray_sortinfo *si,
+ const void *a,
+ const void *b)
+{
+ xfarray_sort_bump_compares(si);
+ return si->cmp_fn(a, b);
+}
+
+/* Return a pointer to the low index stack for quicksort partitioning. */
+static inline xfarray_idx_t *xfarray_sortinfo_lo(struct xfarray_sortinfo *si)
+{
+ return (xfarray_idx_t *)(si + 1);
+}
+
+/* Return a pointer to the high index stack for quicksort partitioning. */
+static inline xfarray_idx_t *xfarray_sortinfo_hi(struct xfarray_sortinfo *si)
+{
+ return xfarray_sortinfo_lo(si) + si->max_stack_depth;
+}
+
+/* Size of each element in the quicksort pivot array. */
+static inline size_t
+xfarray_pivot_rec_sz(
+ struct xfarray *array)
+{
+ return round_up(array->obj_size, 8) + sizeof(xfarray_idx_t);
+}
+
+/* Allocate memory to handle the sort. */
+static inline int
+xfarray_sortinfo_alloc(
+ struct xfarray *array,
+ xfarray_cmp_fn cmp_fn,
+ unsigned int flags,
+ struct xfarray_sortinfo **infop)
+{
+ struct xfarray_sortinfo *si;
+ size_t nr_bytes = sizeof(struct xfarray_sortinfo);
+ size_t pivot_rec_sz = xfarray_pivot_rec_sz(array);
+ int max_stack_depth;
+
+ /*
+ * The median-of-nine pivot algorithm doesn't work if a subset has
+ * fewer than 9 items. Make sure the in-memory sort will always take
+ * over for subsets where this wouldn't be the case.
+ */
+ BUILD_BUG_ON(XFARRAY_QSORT_PIVOT_NR >= XFARRAY_ISORT_NR);
+
+ /*
+ * Tail-call recursion during the partitioning phase means that
+ * quicksort will never recurse more than log2(nr) times. We need one
+ * extra level of stack to hold the initial parameters. In-memory
+ * sort will always take care of the last few levels of recursion for
+ * us, so we can reduce the stack depth by that much.
+ */
+ max_stack_depth = ilog2(array->nr) + 1 - (XFARRAY_ISORT_SHIFT - 1);
+ if (max_stack_depth < 1)
+ max_stack_depth = 1;
+
+ /* Each level of quicksort uses a lo and a hi index */
+ nr_bytes += max_stack_depth * sizeof(xfarray_idx_t) * 2;
+
+ /* Scratchpad for in-memory sort, or finding the pivot */
+ nr_bytes += max_t(size_t,
+ (XFARRAY_QSORT_PIVOT_NR + 1) * pivot_rec_sz,
+ XFARRAY_ISORT_NR * array->obj_size);
+
+ si = kvzalloc(nr_bytes, XCHK_GFP_FLAGS);
+ if (!si)
+ return -ENOMEM;
+
+ si->array = array;
+ si->cmp_fn = cmp_fn;
+ si->flags = flags;
+ si->max_stack_depth = max_stack_depth;
+ si->max_stack_used = 1;
+
+ xfarray_sortinfo_lo(si)[0] = 0;
+ xfarray_sortinfo_hi(si)[0] = array->nr - 1;
+
+ trace_xfarray_sort(si, nr_bytes);
+ *infop = si;
+ return 0;
+}
+
+/* Should this sort be terminated by a fatal signal? */
+static inline bool
+xfarray_sort_terminated(
+ struct xfarray_sortinfo *si,
+ int *error)
+{
+ /*
+ * If preemption is disabled, we need to yield to the scheduler every
+ * few seconds so that we don't run afoul of the soft lockup watchdog
+ * or RCU stall detector.
+ */
+ cond_resched();
+
+ if ((si->flags & XFARRAY_SORT_KILLABLE) &&
+ fatal_signal_pending(current)) {
+ if (*error == 0)
+ *error = -EINTR;
+ return true;
+ }
+ return false;
+}
+
+/* Do we want an in-memory sort? */
+static inline bool
+xfarray_want_isort(
+ struct xfarray_sortinfo *si,
+ xfarray_idx_t start,
+ xfarray_idx_t end)
+{
+ /*
+ * For array subsets that fit in the scratchpad, it's much faster to
+ * use the kernel's heapsort than quicksort's stack machine.
+ */
+ return (end - start) < XFARRAY_ISORT_NR;
+}
+
+/* Return the scratch space within the sortinfo structure. */
+static inline void *xfarray_sortinfo_isort_scratch(struct xfarray_sortinfo *si)
+{
+ return xfarray_sortinfo_hi(si) + si->max_stack_depth;
+}
+
+/*
+ * Sort a small number of array records using scratchpad memory. The records
+ * need not be contiguous in the xfile's memory pages.
+ */
+STATIC int
+xfarray_isort(
+ struct xfarray_sortinfo *si,
+ xfarray_idx_t lo,
+ xfarray_idx_t hi)
+{
+ void *scratch = xfarray_sortinfo_isort_scratch(si);
+ loff_t lo_pos = xfarray_pos(si->array, lo);
+ loff_t len = xfarray_pos(si->array, hi - lo + 1);
+ int error;
+
+ trace_xfarray_isort(si, lo, hi);
+
+ xfarray_sort_bump_loads(si);
+ error = xfile_obj_load(si->array->xfile, scratch, len, lo_pos);
+ if (error)
+ return error;
+
+ xfarray_sort_bump_heapsorts(si);
+ sort(scratch, hi - lo + 1, si->array->obj_size, si->cmp_fn, NULL);
+
+ xfarray_sort_bump_stores(si);
+ return xfile_obj_store(si->array->xfile, scratch, len, lo_pos);
+}
+
+/* Grab a page for sorting records. */
+static inline int
+xfarray_sort_get_page(
+ struct xfarray_sortinfo *si,
+ loff_t pos,
+ uint64_t len)
+{
+ int error;
+
+ error = xfile_get_page(si->array->xfile, pos, len, &si->xfpage);
+ if (error)
+ return error;
+
+ /*
+ * xfile pages must never be mapped into userspace, so we skip the
+ * dcache flush when mapping the page.
+ */
+ si->page_kaddr = kmap_local_page(si->xfpage.page);
+ return 0;
+}
+
+/* Release a page we grabbed for sorting records. */
+static inline int
+xfarray_sort_put_page(
+ struct xfarray_sortinfo *si)
+{
+ if (!si->page_kaddr)
+ return 0;
+
+ kunmap_local(si->page_kaddr);
+ si->page_kaddr = NULL;
+
+ return xfile_put_page(si->array->xfile, &si->xfpage);
+}
+
+/* Decide if these records are eligible for in-page sorting. */
+static inline bool
+xfarray_want_pagesort(
+ struct xfarray_sortinfo *si,
+ xfarray_idx_t lo,
+ xfarray_idx_t hi)
+{
+ pgoff_t lo_page;
+ pgoff_t hi_page;
+ loff_t end_pos;
+
+ /* We can only map one page at a time. */
+ lo_page = xfarray_pos(si->array, lo) >> PAGE_SHIFT;
+ end_pos = xfarray_pos(si->array, hi) + si->array->obj_size - 1;
+ hi_page = end_pos >> PAGE_SHIFT;
+
+ return lo_page == hi_page;
+}
+
+/* Sort a bunch of records that all live in the same memory page. */
+STATIC int
+xfarray_pagesort(
+ struct xfarray_sortinfo *si,
+ xfarray_idx_t lo,
+ xfarray_idx_t hi)
+{
+ void *startp;
+ loff_t lo_pos = xfarray_pos(si->array, lo);
+ uint64_t len = xfarray_pos(si->array, hi - lo);
+ int error = 0;
+
+ trace_xfarray_pagesort(si, lo, hi);
+
+ xfarray_sort_bump_loads(si);
+ error = xfarray_sort_get_page(si, lo_pos, len);
+ if (error)
+ return error;
+
+ xfarray_sort_bump_heapsorts(si);
+ startp = si->page_kaddr + offset_in_page(lo_pos);
+ sort(startp, hi - lo + 1, si->array->obj_size, si->cmp_fn, NULL);
+
+ xfarray_sort_bump_stores(si);
+ return xfarray_sort_put_page(si);
+}
+
+/* Return a pointer to the xfarray pivot record within the sortinfo struct. */
+static inline void *xfarray_sortinfo_pivot(struct xfarray_sortinfo *si)
+{
+ return xfarray_sortinfo_hi(si) + si->max_stack_depth;
+}
+
+/* Return a pointer to the start of the pivot array. */
+static inline void *
+xfarray_sortinfo_pivot_array(
+ struct xfarray_sortinfo *si)
+{
+ return xfarray_sortinfo_pivot(si) + si->array->obj_size;
+}
+
+/* The xfarray record is stored at the start of each pivot array element. */
+static inline void *
+xfarray_pivot_array_rec(
+ void *pa,
+ size_t pa_recsz,
+ unsigned int pa_idx)
+{
+ return pa + (pa_recsz * pa_idx);
+}
+
+/* The xfarray index is stored at the end of each pivot array element. */
+static inline xfarray_idx_t *
+xfarray_pivot_array_idx(
+ void *pa,
+ size_t pa_recsz,
+ unsigned int pa_idx)
+{
+ return xfarray_pivot_array_rec(pa, pa_recsz, pa_idx + 1) -
+ sizeof(xfarray_idx_t);
+}
+
+/*
+ * Find a pivot value for quicksort partitioning, swap it with a[lo], and save
+ * the cached pivot record for the next step.
+ *
+ * Load evenly-spaced records within the given range into memory, sort them,
+ * and choose the pivot from the median record. Using multiple points will
+ * improve the quality of the pivot selection, and hopefully avoid the worst
+ * quicksort behavior, since our array values are nearly always evenly sorted.
+ */
+STATIC int
+xfarray_qsort_pivot(
+ struct xfarray_sortinfo *si,
+ xfarray_idx_t lo,
+ xfarray_idx_t hi)
+{
+ void *pivot = xfarray_sortinfo_pivot(si);
+ void *parray = xfarray_sortinfo_pivot_array(si);
+ void *recp;
+ xfarray_idx_t *idxp;
+ xfarray_idx_t step = (hi - lo) / (XFARRAY_QSORT_PIVOT_NR - 1);
+ size_t pivot_rec_sz = xfarray_pivot_rec_sz(si->array);
+ int i, j;
+ int error;
+
+ ASSERT(step > 0);
+
+ /*
+ * Load the xfarray indexes of the records we intend to sample into the
+ * pivot array.
+ */
+ idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz, 0);
+ *idxp = lo;
+ for (i = 1; i < XFARRAY_QSORT_PIVOT_NR - 1; i++) {
+ idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz, i);
+ *idxp = lo + (i * step);
+ }
+ idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz,
+ XFARRAY_QSORT_PIVOT_NR - 1);
+ *idxp = hi;
+
+ /* Load the selected xfarray records into the pivot array. */
+ for (i = 0; i < XFARRAY_QSORT_PIVOT_NR; i++) {
+ xfarray_idx_t idx;
+
+ recp = xfarray_pivot_array_rec(parray, pivot_rec_sz, i);
+ idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz, i);
+
+ /* No unset records; load directly into the array. */
+ if (likely(si->array->unset_slots == 0)) {
+ error = xfarray_sort_load(si, *idxp, recp);
+ if (error)
+ return error;
+ continue;
+ }
+
+ /*
+ * Load non-null records into the scratchpad without changing
+ * the xfarray_idx_t in the pivot array.
+ */
+ idx = *idxp;
+ xfarray_sort_bump_loads(si);
+ error = xfarray_load_next(si->array, &idx, recp);
+ if (error)
+ return error;
+ }
+
+ xfarray_sort_bump_heapsorts(si);
+ sort(parray, XFARRAY_QSORT_PIVOT_NR, pivot_rec_sz, si->cmp_fn, NULL);
+
+ /*
+ * We sorted the pivot array records (which includes the xfarray
+ * indices) in xfarray record order. The median element of the pivot
+ * array contains the xfarray record that we will use as the pivot.
+ * Copy that xfarray record to the designated space.
+ */
+ recp = xfarray_pivot_array_rec(parray, pivot_rec_sz,
+ XFARRAY_QSORT_PIVOT_NR / 2);
+ memcpy(pivot, recp, si->array->obj_size);
+
+ /* If the pivot record we chose was already in a[lo] then we're done. */
+ idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz,
+ XFARRAY_QSORT_PIVOT_NR / 2);
+ if (*idxp == lo)
+ return 0;
+
+ /*
+ * Find the cached copy of a[lo] in the pivot array so that we can swap
+ * a[lo] and a[pivot].
+ */
+ for (i = 0, j = -1; i < XFARRAY_QSORT_PIVOT_NR; i++) {
+ idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz, i);
+ if (*idxp == lo)
+ j = i;
+ }
+ if (j < 0) {
+ ASSERT(j >= 0);
+ return -EFSCORRUPTED;
+ }
+
+ /* Swap a[lo] and a[pivot]. */
+ error = xfarray_sort_store(si, lo, pivot);
+ if (error)
+ return error;
+
+ recp = xfarray_pivot_array_rec(parray, pivot_rec_sz, j);
+ idxp = xfarray_pivot_array_idx(parray, pivot_rec_sz,
+ XFARRAY_QSORT_PIVOT_NR / 2);
+ return xfarray_sort_store(si, *idxp, recp);
+}
+
+/*
+ * Set up the pointers for the next iteration. We push onto the stack all of
+ * the unsorted values between a[lo + 1] and a[end[i]], and we tweak the
+ * current stack frame to point to the unsorted values between a[beg[i]] and
+ * a[lo] so that those values will be sorted when we pop the stack.
+ */
+static inline int
+xfarray_qsort_push(
+ struct xfarray_sortinfo *si,
+ xfarray_idx_t *si_lo,
+ xfarray_idx_t *si_hi,
+ xfarray_idx_t lo,
+ xfarray_idx_t hi)
+{
+ /* Check for stack overflows */
+ if (si->stack_depth >= si->max_stack_depth - 1) {
+ ASSERT(si->stack_depth < si->max_stack_depth - 1);
+ return -EFSCORRUPTED;
+ }
+
+ si->max_stack_used = max_t(uint8_t, si->max_stack_used,
+ si->stack_depth + 2);
+
+ si_lo[si->stack_depth + 1] = lo + 1;
+ si_hi[si->stack_depth + 1] = si_hi[si->stack_depth];
+ si_hi[si->stack_depth++] = lo - 1;
+
+ /*
+ * Always start with the smaller of the two partitions to keep the
+ * amount of recursion in check.
+ */
+ if (si_hi[si->stack_depth] - si_lo[si->stack_depth] >
+ si_hi[si->stack_depth - 1] - si_lo[si->stack_depth - 1]) {
+ swap(si_lo[si->stack_depth], si_lo[si->stack_depth - 1]);
+ swap(si_hi[si->stack_depth], si_hi[si->stack_depth - 1]);
+ }
+
+ return 0;
+}
+
+/*
+ * Load an element from the array into the first scratchpad and cache the page,
+ * if possible.
+ */
+static inline int
+xfarray_sort_load_cached(
+ struct xfarray_sortinfo *si,
+ xfarray_idx_t idx,
+ void *ptr)
+{
+ loff_t idx_pos = xfarray_pos(si->array, idx);
+ pgoff_t startpage;
+ pgoff_t endpage;
+ int error = 0;
+
+ /*
+ * If this load would split a page, release the cached page, if any,
+ * and perform a traditional read.
+ */
+ startpage = idx_pos >> PAGE_SHIFT;
+ endpage = (idx_pos + si->array->obj_size - 1) >> PAGE_SHIFT;
+ if (startpage != endpage) {
+ error = xfarray_sort_put_page(si);
+ if (error)
+ return error;
+
+ if (xfarray_sort_terminated(si, &error))
+ return error;
+
+ return xfile_obj_load(si->array->xfile, ptr,
+ si->array->obj_size, idx_pos);
+ }
+
+ /* If the cached page is not the one we want, release it. */
+ if (xfile_page_cached(&si->xfpage) &&
+ xfile_page_index(&si->xfpage) != startpage) {
+ error = xfarray_sort_put_page(si);
+ if (error)
+ return error;
+ }
+
+ /*
+ * If we don't have a cached page (and we know the load is contained
+ * in a single page) then grab it.
+ */
+ if (!xfile_page_cached(&si->xfpage)) {
+ if (xfarray_sort_terminated(si, &error))
+ return error;
+
+ error = xfarray_sort_get_page(si, startpage << PAGE_SHIFT,
+ PAGE_SIZE);
+ if (error)
+ return error;
+ }
+
+ memcpy(ptr, si->page_kaddr + offset_in_page(idx_pos),
+ si->array->obj_size);
+ return 0;
+}
+
+/*
+ * Sort the array elements via quicksort. This implementation incorporates
+ * four optimizations discussed in Sedgewick:
+ *
+ * 1. Use an explicit stack of array indices to store the next array partition
+ * to sort. This helps us to avoid recursion in the call stack, which is
+ * particularly expensive in the kernel.
+ *
+ * 2. For arrays with records in arbitrary or user-controlled order, choose the
+ * pivot element using a median-of-nine decision tree. This reduces the
+ * probability of selecting a bad pivot value which causes worst case
+ * behavior (i.e. partition sizes of 1).
+ *
+ * 3. The smaller of the two sub-partitions is pushed onto the stack to start
+ * the next level of recursion, and the larger sub-partition replaces the
+ * current stack frame. This guarantees that we won't need more than
+ * log2(nr) stack space.
+ *
+ * 4. For small sets, load the records into the scratchpad and run heapsort on
+ * them because that is very fast. In the author's experience, this yields
+ * a ~10% reduction in runtime.
+ *
+ * If a small set is contained entirely within a single xfile memory page,
+ * map the page directly and run heap sort directly on the xfile page
+ * instead of using the load/store interface. This halves the runtime.
+ *
+ * 5. This optimization is specific to the implementation. When converging lo
+ * and hi after selecting a pivot, we will try to retain the xfile memory
+ * page between load calls, which reduces run time by 50%.
+ */
+
+/*
+ * Due to the use of signed indices, we can only support up to 2^63 records.
+ * Files can only grow to 2^63 bytes, so this is not much of a limitation.
+ */
+#define QSORT_MAX_RECS (1ULL << 63)
+
+int
+xfarray_sort(
+ struct xfarray *array,
+ xfarray_cmp_fn cmp_fn,
+ unsigned int flags)
+{
+ struct xfarray_sortinfo *si;
+ xfarray_idx_t *si_lo, *si_hi;
+ void *pivot;
+ void *scratch = xfarray_scratch(array);
+ xfarray_idx_t lo, hi;
+ int error = 0;
+
+ if (array->nr < 2)
+ return 0;
+ if (array->nr >= QSORT_MAX_RECS)
+ return -E2BIG;
+
+ error = xfarray_sortinfo_alloc(array, cmp_fn, flags, &si);
+ if (error)
+ return error;
+ si_lo = xfarray_sortinfo_lo(si);
+ si_hi = xfarray_sortinfo_hi(si);
+ pivot = xfarray_sortinfo_pivot(si);
+
+ while (si->stack_depth >= 0) {
+ lo = si_lo[si->stack_depth];
+ hi = si_hi[si->stack_depth];
+
+ trace_xfarray_qsort(si, lo, hi);
+
+ /* Nothing left in this partition to sort; pop stack. */
+ if (lo >= hi) {
+ si->stack_depth--;
+ continue;
+ }
+
+ /*
+ * If directly mapping the page and sorting can solve our
+ * problems, we're done.
+ */
+ if (xfarray_want_pagesort(si, lo, hi)) {
+ error = xfarray_pagesort(si, lo, hi);
+ if (error)
+ goto out_free;
+ si->stack_depth--;
+ continue;
+ }
+
+ /* If insertion sort can solve our problems, we're done. */
+ if (xfarray_want_isort(si, lo, hi)) {
+ error = xfarray_isort(si, lo, hi);
+ if (error)
+ goto out_free;
+ si->stack_depth--;
+ continue;
+ }
+
+ /* Pick a pivot, move it to a[lo] and stash it. */
+ error = xfarray_qsort_pivot(si, lo, hi);
+ if (error)
+ goto out_free;
+
+ /*
+ * Rearrange a[lo..hi] such that everything smaller than the
+ * pivot is on the left side of the range and everything larger
+ * than the pivot is on the right side of the range.
+ */
+ while (lo < hi) {
+ /*
+ * Decrement hi until it finds an a[hi] less than the
+ * pivot value.
+ */
+ error = xfarray_sort_load_cached(si, hi, scratch);
+ if (error)
+ goto out_free;
+ while (xfarray_sort_cmp(si, scratch, pivot) >= 0 &&
+ lo < hi) {
+ hi--;
+ error = xfarray_sort_load_cached(si, hi,
+ scratch);
+ if (error)
+ goto out_free;
+ }
+ error = xfarray_sort_put_page(si);
+ if (error)
+ goto out_free;
+
+ if (xfarray_sort_terminated(si, &error))
+ goto out_free;
+
+ /* Copy that item (a[hi]) to a[lo]. */
+ if (lo < hi) {
+ error = xfarray_sort_store(si, lo++, scratch);
+ if (error)
+ goto out_free;
+ }
+
+ /*
+ * Increment lo until it finds an a[lo] greater than
+ * the pivot value.
+ */
+ error = xfarray_sort_load_cached(si, lo, scratch);
+ if (error)
+ goto out_free;
+ while (xfarray_sort_cmp(si, scratch, pivot) <= 0 &&
+ lo < hi) {
+ lo++;
+ error = xfarray_sort_load_cached(si, lo,
+ scratch);
+ if (error)
+ goto out_free;
+ }
+ error = xfarray_sort_put_page(si);
+ if (error)
+ goto out_free;
+
+ if (xfarray_sort_terminated(si, &error))
+ goto out_free;
+
+ /* Copy that item (a[lo]) to a[hi]. */
+ if (lo < hi) {
+ error = xfarray_sort_store(si, hi--, scratch);
+ if (error)
+ goto out_free;
+ }
+
+ if (xfarray_sort_terminated(si, &error))
+ goto out_free;
+ }
+
+ /*
+ * Put our pivot value in the correct place at a[lo]. All
+ * values between a[beg[i]] and a[lo - 1] should be less than
+ * the pivot; and all values between a[lo + 1] and a[end[i]-1]
+ * should be greater than the pivot.
+ */
+ error = xfarray_sort_store(si, lo, pivot);
+ if (error)
+ goto out_free;
+
+ /* Set up the stack frame to process the two partitions. */
+ error = xfarray_qsort_push(si, si_lo, si_hi, lo, hi);
+ if (error)
+ goto out_free;
+
+ if (xfarray_sort_terminated(si, &error))
+ goto out_free;
+ }
+
+out_free:
+ trace_xfarray_sort_stats(si, error);
+ kvfree(si);
+ return error;
+}
diff --git a/fs/xfs/scrub/xfarray.h b/fs/xfs/scrub/xfarray.h
new file mode 100644
index 000000000000..4ecac01363d9
--- /dev/null
+++ b/fs/xfs/scrub/xfarray.h
@@ -0,0 +1,141 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2021-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_XFARRAY_H__
+#define __XFS_SCRUB_XFARRAY_H__
+
+/* xfile array index type, along with cursor initialization */
+typedef uint64_t xfarray_idx_t;
+#define XFARRAY_CURSOR_INIT ((__force xfarray_idx_t)0)
+
+/* Iterate each index of an xfile array. */
+#define foreach_xfarray_idx(array, idx) \
+ for ((idx) = XFARRAY_CURSOR_INIT; \
+ (idx) < xfarray_length(array); \
+ (idx)++)
+
+struct xfarray {
+ /* Underlying file that backs the array. */
+ struct xfile *xfile;
+
+ /* Number of array elements. */
+ xfarray_idx_t nr;
+
+ /* Maximum possible array size. */
+ xfarray_idx_t max_nr;
+
+ /* Number of unset slots in the array below @nr. */
+ uint64_t unset_slots;
+
+ /* Size of an array element. */
+ size_t obj_size;
+
+ /* log2 of array element size, if possible. */
+ int obj_size_log;
+};
+
+int xfarray_create(const char *descr, unsigned long long required_capacity,
+ size_t obj_size, struct xfarray **arrayp);
+void xfarray_destroy(struct xfarray *array);
+int xfarray_load(struct xfarray *array, xfarray_idx_t idx, void *ptr);
+int xfarray_unset(struct xfarray *array, xfarray_idx_t idx);
+int xfarray_store(struct xfarray *array, xfarray_idx_t idx, const void *ptr);
+int xfarray_store_anywhere(struct xfarray *array, const void *ptr);
+bool xfarray_element_is_null(struct xfarray *array, const void *ptr);
+
+/* Append an element to the array. */
+static inline int xfarray_append(struct xfarray *array, const void *ptr)
+{
+ return xfarray_store(array, array->nr, ptr);
+}
+
+uint64_t xfarray_length(struct xfarray *array);
+int xfarray_load_next(struct xfarray *array, xfarray_idx_t *idx, void *rec);
+
+/* Declarations for xfile array sort functionality. */
+
+typedef cmp_func_t xfarray_cmp_fn;
+
+/* Perform an in-memory heapsort for small subsets. */
+#define XFARRAY_ISORT_SHIFT (4)
+#define XFARRAY_ISORT_NR (1U << XFARRAY_ISORT_SHIFT)
+
+/* Evalulate this many points to find the qsort pivot. */
+#define XFARRAY_QSORT_PIVOT_NR (9)
+
+struct xfarray_sortinfo {
+ struct xfarray *array;
+
+ /* Comparison function for the sort. */
+ xfarray_cmp_fn cmp_fn;
+
+ /* Maximum height of the partition stack. */
+ uint8_t max_stack_depth;
+
+ /* Current height of the partition stack. */
+ int8_t stack_depth;
+
+ /* Maximum stack depth ever used. */
+ uint8_t max_stack_used;
+
+ /* XFARRAY_SORT_* flags; see below. */
+ unsigned int flags;
+
+ /* Cache a page here for faster access. */
+ struct xfile_page xfpage;
+ void *page_kaddr;
+
+#ifdef DEBUG
+ /* Performance statistics. */
+ uint64_t loads;
+ uint64_t stores;
+ uint64_t compares;
+ uint64_t heapsorts;
+#endif
+ /*
+ * Extra bytes are allocated beyond the end of the structure to store
+ * quicksort information. C does not permit multiple VLAs per struct,
+ * so we document all of this in a comment.
+ *
+ * Pretend that we have a typedef for array records:
+ *
+ * typedef char[array->obj_size] xfarray_rec_t;
+ *
+ * First comes the quicksort partition stack:
+ *
+ * xfarray_idx_t lo[max_stack_depth];
+ * xfarray_idx_t hi[max_stack_depth];
+ *
+ * union {
+ *
+ * If for a given subset we decide to use an in-memory sort, we use a
+ * block of scratchpad records here to compare items:
+ *
+ * xfarray_rec_t scratch[ISORT_NR];
+ *
+ * Otherwise, we want to partition the records to partition the array.
+ * We store the chosen pivot record at the start of the scratchpad area
+ * and use the rest to sample some records to estimate the median.
+ * The format of the qsort_pivot array enables us to use the kernel
+ * heapsort function to place the median value in the middle.
+ *
+ * struct {
+ * xfarray_rec_t pivot;
+ * struct {
+ * xfarray_rec_t rec; (rounded up to 8 bytes)
+ * xfarray_idx_t idx;
+ * } qsort_pivot[QSORT_PIVOT_NR];
+ * };
+ * }
+ */
+};
+
+/* Sort can be interrupted by a fatal signal. */
+#define XFARRAY_SORT_KILLABLE (1U << 0)
+
+int xfarray_sort(struct xfarray *array, xfarray_cmp_fn cmp_fn,
+ unsigned int flags);
+
+#endif /* __XFS_SCRUB_XFARRAY_H__ */
diff --git a/fs/xfs/scrub/xfile.c b/fs/xfs/scrub/xfile.c
new file mode 100644
index 000000000000..090c3ead43fd
--- /dev/null
+++ b/fs/xfs/scrub/xfile.c
@@ -0,0 +1,419 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2018-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "scrub/xfile.h"
+#include "scrub/xfarray.h"
+#include "scrub/scrub.h"
+#include "scrub/trace.h"
+#include <linux/shmem_fs.h>
+
+/*
+ * Swappable Temporary Memory
+ * ==========================
+ *
+ * Online checking sometimes needs to be able to stage a large amount of data
+ * in memory. This information might not fit in the available memory and it
+ * doesn't all need to be accessible at all times. In other words, we want an
+ * indexed data buffer to store data that can be paged out.
+ *
+ * When CONFIG_TMPFS=y, shmemfs is enough of a filesystem to meet those
+ * requirements. Therefore, the xfile mechanism uses an unlinked shmem file to
+ * store our staging data. This file is not installed in the file descriptor
+ * table so that user programs cannot access the data, which means that the
+ * xfile must be freed with xfile_destroy.
+ *
+ * xfiles assume that the caller will handle all required concurrency
+ * management; standard vfs locks (freezer and inode) are not taken. Reads
+ * and writes are satisfied directly from the page cache.
+ *
+ * NOTE: The current shmemfs implementation has a quirk that in-kernel reads
+ * of a hole cause a page to be mapped into the file. If you are going to
+ * create a sparse xfile, please be careful about reading from uninitialized
+ * parts of the file. These pages are !Uptodate and will eventually be
+ * reclaimed if not written, but in the short term this boosts memory
+ * consumption.
+ */
+
+/*
+ * xfiles must not be exposed to userspace and require upper layers to
+ * coordinate access to the one handle returned by the constructor, so
+ * establish a separate lock class for xfiles to avoid confusing lockdep.
+ */
+static struct lock_class_key xfile_i_mutex_key;
+
+/*
+ * Create an xfile of the given size. The description will be used in the
+ * trace output.
+ */
+int
+xfile_create(
+ const char *description,
+ loff_t isize,
+ struct xfile **xfilep)
+{
+ struct inode *inode;
+ struct xfile *xf;
+ int error = -ENOMEM;
+
+ xf = kmalloc(sizeof(struct xfile), XCHK_GFP_FLAGS);
+ if (!xf)
+ return -ENOMEM;
+
+ xf->file = shmem_file_setup(description, isize, 0);
+ if (!xf->file)
+ goto out_xfile;
+ if (IS_ERR(xf->file)) {
+ error = PTR_ERR(xf->file);
+ goto out_xfile;
+ }
+
+ /*
+ * We want a large sparse file that we can pread, pwrite, and seek.
+ * xfile users are responsible for keeping the xfile hidden away from
+ * all other callers, so we skip timestamp updates and security checks.
+ * Make the inode only accessible by root, just in case the xfile ever
+ * escapes.
+ */
+ xf->file->f_mode |= FMODE_PREAD | FMODE_PWRITE | FMODE_NOCMTIME |
+ FMODE_LSEEK;
+ xf->file->f_flags |= O_RDWR | O_LARGEFILE | O_NOATIME;
+ inode = file_inode(xf->file);
+ inode->i_flags |= S_PRIVATE | S_NOCMTIME | S_NOATIME;
+ inode->i_mode &= ~0177;
+ inode->i_uid = GLOBAL_ROOT_UID;
+ inode->i_gid = GLOBAL_ROOT_GID;
+
+ lockdep_set_class(&inode->i_rwsem, &xfile_i_mutex_key);
+
+ trace_xfile_create(xf);
+
+ *xfilep = xf;
+ return 0;
+out_xfile:
+ kfree(xf);
+ return error;
+}
+
+/* Close the file and release all resources. */
+void
+xfile_destroy(
+ struct xfile *xf)
+{
+ struct inode *inode = file_inode(xf->file);
+
+ trace_xfile_destroy(xf);
+
+ lockdep_set_class(&inode->i_rwsem, &inode->i_sb->s_type->i_mutex_key);
+ fput(xf->file);
+ kfree(xf);
+}
+
+/*
+ * Read a memory object directly from the xfile's page cache. Unlike regular
+ * pread, we return -E2BIG and -EFBIG for reads that are too large or at too
+ * high an offset, instead of truncating the read. Otherwise, we return
+ * bytes read or an error code, like regular pread.
+ */
+ssize_t
+xfile_pread(
+ struct xfile *xf,
+ void *buf,
+ size_t count,
+ loff_t pos)
+{
+ struct inode *inode = file_inode(xf->file);
+ struct address_space *mapping = inode->i_mapping;
+ struct page *page = NULL;
+ ssize_t read = 0;
+ unsigned int pflags;
+ int error = 0;
+
+ if (count > MAX_RW_COUNT)
+ return -E2BIG;
+ if (inode->i_sb->s_maxbytes - pos < count)
+ return -EFBIG;
+
+ trace_xfile_pread(xf, pos, count);
+
+ pflags = memalloc_nofs_save();
+ while (count > 0) {
+ void *p, *kaddr;
+ unsigned int len;
+
+ len = min_t(ssize_t, count, PAGE_SIZE - offset_in_page(pos));
+
+ /*
+ * In-kernel reads of a shmem file cause it to allocate a page
+ * if the mapping shows a hole. Therefore, if we hit ENOMEM
+ * we can continue by zeroing the caller's buffer.
+ */
+ page = shmem_read_mapping_page_gfp(mapping, pos >> PAGE_SHIFT,
+ __GFP_NOWARN);
+ if (IS_ERR(page)) {
+ error = PTR_ERR(page);
+ if (error != -ENOMEM)
+ break;
+
+ memset(buf, 0, len);
+ goto advance;
+ }
+
+ if (PageUptodate(page)) {
+ /*
+ * xfile pages must never be mapped into userspace, so
+ * we skip the dcache flush.
+ */
+ kaddr = kmap_local_page(page);
+ p = kaddr + offset_in_page(pos);
+ memcpy(buf, p, len);
+ kunmap_local(kaddr);
+ } else {
+ memset(buf, 0, len);
+ }
+ put_page(page);
+
+advance:
+ count -= len;
+ pos += len;
+ buf += len;
+ read += len;
+ }
+ memalloc_nofs_restore(pflags);
+
+ if (read > 0)
+ return read;
+ return error;
+}
+
+/*
+ * Write a memory object directly to the xfile's page cache. Unlike regular
+ * pwrite, we return -E2BIG and -EFBIG for writes that are too large or at too
+ * high an offset, instead of truncating the write. Otherwise, we return
+ * bytes written or an error code, like regular pwrite.
+ */
+ssize_t
+xfile_pwrite(
+ struct xfile *xf,
+ const void *buf,
+ size_t count,
+ loff_t pos)
+{
+ struct inode *inode = file_inode(xf->file);
+ struct address_space *mapping = inode->i_mapping;
+ const struct address_space_operations *aops = mapping->a_ops;
+ struct page *page = NULL;
+ ssize_t written = 0;
+ unsigned int pflags;
+ int error = 0;
+
+ if (count > MAX_RW_COUNT)
+ return -E2BIG;
+ if (inode->i_sb->s_maxbytes - pos < count)
+ return -EFBIG;
+
+ trace_xfile_pwrite(xf, pos, count);
+
+ pflags = memalloc_nofs_save();
+ while (count > 0) {
+ void *fsdata = NULL;
+ void *p, *kaddr;
+ unsigned int len;
+ int ret;
+
+ len = min_t(ssize_t, count, PAGE_SIZE - offset_in_page(pos));
+
+ /*
+ * We call write_begin directly here to avoid all the freezer
+ * protection lock-taking that happens in the normal path.
+ * shmem doesn't support fs freeze, but lockdep doesn't know
+ * that and will trip over that.
+ */
+ error = aops->write_begin(NULL, mapping, pos, len, &page,
+ &fsdata);
+ if (error)
+ break;
+
+ /*
+ * xfile pages must never be mapped into userspace, so we skip
+ * the dcache flush. If the page is not uptodate, zero it
+ * before writing data.
+ */
+ kaddr = kmap_local_page(page);
+ if (!PageUptodate(page)) {
+ memset(kaddr, 0, PAGE_SIZE);
+ SetPageUptodate(page);
+ }
+ p = kaddr + offset_in_page(pos);
+ memcpy(p, buf, len);
+ kunmap_local(kaddr);
+
+ ret = aops->write_end(NULL, mapping, pos, len, len, page,
+ fsdata);
+ if (ret < 0) {
+ error = ret;
+ break;
+ }
+
+ written += ret;
+ if (ret != len)
+ break;
+
+ count -= ret;
+ pos += ret;
+ buf += ret;
+ }
+ memalloc_nofs_restore(pflags);
+
+ if (written > 0)
+ return written;
+ return error;
+}
+
+/* Find the next written area in the xfile data for a given offset. */
+loff_t
+xfile_seek_data(
+ struct xfile *xf,
+ loff_t pos)
+{
+ loff_t ret;
+
+ ret = vfs_llseek(xf->file, pos, SEEK_DATA);
+ trace_xfile_seek_data(xf, pos, ret);
+ return ret;
+}
+
+/* Query stat information for an xfile. */
+int
+xfile_stat(
+ struct xfile *xf,
+ struct xfile_stat *statbuf)
+{
+ struct kstat ks;
+ int error;
+
+ error = vfs_getattr_nosec(&xf->file->f_path, &ks,
+ STATX_SIZE | STATX_BLOCKS, AT_STATX_DONT_SYNC);
+ if (error)
+ return error;
+
+ statbuf->size = ks.size;
+ statbuf->bytes = ks.blocks << SECTOR_SHIFT;
+ return 0;
+}
+
+/*
+ * Grab the (locked) page for a memory object. The object cannot span a page
+ * boundary. Returns 0 (and a locked page) if successful, -ENOTBLK if we
+ * cannot grab the page, or the usual negative errno.
+ */
+int
+xfile_get_page(
+ struct xfile *xf,
+ loff_t pos,
+ unsigned int len,
+ struct xfile_page *xfpage)
+{
+ struct inode *inode = file_inode(xf->file);
+ struct address_space *mapping = inode->i_mapping;
+ const struct address_space_operations *aops = mapping->a_ops;
+ struct page *page = NULL;
+ void *fsdata = NULL;
+ loff_t key = round_down(pos, PAGE_SIZE);
+ unsigned int pflags;
+ int error;
+
+ if (inode->i_sb->s_maxbytes - pos < len)
+ return -ENOMEM;
+ if (len > PAGE_SIZE - offset_in_page(pos))
+ return -ENOTBLK;
+
+ trace_xfile_get_page(xf, pos, len);
+
+ pflags = memalloc_nofs_save();
+
+ /*
+ * We call write_begin directly here to avoid all the freezer
+ * protection lock-taking that happens in the normal path. shmem
+ * doesn't support fs freeze, but lockdep doesn't know that and will
+ * trip over that.
+ */
+ error = aops->write_begin(NULL, mapping, key, PAGE_SIZE, &page,
+ &fsdata);
+ if (error)
+ goto out_pflags;
+
+ /* We got the page, so make sure we push out EOF. */
+ if (i_size_read(inode) < pos + len)
+ i_size_write(inode, pos + len);
+
+ /*
+ * If the page isn't up to date, fill it with zeroes before we hand it
+ * to the caller and make sure the backing store will hold on to them.
+ */
+ if (!PageUptodate(page)) {
+ void *kaddr;
+
+ kaddr = kmap_local_page(page);
+ memset(kaddr, 0, PAGE_SIZE);
+ kunmap_local(kaddr);
+ SetPageUptodate(page);
+ }
+
+ /*
+ * Mark each page dirty so that the contents are written to some
+ * backing store when we drop this buffer, and take an extra reference
+ * to prevent the xfile page from being swapped or removed from the
+ * page cache by reclaim if the caller unlocks the page.
+ */
+ set_page_dirty(page);
+ get_page(page);
+
+ xfpage->page = page;
+ xfpage->fsdata = fsdata;
+ xfpage->pos = key;
+out_pflags:
+ memalloc_nofs_restore(pflags);
+ return error;
+}
+
+/*
+ * Release the (locked) page for a memory object. Returns 0 or a negative
+ * errno.
+ */
+int
+xfile_put_page(
+ struct xfile *xf,
+ struct xfile_page *xfpage)
+{
+ struct inode *inode = file_inode(xf->file);
+ struct address_space *mapping = inode->i_mapping;
+ const struct address_space_operations *aops = mapping->a_ops;
+ unsigned int pflags;
+ int ret;
+
+ trace_xfile_put_page(xf, xfpage->pos, PAGE_SIZE);
+
+ /* Give back the reference that we took in xfile_get_page. */
+ put_page(xfpage->page);
+
+ pflags = memalloc_nofs_save();
+ ret = aops->write_end(NULL, mapping, xfpage->pos, PAGE_SIZE, PAGE_SIZE,
+ xfpage->page, xfpage->fsdata);
+ memalloc_nofs_restore(pflags);
+ memset(xfpage, 0, sizeof(struct xfile_page));
+
+ if (ret < 0)
+ return ret;
+ if (ret != PAGE_SIZE)
+ return -EIO;
+ return 0;
+}
diff --git a/fs/xfs/scrub/xfile.h b/fs/xfs/scrub/xfile.h
new file mode 100644
index 000000000000..d56643b0f429
--- /dev/null
+++ b/fs/xfs/scrub/xfile.h
@@ -0,0 +1,77 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2018-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_XFILE_H__
+#define __XFS_SCRUB_XFILE_H__
+
+struct xfile_page {
+ struct page *page;
+ void *fsdata;
+ loff_t pos;
+};
+
+static inline bool xfile_page_cached(const struct xfile_page *xfpage)
+{
+ return xfpage->page != NULL;
+}
+
+static inline pgoff_t xfile_page_index(const struct xfile_page *xfpage)
+{
+ return xfpage->page->index;
+}
+
+struct xfile {
+ struct file *file;
+};
+
+int xfile_create(const char *description, loff_t isize, struct xfile **xfilep);
+void xfile_destroy(struct xfile *xf);
+
+ssize_t xfile_pread(struct xfile *xf, void *buf, size_t count, loff_t pos);
+ssize_t xfile_pwrite(struct xfile *xf, const void *buf, size_t count,
+ loff_t pos);
+
+/*
+ * Load an object. Since we're treating this file as "memory", any error or
+ * short IO is treated as a failure to allocate memory.
+ */
+static inline int
+xfile_obj_load(struct xfile *xf, void *buf, size_t count, loff_t pos)
+{
+ ssize_t ret = xfile_pread(xf, buf, count, pos);
+
+ if (ret < 0 || ret != count)
+ return -ENOMEM;
+ return 0;
+}
+
+/*
+ * Store an object. Since we're treating this file as "memory", any error or
+ * short IO is treated as a failure to allocate memory.
+ */
+static inline int
+xfile_obj_store(struct xfile *xf, const void *buf, size_t count, loff_t pos)
+{
+ ssize_t ret = xfile_pwrite(xf, buf, count, pos);
+
+ if (ret < 0 || ret != count)
+ return -ENOMEM;
+ return 0;
+}
+
+loff_t xfile_seek_data(struct xfile *xf, loff_t pos);
+
+struct xfile_stat {
+ loff_t size;
+ unsigned long long bytes;
+};
+
+int xfile_stat(struct xfile *xf, struct xfile_stat *statbuf);
+
+int xfile_get_page(struct xfile *xf, loff_t offset, unsigned int len,
+ struct xfile_page *xbuf);
+int xfile_put_page(struct xfile *xf, struct xfile_page *xbuf);
+
+#endif /* __XFS_SCRUB_XFILE_H__ */
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index 791db7d9c849..6b840301817a 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -233,7 +233,7 @@ xfs_acl_set_mode(
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
inode->i_mode = mode;
- inode->i_ctime = current_time(inode);
+ inode_set_ctime_current(inode);
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
if (xfs_has_wsync(mp))
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 451942fb38ec..465d7630bb21 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -478,7 +478,7 @@ xfs_discard_folio(
folio, ip->i_ino, pos);
/*
- * The end of the punch range is always the offset of the the first
+ * The end of the punch range is always the offset of the first
* byte of the next folio. Hence the end offset is only dependent on the
* folio itself and not the start offset that is passed in.
*/
@@ -578,7 +578,7 @@ const struct address_space_operations xfs_address_space_operations = {
.read_folio = xfs_vm_read_folio,
.readahead = xfs_vm_readahead,
.writepages = xfs_vm_writepages,
- .dirty_folio = filemap_dirty_folio,
+ .dirty_folio = iomap_dirty_folio,
.release_folio = iomap_release_folio,
.invalidate_folio = iomap_invalidate_folio,
.bmap = xfs_vm_bmap,
diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c
index 5db87b34fb6e..89c7a9f4f930 100644
--- a/fs/xfs/xfs_attr_inactive.c
+++ b/fs/xfs/xfs_attr_inactive.c
@@ -333,7 +333,6 @@ xfs_attr_inactive(
int error = 0;
mp = dp->i_mount;
- ASSERT(! XFS_NOT_DQATTACHED(mp, dp));
xfs_ilock(dp, lock_mode);
if (!xfs_inode_has_attr_fork(dp))
diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c
index 2788a6f2edcd..36fe2abb16e6 100644
--- a/fs/xfs/xfs_attr_item.c
+++ b/fs/xfs/xfs_attr_item.c
@@ -547,7 +547,7 @@ xfs_attri_item_recover(
struct xfs_inode *ip;
struct xfs_da_args *args;
struct xfs_trans *tp;
- struct xfs_trans_res tres;
+ struct xfs_trans_res resv;
struct xfs_attri_log_format *attrp;
struct xfs_attri_log_nameval *nv = attrip->attri_nameval;
int error;
@@ -618,8 +618,9 @@ xfs_attri_item_recover(
goto out;
}
- xfs_init_attr_trans(args, &tres, &total);
- error = xfs_trans_alloc(mp, &tres, total, 0, XFS_TRANS_RESERVE, &tp);
+ xfs_init_attr_trans(args, &resv, &total);
+ resv = xlog_recover_resv(&resv);
+ error = xfs_trans_alloc(mp, &resv, total, 0, XFS_TRANS_RESERVE, &tp);
if (error)
goto out;
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index 7551c3ec4ea5..e736a0844c89 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -490,6 +490,7 @@ xfs_bui_item_recover(
struct list_head *capture_list)
{
struct xfs_bmap_intent fake = { };
+ struct xfs_trans_res resv;
struct xfs_bui_log_item *buip = BUI_ITEM(lip);
struct xfs_trans *tp;
struct xfs_inode *ip = NULL;
@@ -515,7 +516,8 @@ xfs_bui_item_recover(
return error;
/* Allocate transaction and do the work. */
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate,
+ resv = xlog_recover_resv(&M_RES(mp)->tr_itruncate);
+ error = xfs_trans_alloc(mp, &resv,
XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK), 0, 0, &tp);
if (error)
goto err_rele;
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index fbb675563208..fcefab687285 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -1644,6 +1644,7 @@ xfs_swap_extents(
uint64_t f;
int resblks = 0;
unsigned int flags = 0;
+ struct timespec64 ctime;
/*
* Lock the inodes against other IO, page faults and truncate to
@@ -1756,8 +1757,9 @@ xfs_swap_extents(
* process that the file was not changed out from
* under it.
*/
- if ((sbp->bs_ctime.tv_sec != VFS_I(ip)->i_ctime.tv_sec) ||
- (sbp->bs_ctime.tv_nsec != VFS_I(ip)->i_ctime.tv_nsec) ||
+ ctime = inode_get_ctime(VFS_I(ip));
+ if ((sbp->bs_ctime.tv_sec != ctime.tv_sec) ||
+ (sbp->bs_ctime.tv_nsec != ctime.tv_nsec) ||
(sbp->bs_mtime.tv_sec != VFS_I(ip)->i_mtime.tv_sec) ||
(sbp->bs_mtime.tv_nsec != VFS_I(ip)->i_mtime.tv_nsec)) {
error = -EBUSY;
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 15d1e5a7c2d3..c1ece4a08ff4 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -481,7 +481,8 @@ _xfs_buf_obj_cmp(
* reallocating a busy extent. Skip this buffer and
* continue searching for an exact match.
*/
- ASSERT(bp->b_flags & XBF_STALE);
+ if (!(map->bm_flags & XBM_LIVESCAN))
+ ASSERT(bp->b_flags & XBF_STALE);
return 1;
}
return 0;
@@ -559,6 +560,10 @@ xfs_buf_find_lock(
* intact here.
*/
if (bp->b_flags & XBF_STALE) {
+ if (flags & XBF_LIVESCAN) {
+ xfs_buf_unlock(bp);
+ return -ENOENT;
+ }
ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
bp->b_flags &= _XBF_KMEM | _XBF_PAGES;
bp->b_ops = NULL;
@@ -682,6 +687,8 @@ xfs_buf_get_map(
int error;
int i;
+ if (flags & XBF_LIVESCAN)
+ cmap.bm_flags |= XBM_LIVESCAN;
for (i = 0; i < nmaps; i++)
cmap.bm_len += map[i].bm_len;
@@ -1938,14 +1945,17 @@ void
xfs_free_buftarg(
struct xfs_buftarg *btp)
{
+ struct block_device *bdev = btp->bt_bdev;
+
unregister_shrinker(&btp->bt_shrinker);
ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0);
percpu_counter_destroy(&btp->bt_io_count);
list_lru_destroy(&btp->bt_lru);
- blkdev_issue_flush(btp->bt_bdev);
- invalidate_bdev(btp->bt_bdev);
fs_put_dax(btp->bt_daxdev, btp->bt_mount);
+ /* the main block device is closed by kill_block_super */
+ if (bdev != btp->bt_mount->m_super->s_bdev)
+ blkdev_put(bdev, btp->bt_mount->m_super);
kmem_free(btp);
}
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 549c60942208..df8f47953bb4 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -44,6 +44,11 @@ struct xfs_buf;
#define _XBF_DELWRI_Q (1u << 22)/* buffer on a delwri queue */
/* flags used only as arguments to access routines */
+/*
+ * Online fsck is scanning the buffer cache for live buffers. Do not warn
+ * about length mismatches during lookups and do not return stale buffers.
+ */
+#define XBF_LIVESCAN (1u << 28)
#define XBF_INCORE (1u << 29)/* lookup only, return if found in cache */
#define XBF_TRYLOCK (1u << 30)/* lock requested, but do not wait */
#define XBF_UNMAPPED (1u << 31)/* do not map the buffer */
@@ -67,6 +72,7 @@ typedef unsigned int xfs_buf_flags_t;
{ _XBF_KMEM, "KMEM" }, \
{ _XBF_DELWRI_Q, "DELWRI_Q" }, \
/* The following interface flags should never be set */ \
+ { XBF_LIVESCAN, "LIVESCAN" }, \
{ XBF_INCORE, "INCORE" }, \
{ XBF_TRYLOCK, "TRYLOCK" }, \
{ XBF_UNMAPPED, "UNMAPPED" }
@@ -114,8 +120,15 @@ typedef struct xfs_buftarg {
struct xfs_buf_map {
xfs_daddr_t bm_bn; /* block number for I/O */
int bm_len; /* size of I/O */
+ unsigned int bm_flags;
};
+/*
+ * Online fsck is scanning the buffer cache for live buffers. Do not warn
+ * about length mismatches during lookups and do not return stale buffers.
+ */
+#define XBM_LIVESCAN (1U << 0)
+
#define DEFINE_SINGLE_BUF_MAP(map, blkno, numblk) \
struct xfs_buf_map (map) = { .bm_bn = (blkno), .bm_len = (numblk) };
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
index afc4c78b9eed..d5787991bb5b 100644
--- a/fs/xfs/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * Copyright (C) 2010 Red Hat, Inc.
+ * Copyright (C) 2010, 2023 Red Hat, Inc.
* All Rights Reserved.
*/
#include "xfs.h"
@@ -19,21 +19,147 @@
#include "xfs_log.h"
#include "xfs_ag.h"
-STATIC int
-xfs_trim_extents(
+/*
+ * Notes on an efficient, low latency fstrim algorithm
+ *
+ * We need to walk the filesystem free space and issue discards on the free
+ * space that meet the search criteria (size and location). We cannot issue
+ * discards on extents that might be in use, or are so recently in use they are
+ * still marked as busy. To serialise against extent state changes whilst we are
+ * gathering extents to trim, we must hold the AGF lock to lock out other
+ * allocations and extent free operations that might change extent state.
+ *
+ * However, we cannot just hold the AGF for the entire AG free space walk whilst
+ * we issue discards on each free space that is found. Storage devices can have
+ * extremely slow discard implementations (e.g. ceph RBD) and so walking a
+ * couple of million free extents and issuing synchronous discards on each
+ * extent can take a *long* time. Whilst we are doing this walk, nothing else
+ * can access the AGF, and we can stall transactions and hence the log whilst
+ * modifications wait for the AGF lock to be released. This can lead hung tasks
+ * kicking the hung task timer and rebooting the system. This is bad.
+ *
+ * Hence we need to take a leaf from the bulkstat playbook. It takes the AGI
+ * lock, gathers a range of inode cluster buffers that are allocated, drops the
+ * AGI lock and then reads all the inode cluster buffers and processes them. It
+ * loops doing this, using a cursor to keep track of where it is up to in the AG
+ * for each iteration to restart the INOBT lookup from.
+ *
+ * We can't do this exactly with free space - once we drop the AGF lock, the
+ * state of the free extent is out of our control and we cannot run a discard
+ * safely on it in this situation. Unless, of course, we've marked the free
+ * extent as busy and undergoing a discard operation whilst we held the AGF
+ * locked.
+ *
+ * This is exactly how online discard works - free extents are marked busy when
+ * they are freed, and once the extent free has been committed to the journal,
+ * the busy extent record is marked as "undergoing discard" and the discard is
+ * then issued on the free extent. Once the discard completes, the busy extent
+ * record is removed and the extent is able to be allocated again.
+ *
+ * In the context of fstrim, if we find a free extent we need to discard, we
+ * don't have to discard it immediately. All we need to do it record that free
+ * extent as being busy and under discard, and all the allocation routines will
+ * now avoid trying to allocate it. Hence if we mark the extent as busy under
+ * the AGF lock, we can safely discard it without holding the AGF lock because
+ * nothing will attempt to allocate that free space until the discard completes.
+ *
+ * This also allows us to issue discards asynchronously like we do with online
+ * discard, and so for fast devices fstrim will run much faster as we can have
+ * multiple discard operations in flight at once, as well as pipeline the free
+ * extent search so that it overlaps in flight discard IO.
+ */
+
+struct workqueue_struct *xfs_discard_wq;
+
+static void
+xfs_discard_endio_work(
+ struct work_struct *work)
+{
+ struct xfs_busy_extents *extents =
+ container_of(work, struct xfs_busy_extents, endio_work);
+
+ xfs_extent_busy_clear(extents->mount, &extents->extent_list, false);
+ kmem_free(extents->owner);
+}
+
+/*
+ * Queue up the actual completion to a thread to avoid IRQ-safe locking for
+ * pagb_lock.
+ */
+static void
+xfs_discard_endio(
+ struct bio *bio)
+{
+ struct xfs_busy_extents *extents = bio->bi_private;
+
+ INIT_WORK(&extents->endio_work, xfs_discard_endio_work);
+ queue_work(xfs_discard_wq, &extents->endio_work);
+ bio_put(bio);
+}
+
+/*
+ * Walk the discard list and issue discards on all the busy extents in the
+ * list. We plug and chain the bios so that we only need a single completion
+ * call to clear all the busy extents once the discards are complete.
+ */
+int
+xfs_discard_extents(
+ struct xfs_mount *mp,
+ struct xfs_busy_extents *extents)
+{
+ struct xfs_extent_busy *busyp;
+ struct bio *bio = NULL;
+ struct blk_plug plug;
+ int error = 0;
+
+ blk_start_plug(&plug);
+ list_for_each_entry(busyp, &extents->extent_list, list) {
+ trace_xfs_discard_extent(mp, busyp->agno, busyp->bno,
+ busyp->length);
+
+ error = __blkdev_issue_discard(mp->m_ddev_targp->bt_bdev,
+ XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno),
+ XFS_FSB_TO_BB(mp, busyp->length),
+ GFP_NOFS, &bio);
+ if (error && error != -EOPNOTSUPP) {
+ xfs_info(mp,
+ "discard failed for extent [0x%llx,%u], error %d",
+ (unsigned long long)busyp->bno,
+ busyp->length,
+ error);
+ break;
+ }
+ }
+
+ if (bio) {
+ bio->bi_private = extents;
+ bio->bi_end_io = xfs_discard_endio;
+ submit_bio(bio);
+ } else {
+ xfs_discard_endio_work(&extents->endio_work);
+ }
+ blk_finish_plug(&plug);
+
+ return error;
+}
+
+
+static int
+xfs_trim_gather_extents(
struct xfs_perag *pag,
xfs_daddr_t start,
xfs_daddr_t end,
xfs_daddr_t minlen,
+ struct xfs_alloc_rec_incore *tcur,
+ struct xfs_busy_extents *extents,
uint64_t *blocks_trimmed)
{
struct xfs_mount *mp = pag->pag_mount;
- struct block_device *bdev = mp->m_ddev_targp->bt_bdev;
struct xfs_btree_cur *cur;
struct xfs_buf *agbp;
- struct xfs_agf *agf;
int error;
int i;
+ int batch = 100;
/*
* Force out the log. This means any transactions that might have freed
@@ -45,20 +171,28 @@ xfs_trim_extents(
error = xfs_alloc_read_agf(pag, NULL, 0, &agbp);
if (error)
return error;
- agf = agbp->b_addr;
cur = xfs_allocbt_init_cursor(mp, NULL, agbp, pag, XFS_BTNUM_CNT);
/*
- * Look up the longest btree in the AGF and start with it.
+ * Look up the extent length requested in the AGF and start with it.
*/
- error = xfs_alloc_lookup_ge(cur, 0, be32_to_cpu(agf->agf_longest), &i);
+ if (tcur->ar_startblock == NULLAGBLOCK)
+ error = xfs_alloc_lookup_ge(cur, 0, tcur->ar_blockcount, &i);
+ else
+ error = xfs_alloc_lookup_le(cur, tcur->ar_startblock,
+ tcur->ar_blockcount, &i);
if (error)
goto out_del_cursor;
+ if (i == 0) {
+ /* nothing of that length left in the AG, we are done */
+ tcur->ar_blockcount = 0;
+ goto out_del_cursor;
+ }
/*
* Loop until we are done with all extents that are large
- * enough to be worth discarding.
+ * enough to be worth discarding or we hit batch limits.
*/
while (i) {
xfs_agblock_t fbno;
@@ -73,7 +207,16 @@ xfs_trim_extents(
error = -EFSCORRUPTED;
break;
}
- ASSERT(flen <= be32_to_cpu(agf->agf_longest));
+
+ if (--batch <= 0) {
+ /*
+ * Update the cursor to point at this extent so we
+ * restart the next batch from this extent.
+ */
+ tcur->ar_startblock = fbno;
+ tcur->ar_blockcount = flen;
+ break;
+ }
/*
* use daddr format for all range/len calculations as that is
@@ -88,6 +231,7 @@ xfs_trim_extents(
*/
if (dlen < minlen) {
trace_xfs_discard_toosmall(mp, pag->pag_agno, fbno, flen);
+ tcur->ar_blockcount = 0;
break;
}
@@ -110,29 +254,103 @@ xfs_trim_extents(
goto next_extent;
}
- trace_xfs_discard_extent(mp, pag->pag_agno, fbno, flen);
- error = blkdev_issue_discard(bdev, dbno, dlen, GFP_NOFS);
- if (error)
- break;
+ xfs_extent_busy_insert_discard(pag, fbno, flen,
+ &extents->extent_list);
*blocks_trimmed += flen;
-
next_extent:
error = xfs_btree_decrement(cur, 0, &i);
if (error)
break;
- if (fatal_signal_pending(current)) {
- error = -ERESTARTSYS;
- break;
- }
+ /*
+ * If there's no more records in the tree, we are done. Set the
+ * cursor block count to 0 to indicate to the caller that there
+ * is no more extents to search.
+ */
+ if (i == 0)
+ tcur->ar_blockcount = 0;
}
+ /*
+ * If there was an error, release all the gathered busy extents because
+ * we aren't going to issue a discard on them any more.
+ */
+ if (error)
+ xfs_extent_busy_clear(mp, &extents->extent_list, false);
out_del_cursor:
xfs_btree_del_cursor(cur, error);
xfs_buf_relse(agbp);
return error;
}
+static bool
+xfs_trim_should_stop(void)
+{
+ return fatal_signal_pending(current) || freezing(current);
+}
+
+/*
+ * Iterate the free list gathering extents and discarding them. We need a cursor
+ * for the repeated iteration of gather/discard loop, so use the longest extent
+ * we found in the last batch as the key to start the next.
+ */
+static int
+xfs_trim_extents(
+ struct xfs_perag *pag,
+ xfs_daddr_t start,
+ xfs_daddr_t end,
+ xfs_daddr_t minlen,
+ uint64_t *blocks_trimmed)
+{
+ struct xfs_alloc_rec_incore tcur = {
+ .ar_blockcount = pag->pagf_longest,
+ .ar_startblock = NULLAGBLOCK,
+ };
+ int error = 0;
+
+ do {
+ struct xfs_busy_extents *extents;
+
+ extents = kzalloc(sizeof(*extents), GFP_KERNEL);
+ if (!extents) {
+ error = -ENOMEM;
+ break;
+ }
+
+ extents->mount = pag->pag_mount;
+ extents->owner = extents;
+ INIT_LIST_HEAD(&extents->extent_list);
+
+ error = xfs_trim_gather_extents(pag, start, end, minlen,
+ &tcur, extents, blocks_trimmed);
+ if (error) {
+ kfree(extents);
+ break;
+ }
+
+ /*
+ * We hand the extent list to the discard function here so the
+ * discarded extents can be removed from the busy extent list.
+ * This allows the discards to run asynchronously with gathering
+ * the next round of extents to discard.
+ *
+ * However, we must ensure that we do not reference the extent
+ * list after this function call, as it may have been freed by
+ * the time control returns to us.
+ */
+ error = xfs_discard_extents(pag->pag_mount, extents);
+ if (error)
+ break;
+
+ if (xfs_trim_should_stop())
+ break;
+
+ } while (tcur.ar_blockcount != 0);
+
+ return error;
+
+}
+
/*
* trim a range of the filesystem.
*
@@ -195,12 +413,12 @@ xfs_ioc_trim(
for_each_perag_range(mp, agno, xfs_daddr_to_agno(mp, end), pag) {
error = xfs_trim_extents(pag, start, end, minlen,
&blocks_trimmed);
- if (error) {
+ if (error)
last_error = error;
- if (error == -ERESTARTSYS) {
- xfs_perag_rele(pag);
- break;
- }
+
+ if (xfs_trim_should_stop()) {
+ xfs_perag_rele(pag);
+ break;
}
}
diff --git a/fs/xfs/xfs_discard.h b/fs/xfs/xfs_discard.h
index de92d9cc958f..2b1a85223a56 100644
--- a/fs/xfs/xfs_discard.h
+++ b/fs/xfs/xfs_discard.h
@@ -3,8 +3,10 @@
#define XFS_DISCARD_H 1
struct fstrim_range;
-struct list_head;
+struct xfs_mount;
+struct xfs_busy_extents;
-extern int xfs_ioc_trim(struct xfs_mount *, struct fstrim_range __user *);
+int xfs_discard_extents(struct xfs_mount *mp, struct xfs_busy_extents *busy);
+int xfs_ioc_trim(struct xfs_mount *mp, struct fstrim_range __user *fstrim);
#endif /* XFS_DISCARD_H */
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 7f071757f278..ac6ba646624d 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -1386,7 +1386,7 @@ xfs_qm_dqiterate(
return error;
error = iter_fn(dq, type, priv);
- id = dq->q_id;
+ id = dq->q_id + 1;
xfs_qm_dqput(dq);
} while (error == 0 && id != 0);
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c
index 1064c2342876..7cd09c3a82cb 100644
--- a/fs/xfs/xfs_export.c
+++ b/fs/xfs/xfs_export.c
@@ -146,6 +146,20 @@ xfs_nfs_get_inode(
return ERR_PTR(error);
}
+ /*
+ * Reload the incore unlinked list to avoid failure in inodegc.
+ * Use an unlocked check here because unrecovered unlinked inodes
+ * should be somewhat rare.
+ */
+ if (xfs_inode_unlinked_incomplete(ip)) {
+ error = xfs_inode_reload_unlinked(ip);
+ if (error) {
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+ xfs_irele(ip);
+ return ERR_PTR(error);
+ }
+ }
+
if (VFS_I(ip)->i_generation != generation) {
xfs_irele(ip);
return ERR_PTR(-ESTALE);
diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c
index 7c2fdc71e42d..9ecfdcdc752f 100644
--- a/fs/xfs/xfs_extent_busy.c
+++ b/fs/xfs/xfs_extent_busy.c
@@ -19,13 +19,13 @@
#include "xfs_log.h"
#include "xfs_ag.h"
-void
-xfs_extent_busy_insert(
- struct xfs_trans *tp,
+static void
+xfs_extent_busy_insert_list(
struct xfs_perag *pag,
xfs_agblock_t bno,
xfs_extlen_t len,
- unsigned int flags)
+ unsigned int flags,
+ struct list_head *busy_list)
{
struct xfs_extent_busy *new;
struct xfs_extent_busy *busyp;
@@ -40,7 +40,7 @@ xfs_extent_busy_insert(
new->flags = flags;
/* trace before insert to be able to see failed inserts */
- trace_xfs_extent_busy(tp->t_mountp, pag->pag_agno, bno, len);
+ trace_xfs_extent_busy(pag->pag_mount, pag->pag_agno, bno, len);
spin_lock(&pag->pagb_lock);
rbp = &pag->pagb_tree.rb_node;
@@ -62,10 +62,33 @@ xfs_extent_busy_insert(
rb_link_node(&new->rb_node, parent, rbp);
rb_insert_color(&new->rb_node, &pag->pagb_tree);
- list_add(&new->list, &tp->t_busy);
+ /* always process discard lists in fifo order */
+ list_add_tail(&new->list, busy_list);
spin_unlock(&pag->pagb_lock);
}
+void
+xfs_extent_busy_insert(
+ struct xfs_trans *tp,
+ struct xfs_perag *pag,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ unsigned int flags)
+{
+ xfs_extent_busy_insert_list(pag, bno, len, flags, &tp->t_busy);
+}
+
+void
+xfs_extent_busy_insert_discard(
+ struct xfs_perag *pag,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ struct list_head *busy_list)
+{
+ xfs_extent_busy_insert_list(pag, bno, len, XFS_EXTENT_BUSY_DISCARDED,
+ busy_list);
+}
+
/*
* Search for a busy extent within the range of the extent we are about to
* allocate. You need to be holding the busy extent tree lock when calling
diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h
index c37bf87e6781..0639aab336f3 100644
--- a/fs/xfs/xfs_extent_busy.h
+++ b/fs/xfs/xfs_extent_busy.h
@@ -16,9 +16,6 @@ struct xfs_alloc_arg;
/*
* Busy block/extent entry. Indexed by a rbtree in perag to mark blocks that
* have been freed but whose transactions aren't committed to disk yet.
- *
- * Note that we use the transaction ID to record the transaction, not the
- * transaction structure itself. See xfs_extent_busy_insert() for details.
*/
struct xfs_extent_busy {
struct rb_node rb_node; /* ag by-bno indexed search tree */
@@ -31,11 +28,32 @@ struct xfs_extent_busy {
#define XFS_EXTENT_BUSY_SKIP_DISCARD 0x02 /* do not discard */
};
+/*
+ * List used to track groups of related busy extents all the way through
+ * to discard completion.
+ */
+struct xfs_busy_extents {
+ struct xfs_mount *mount;
+ struct list_head extent_list;
+ struct work_struct endio_work;
+
+ /*
+ * Owner is the object containing the struct xfs_busy_extents to free
+ * once the busy extents have been processed. If only the
+ * xfs_busy_extents object needs freeing, then point this at itself.
+ */
+ void *owner;
+};
+
void
xfs_extent_busy_insert(struct xfs_trans *tp, struct xfs_perag *pag,
xfs_agblock_t bno, xfs_extlen_t len, unsigned int flags);
void
+xfs_extent_busy_insert_discard(struct xfs_perag *pag, xfs_agblock_t bno,
+ xfs_extlen_t len, struct list_head *busy_list);
+
+void
xfs_extent_busy_clear(struct xfs_mount *mp, struct list_head *list,
bool do_discard);
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index f1a5ecf099aa..3fa8789820ad 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -660,6 +660,7 @@ xfs_efi_item_recover(
struct xfs_log_item *lip,
struct list_head *capture_list)
{
+ struct xfs_trans_res resv;
struct xfs_efi_log_item *efip = EFI_ITEM(lip);
struct xfs_mount *mp = lip->li_log->l_mp;
struct xfs_efd_log_item *efdp;
@@ -683,7 +684,8 @@ xfs_efi_item_recover(
}
}
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
+ resv = xlog_recover_resv(&M_RES(mp)->tr_itruncate);
+ error = xfs_trans_alloc(mp, &resv, 0, 0, 0, &tp);
if (error)
return error;
efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents);
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 4f502219ae4f..203700278ddb 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1287,11 +1287,11 @@ xfs_file_llseek(
static inline vm_fault_t
xfs_dax_fault(
struct vm_fault *vmf,
- enum page_entry_size pe_size,
+ unsigned int order,
bool write_fault,
pfn_t *pfn)
{
- return dax_iomap_fault(vmf, pe_size, pfn, NULL,
+ return dax_iomap_fault(vmf, order, pfn, NULL,
(write_fault && !vmf->cow_page) ?
&xfs_dax_write_iomap_ops :
&xfs_read_iomap_ops);
@@ -1300,7 +1300,7 @@ xfs_dax_fault(
static inline vm_fault_t
xfs_dax_fault(
struct vm_fault *vmf,
- enum page_entry_size pe_size,
+ unsigned int order,
bool write_fault,
pfn_t *pfn)
{
@@ -1322,14 +1322,14 @@ xfs_dax_fault(
static vm_fault_t
__xfs_filemap_fault(
struct vm_fault *vmf,
- enum page_entry_size pe_size,
+ unsigned int order,
bool write_fault)
{
struct inode *inode = file_inode(vmf->vma->vm_file);
struct xfs_inode *ip = XFS_I(inode);
vm_fault_t ret;
- trace_xfs_filemap_fault(ip, pe_size, write_fault);
+ trace_xfs_filemap_fault(ip, order, write_fault);
if (write_fault) {
sb_start_pagefault(inode->i_sb);
@@ -1340,9 +1340,9 @@ __xfs_filemap_fault(
pfn_t pfn;
xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
- ret = xfs_dax_fault(vmf, pe_size, write_fault, &pfn);
+ ret = xfs_dax_fault(vmf, order, write_fault, &pfn);
if (ret & VM_FAULT_NEEDDSYNC)
- ret = dax_finish_sync_fault(vmf, pe_size, pfn);
+ ret = dax_finish_sync_fault(vmf, order, pfn);
xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
} else {
if (write_fault) {
@@ -1373,7 +1373,7 @@ xfs_filemap_fault(
struct vm_fault *vmf)
{
/* DAX can shortcut the normal fault path on write faults! */
- return __xfs_filemap_fault(vmf, PE_SIZE_PTE,
+ return __xfs_filemap_fault(vmf, 0,
IS_DAX(file_inode(vmf->vma->vm_file)) &&
xfs_is_write_fault(vmf));
}
@@ -1381,13 +1381,13 @@ xfs_filemap_fault(
static vm_fault_t
xfs_filemap_huge_fault(
struct vm_fault *vmf,
- enum page_entry_size pe_size)
+ unsigned int order)
{
if (!IS_DAX(file_inode(vmf->vma->vm_file)))
return VM_FAULT_FALLBACK;
/* DAX can shortcut the normal fault path on write faults! */
- return __xfs_filemap_fault(vmf, pe_size,
+ return __xfs_filemap_fault(vmf, order,
xfs_is_write_fault(vmf));
}
@@ -1395,7 +1395,7 @@ static vm_fault_t
xfs_filemap_page_mkwrite(
struct vm_fault *vmf)
{
- return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true);
+ return __xfs_filemap_fault(vmf, 0, true);
}
/*
@@ -1408,7 +1408,7 @@ xfs_filemap_pfn_mkwrite(
struct vm_fault *vmf)
{
- return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true);
+ return __xfs_filemap_fault(vmf, 0, true);
}
static const struct vm_operations_struct xfs_file_vm_ops = {
diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c
index 10403ba9b58f..736e5545f584 100644
--- a/fs/xfs/xfs_fsmap.c
+++ b/fs/xfs/xfs_fsmap.c
@@ -565,6 +565,19 @@ err:
}
#endif /* CONFIG_XFS_RT */
+static inline bool
+rmap_not_shareable(struct xfs_mount *mp, const struct xfs_rmap_irec *r)
+{
+ if (!xfs_has_reflink(mp))
+ return true;
+ if (XFS_RMAP_NON_INODE_OWNER(r->rm_owner))
+ return true;
+ if (r->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK |
+ XFS_RMAP_UNWRITTEN))
+ return true;
+ return false;
+}
+
/* Execute a getfsmap query against the regular data device. */
STATIC int
__xfs_getfsmap_datadev(
@@ -598,7 +611,6 @@ __xfs_getfsmap_datadev(
* low to the fsmap low key and max out the high key to the end
* of the AG.
*/
- info->low.rm_startblock = XFS_FSB_TO_AGBNO(mp, start_fsb);
info->low.rm_offset = XFS_BB_TO_FSBT(mp, keys[0].fmr_offset);
error = xfs_fsmap_owner_to_rmap(&info->low, &keys[0]);
if (error)
@@ -608,12 +620,9 @@ __xfs_getfsmap_datadev(
/* Adjust the low key if we are continuing from where we left off. */
if (info->low.rm_blockcount == 0) {
- /* empty */
- } else if (XFS_RMAP_NON_INODE_OWNER(info->low.rm_owner) ||
- (info->low.rm_flags & (XFS_RMAP_ATTR_FORK |
- XFS_RMAP_BMBT_BLOCK |
- XFS_RMAP_UNWRITTEN))) {
- info->low.rm_startblock += info->low.rm_blockcount;
+ /* No previous record from which to continue */
+ } else if (rmap_not_shareable(mp, &info->low)) {
+ /* Last record seen was an unshareable extent */
info->low.rm_owner = 0;
info->low.rm_offset = 0;
@@ -621,8 +630,10 @@ __xfs_getfsmap_datadev(
if (XFS_FSB_TO_DADDR(mp, start_fsb) >= eofs)
return 0;
} else {
+ /* Last record seen was a shareable file data extent */
info->low.rm_offset += info->low.rm_blockcount;
}
+ info->low.rm_startblock = XFS_FSB_TO_AGBNO(mp, start_fsb);
info->high.rm_startblock = -1U;
info->high.rm_owner = ULLONG_MAX;
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 453890942d9f..3c210ac83713 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -113,7 +113,7 @@ xfs_inode_alloc(
INIT_LIST_HEAD(&ip->i_ioend_list);
spin_lock_init(&ip->i_ioend_lock);
ip->i_next_unlinked = NULLAGINO;
- ip->i_prev_unlinked = NULLAGINO;
+ ip->i_prev_unlinked = 0;
return ip;
}
@@ -443,7 +443,7 @@ xfs_inodegc_queue_all(
int cpu;
bool ret = false;
- for_each_online_cpu(cpu) {
+ for_each_cpu(cpu, &mp->m_inodegc_cpumask) {
gc = per_cpu_ptr(mp->m_inodegc, cpu);
if (!llist_empty(&gc->list)) {
mod_delayed_work_on(cpu, mp->m_inodegc_wq, &gc->work, 0);
@@ -463,7 +463,7 @@ xfs_inodegc_wait_all(
int error = 0;
flush_workqueue(mp->m_inodegc_wq);
- for_each_online_cpu(cpu) {
+ for_each_cpu(cpu, &mp->m_inodegc_cpumask) {
struct xfs_inodegc *gc;
gc = per_cpu_ptr(mp->m_inodegc, cpu);
@@ -803,44 +803,6 @@ out_error_or_again:
}
/*
- * "Is this a cached inode that's also allocated?"
- *
- * Look up an inode by number in the given file system. If the inode is
- * in cache and isn't in purgatory, return 1 if the inode is allocated
- * and 0 if it is not. For all other cases (not in cache, being torn
- * down, etc.), return a negative error code.
- *
- * The caller has to prevent inode allocation and freeing activity,
- * presumably by locking the AGI buffer. This is to ensure that an
- * inode cannot transition from allocated to freed until the caller is
- * ready to allow that. If the inode is in an intermediate state (new,
- * reclaimable, or being reclaimed), -EAGAIN will be returned; if the
- * inode is not in the cache, -ENOENT will be returned. The caller must
- * deal with these scenarios appropriately.
- *
- * This is a specialized use case for the online scrubber; if you're
- * reading this, you probably want xfs_iget.
- */
-int
-xfs_icache_inode_is_allocated(
- struct xfs_mount *mp,
- struct xfs_trans *tp,
- xfs_ino_t ino,
- bool *inuse)
-{
- struct xfs_inode *ip;
- int error;
-
- error = xfs_iget(mp, tp, ino, XFS_IGET_INCORE, 0, &ip);
- if (error)
- return error;
-
- *inuse = !!(VFS_I(ip)->i_mode);
- xfs_irele(ip);
- return 0;
-}
-
-/*
* Grab the inode for reclaim exclusively.
*
* We have found this inode via a lookup under RCU, so the inode may have
@@ -1883,9 +1845,17 @@ xfs_inodegc_worker(
struct xfs_inodegc, work);
struct llist_node *node = llist_del_all(&gc->list);
struct xfs_inode *ip, *n;
+ struct xfs_mount *mp = gc->mp;
unsigned int nofs_flag;
- ASSERT(gc->cpu == smp_processor_id());
+ /*
+ * Clear the cpu mask bit and ensure that we have seen the latest
+ * update of the gc structure associated with this CPU. This matches
+ * with the release semantics used when setting the cpumask bit in
+ * xfs_inodegc_queue.
+ */
+ cpumask_clear_cpu(gc->cpu, &mp->m_inodegc_cpumask);
+ smp_mb__after_atomic();
WRITE_ONCE(gc->items, 0);
@@ -1900,7 +1870,7 @@ xfs_inodegc_worker(
nofs_flag = memalloc_nofs_save();
ip = llist_entry(node, struct xfs_inode, i_gclist);
- trace_xfs_inodegc_worker(ip->i_mount, READ_ONCE(gc->shrinker_hits));
+ trace_xfs_inodegc_worker(mp, READ_ONCE(gc->shrinker_hits));
WRITE_ONCE(gc->shrinker_hits, 0);
llist_for_each_entry_safe(ip, n, node, i_gclist) {
@@ -2095,6 +2065,7 @@ xfs_inodegc_queue(
struct xfs_inodegc *gc;
int items;
unsigned int shrinker_hits;
+ unsigned int cpu_nr;
unsigned long queue_delay = 1;
trace_xfs_inode_set_need_inactive(ip);
@@ -2102,18 +2073,28 @@ xfs_inodegc_queue(
ip->i_flags |= XFS_NEED_INACTIVE;
spin_unlock(&ip->i_flags_lock);
- gc = get_cpu_ptr(mp->m_inodegc);
+ cpu_nr = get_cpu();
+ gc = this_cpu_ptr(mp->m_inodegc);
llist_add(&ip->i_gclist, &gc->list);
items = READ_ONCE(gc->items);
WRITE_ONCE(gc->items, items + 1);
shrinker_hits = READ_ONCE(gc->shrinker_hits);
/*
+ * Ensure the list add is always seen by anyone who finds the cpumask
+ * bit set. This effectively gives the cpumask bit set operation
+ * release ordering semantics.
+ */
+ smp_mb__before_atomic();
+ if (!cpumask_test_cpu(cpu_nr, &mp->m_inodegc_cpumask))
+ cpumask_test_and_set_cpu(cpu_nr, &mp->m_inodegc_cpumask);
+
+ /*
* We queue the work while holding the current CPU so that the work
* is scheduled to run on this CPU.
*/
if (!xfs_is_inodegc_enabled(mp)) {
- put_cpu_ptr(gc);
+ put_cpu();
return;
}
@@ -2123,7 +2104,7 @@ xfs_inodegc_queue(
trace_xfs_inodegc_queue(mp, __return_address);
mod_delayed_work_on(current_cpu(), mp->m_inodegc_wq, &gc->work,
queue_delay);
- put_cpu_ptr(gc);
+ put_cpu();
if (xfs_inodegc_want_flush_work(ip, items, shrinker_hits)) {
trace_xfs_inodegc_throttle(mp, __return_address);
@@ -2132,47 +2113,6 @@ xfs_inodegc_queue(
}
/*
- * Fold the dead CPU inodegc queue into the current CPUs queue.
- */
-void
-xfs_inodegc_cpu_dead(
- struct xfs_mount *mp,
- unsigned int dead_cpu)
-{
- struct xfs_inodegc *dead_gc, *gc;
- struct llist_node *first, *last;
- unsigned int count = 0;
-
- dead_gc = per_cpu_ptr(mp->m_inodegc, dead_cpu);
- cancel_delayed_work_sync(&dead_gc->work);
-
- if (llist_empty(&dead_gc->list))
- return;
-
- first = dead_gc->list.first;
- last = first;
- while (last->next) {
- last = last->next;
- count++;
- }
- dead_gc->list.first = NULL;
- dead_gc->items = 0;
-
- /* Add pending work to current CPU */
- gc = get_cpu_ptr(mp->m_inodegc);
- llist_add_batch(first, last, &gc->list);
- count += READ_ONCE(gc->items);
- WRITE_ONCE(gc->items, count);
-
- if (xfs_is_inodegc_enabled(mp)) {
- trace_xfs_inodegc_queue(mp, __return_address);
- mod_delayed_work_on(current_cpu(), mp->m_inodegc_wq, &gc->work,
- 0);
- }
- put_cpu_ptr(gc);
-}
-
-/*
* We set the inode flag atomically with the radix tree tag. Once we get tag
* lookups on the radix tree, this inode flag can go away.
*
@@ -2233,7 +2173,7 @@ xfs_inodegc_shrinker_count(
if (!xfs_is_inodegc_enabled(mp))
return 0;
- for_each_online_cpu(cpu) {
+ for_each_cpu(cpu, &mp->m_inodegc_cpumask) {
gc = per_cpu_ptr(mp->m_inodegc, cpu);
if (!llist_empty(&gc->list))
return XFS_INODEGC_SHRINKER_COUNT;
@@ -2258,7 +2198,7 @@ xfs_inodegc_shrinker_scan(
trace_xfs_inodegc_shrinker_scan(mp, sc, __return_address);
- for_each_online_cpu(cpu) {
+ for_each_cpu(cpu, &mp->m_inodegc_cpumask) {
gc = per_cpu_ptr(mp->m_inodegc, cpu);
if (!llist_empty(&gc->list)) {
unsigned int h = READ_ONCE(gc->shrinker_hits);
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
index 1dcdcb23796e..905944dafbe5 100644
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@@ -71,10 +71,6 @@ void xfs_inode_set_cowblocks_tag(struct xfs_inode *ip);
void xfs_inode_clear_cowblocks_tag(struct xfs_inode *ip);
void xfs_blockgc_worker(struct work_struct *work);
-
-int xfs_icache_inode_is_allocated(struct xfs_mount *mp, struct xfs_trans *tp,
- xfs_ino_t ino, bool *inuse);
-
void xfs_blockgc_stop(struct xfs_mount *mp);
void xfs_blockgc_start(struct xfs_mount *mp);
@@ -83,7 +79,6 @@ void xfs_inodegc_push(struct xfs_mount *mp);
int xfs_inodegc_flush(struct xfs_mount *mp);
void xfs_inodegc_stop(struct xfs_mount *mp);
void xfs_inodegc_start(struct xfs_mount *mp);
-void xfs_inodegc_cpu_dead(struct xfs_mount *mp, unsigned int cpu);
int xfs_inodegc_register_shrinker(struct xfs_mount *mp);
#endif
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 9e62cc500140..4d55f58d99b7 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -843,10 +843,9 @@ xfs_init_new_inode(
ip->i_df.if_nextents = 0;
ASSERT(ip->i_nblocks == 0);
- tv = current_time(inode);
+ tv = inode_set_ctime_current(inode);
inode->i_mtime = tv;
inode->i_atime = tv;
- inode->i_ctime = tv;
ip->i_extsize = 0;
ip->i_diflags = 0;
@@ -1643,8 +1642,11 @@ xfs_inode_needs_inactive(
if (VFS_I(ip)->i_mode == 0)
return false;
- /* If this is a read-only mount, don't do this (would generate I/O) */
- if (xfs_is_readonly(mp))
+ /*
+ * If this is a read-only mount, don't do this (would generate I/O)
+ * unless we're in log recovery and cleaning the iunlinked list.
+ */
+ if (xfs_is_readonly(mp) && !xlog_recovery_needed(mp->m_log))
return false;
/* If the log isn't running, push inodes straight to reclaim. */
@@ -1704,8 +1706,11 @@ xfs_inactive(
mp = ip->i_mount;
ASSERT(!xfs_iflags_test(ip, XFS_IRECOVERY));
- /* If this is a read-only mount, don't do this (would generate I/O) */
- if (xfs_is_readonly(mp))
+ /*
+ * If this is a read-only mount, don't do this (would generate I/O)
+ * unless we're in log recovery and cleaning the iunlinked list.
+ */
+ if (xfs_is_readonly(mp) && !xlog_recovery_needed(mp->m_log))
goto out;
/* Metadata inodes require explicit resource cleanup. */
@@ -1737,9 +1742,21 @@ xfs_inactive(
ip->i_df.if_nextents > 0 || ip->i_delayed_blks > 0))
truncate = 1;
- error = xfs_qm_dqattach(ip);
- if (error)
- goto out;
+ if (xfs_iflags_test(ip, XFS_IQUOTAUNCHECKED)) {
+ /*
+ * If this inode is being inactivated during a quotacheck and
+ * has not yet been scanned by quotacheck, we /must/ remove
+ * the dquots from the inode before inactivation changes the
+ * block and inode counts. Most probably this is a result of
+ * reloading the incore iunlinked list to purge unrecovered
+ * unlinked inodes.
+ */
+ xfs_qm_dqdetach(ip);
+ } else {
+ error = xfs_qm_dqattach(ip);
+ if (error)
+ goto out;
+ }
if (S_ISLNK(VFS_I(ip)->i_mode))
error = xfs_inactive_symlink(ip);
@@ -1823,12 +1840,17 @@ xfs_iunlink_lookup(
rcu_read_lock();
ip = radix_tree_lookup(&pag->pag_ici_root, agino);
+ if (!ip) {
+ /* Caller can handle inode not being in memory. */
+ rcu_read_unlock();
+ return NULL;
+ }
/*
- * Inode not in memory or in RCU freeing limbo should not happen.
- * Warn about this and let the caller handle the failure.
+ * Inode in RCU freeing limbo should not happen. Warn about this and
+ * let the caller handle the failure.
*/
- if (WARN_ON_ONCE(!ip || !ip->i_ino)) {
+ if (WARN_ON_ONCE(!ip->i_ino)) {
rcu_read_unlock();
return NULL;
}
@@ -1837,7 +1859,10 @@ xfs_iunlink_lookup(
return ip;
}
-/* Update the prev pointer of the next agino. */
+/*
+ * Update the prev pointer of the next agino. Returns -ENOLINK if the inode
+ * is not in cache.
+ */
static int
xfs_iunlink_update_backref(
struct xfs_perag *pag,
@@ -1852,7 +1877,8 @@ xfs_iunlink_update_backref(
ip = xfs_iunlink_lookup(pag, next_agino);
if (!ip)
- return -EFSCORRUPTED;
+ return -ENOLINK;
+
ip->i_prev_unlinked = prev_agino;
return 0;
}
@@ -1896,6 +1922,64 @@ xfs_iunlink_update_bucket(
return 0;
}
+/*
+ * Load the inode @next_agino into the cache and set its prev_unlinked pointer
+ * to @prev_agino. Caller must hold the AGI to synchronize with other changes
+ * to the unlinked list.
+ */
+STATIC int
+xfs_iunlink_reload_next(
+ struct xfs_trans *tp,
+ struct xfs_buf *agibp,
+ xfs_agino_t prev_agino,
+ xfs_agino_t next_agino)
+{
+ struct xfs_perag *pag = agibp->b_pag;
+ struct xfs_mount *mp = pag->pag_mount;
+ struct xfs_inode *next_ip = NULL;
+ xfs_ino_t ino;
+ int error;
+
+ ASSERT(next_agino != NULLAGINO);
+
+#ifdef DEBUG
+ rcu_read_lock();
+ next_ip = radix_tree_lookup(&pag->pag_ici_root, next_agino);
+ ASSERT(next_ip == NULL);
+ rcu_read_unlock();
+#endif
+
+ xfs_info_ratelimited(mp,
+ "Found unrecovered unlinked inode 0x%x in AG 0x%x. Initiating recovery.",
+ next_agino, pag->pag_agno);
+
+ /*
+ * Use an untrusted lookup just to be cautious in case the AGI has been
+ * corrupted and now points at a free inode. That shouldn't happen,
+ * but we'd rather shut down now since we're already running in a weird
+ * situation.
+ */
+ ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, next_agino);
+ error = xfs_iget(mp, tp, ino, XFS_IGET_UNTRUSTED, 0, &next_ip);
+ if (error)
+ return error;
+
+ /* If this is not an unlinked inode, something is very wrong. */
+ if (VFS_I(next_ip)->i_nlink != 0) {
+ error = -EFSCORRUPTED;
+ goto rele;
+ }
+
+ next_ip->i_prev_unlinked = prev_agino;
+ trace_xfs_iunlink_reload_next(next_ip);
+rele:
+ ASSERT(!(VFS_I(next_ip)->i_state & I_DONTCACHE));
+ if (xfs_is_quotacheck_running(mp) && next_ip)
+ xfs_iflags_set(next_ip, XFS_IQUOTAUNCHECKED);
+ xfs_irele(next_ip);
+ return error;
+}
+
static int
xfs_iunlink_insert_inode(
struct xfs_trans *tp,
@@ -1927,6 +2011,8 @@ xfs_iunlink_insert_inode(
* inode.
*/
error = xfs_iunlink_update_backref(pag, agino, next_agino);
+ if (error == -ENOLINK)
+ error = xfs_iunlink_reload_next(tp, agibp, agino, next_agino);
if (error)
return error;
@@ -1942,6 +2028,7 @@ xfs_iunlink_insert_inode(
}
/* Point the head of the list to point to this inode. */
+ ip->i_prev_unlinked = NULLAGINO;
return xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index, agino);
}
@@ -2021,6 +2108,9 @@ xfs_iunlink_remove_inode(
*/
error = xfs_iunlink_update_backref(pag, ip->i_prev_unlinked,
ip->i_next_unlinked);
+ if (error == -ENOLINK)
+ error = xfs_iunlink_reload_next(tp, agibp, ip->i_prev_unlinked,
+ ip->i_next_unlinked);
if (error)
return error;
@@ -2041,7 +2131,7 @@ xfs_iunlink_remove_inode(
}
ip->i_next_unlinked = NULLAGINO;
- ip->i_prev_unlinked = NULLAGINO;
+ ip->i_prev_unlinked = 0;
return error;
}
@@ -3530,3 +3620,117 @@ xfs_iunlock2_io_mmap(
if (ip1 != ip2)
inode_unlock(VFS_I(ip1));
}
+
+/*
+ * Reload the incore inode list for this inode. Caller should ensure that
+ * the link count cannot change, either by taking ILOCK_SHARED or otherwise
+ * preventing other threads from executing.
+ */
+int
+xfs_inode_reload_unlinked_bucket(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_buf *agibp;
+ struct xfs_agi *agi;
+ struct xfs_perag *pag;
+ xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
+ xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
+ xfs_agino_t prev_agino, next_agino;
+ unsigned int bucket;
+ bool foundit = false;
+ int error;
+
+ /* Grab the first inode in the list */
+ pag = xfs_perag_get(mp, agno);
+ error = xfs_ialloc_read_agi(pag, tp, &agibp);
+ xfs_perag_put(pag);
+ if (error)
+ return error;
+
+ /*
+ * We've taken ILOCK_SHARED and the AGI buffer lock to stabilize the
+ * incore unlinked list pointers for this inode. Check once more to
+ * see if we raced with anyone else to reload the unlinked list.
+ */
+ if (!xfs_inode_unlinked_incomplete(ip)) {
+ foundit = true;
+ goto out_agibp;
+ }
+
+ bucket = agino % XFS_AGI_UNLINKED_BUCKETS;
+ agi = agibp->b_addr;
+
+ trace_xfs_inode_reload_unlinked_bucket(ip);
+
+ xfs_info_ratelimited(mp,
+ "Found unrecovered unlinked inode 0x%x in AG 0x%x. Initiating list recovery.",
+ agino, agno);
+
+ prev_agino = NULLAGINO;
+ next_agino = be32_to_cpu(agi->agi_unlinked[bucket]);
+ while (next_agino != NULLAGINO) {
+ struct xfs_inode *next_ip = NULL;
+
+ /* Found this caller's inode, set its backlink. */
+ if (next_agino == agino) {
+ next_ip = ip;
+ next_ip->i_prev_unlinked = prev_agino;
+ foundit = true;
+ goto next_inode;
+ }
+
+ /* Try in-memory lookup first. */
+ next_ip = xfs_iunlink_lookup(pag, next_agino);
+ if (next_ip)
+ goto next_inode;
+
+ /* Inode not in memory, try reloading it. */
+ error = xfs_iunlink_reload_next(tp, agibp, prev_agino,
+ next_agino);
+ if (error)
+ break;
+
+ /* Grab the reloaded inode. */
+ next_ip = xfs_iunlink_lookup(pag, next_agino);
+ if (!next_ip) {
+ /* No incore inode at all? We reloaded it... */
+ ASSERT(next_ip != NULL);
+ error = -EFSCORRUPTED;
+ break;
+ }
+
+next_inode:
+ prev_agino = next_agino;
+ next_agino = next_ip->i_next_unlinked;
+ }
+
+out_agibp:
+ xfs_trans_brelse(tp, agibp);
+ /* Should have found this inode somewhere in the iunlinked bucket. */
+ if (!error && !foundit)
+ error = -EFSCORRUPTED;
+ return error;
+}
+
+/* Decide if this inode is missing its unlinked list and reload it. */
+int
+xfs_inode_reload_unlinked(
+ struct xfs_inode *ip)
+{
+ struct xfs_trans *tp;
+ int error;
+
+ error = xfs_trans_alloc_empty(ip->i_mount, &tp);
+ if (error)
+ return error;
+
+ xfs_ilock(ip, XFS_ILOCK_SHARED);
+ if (xfs_inode_unlinked_incomplete(ip))
+ error = xfs_inode_reload_unlinked_bucket(tp, ip);
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ xfs_trans_cancel(tp);
+
+ return error;
+}
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 7547caf2f2ab..0c5bdb91152e 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -68,8 +68,21 @@ typedef struct xfs_inode {
uint64_t i_diflags2; /* XFS_DIFLAG2_... */
struct timespec64 i_crtime; /* time created */
- /* unlinked list pointers */
+ /*
+ * Unlinked list pointers. These point to the next and previous inodes
+ * in the AGI unlinked bucket list, respectively. These fields can
+ * only be updated with the AGI locked.
+ *
+ * i_next_unlinked caches di_next_unlinked.
+ */
xfs_agino_t i_next_unlinked;
+
+ /*
+ * If the inode is not on an unlinked list, this field is zero. If the
+ * inode is the first element in an unlinked list, this field is
+ * NULLAGINO. Otherwise, i_prev_unlinked points to the previous inode
+ * in the unlinked list.
+ */
xfs_agino_t i_prev_unlinked;
/* VFS inode */
@@ -81,6 +94,11 @@ typedef struct xfs_inode {
struct list_head i_ioend_list;
} xfs_inode_t;
+static inline bool xfs_inode_on_unlinked_list(const struct xfs_inode *ip)
+{
+ return ip->i_prev_unlinked != 0;
+}
+
static inline bool xfs_inode_has_attr_fork(struct xfs_inode *ip)
{
return ip->i_forkoff > 0;
@@ -326,6 +344,9 @@ static inline bool xfs_inode_has_large_extent_counts(struct xfs_inode *ip)
*/
#define XFS_INACTIVATING (1 << 13)
+/* Quotacheck is running but inode has not been added to quota counts. */
+#define XFS_IQUOTAUNCHECKED (1 << 14)
+
/* All inode state flags related to inode reclaim. */
#define XFS_ALL_IRECLAIM_FLAGS (XFS_IRECLAIMABLE | \
XFS_IRECLAIM | \
@@ -340,7 +361,7 @@ static inline bool xfs_inode_has_large_extent_counts(struct xfs_inode *ip)
#define XFS_IRECLAIM_RESET_FLAGS \
(XFS_IRECLAIMABLE | XFS_IRECLAIM | \
XFS_IDIRTY_RELEASE | XFS_ITRUNCATED | XFS_NEED_INACTIVE | \
- XFS_INACTIVATING)
+ XFS_INACTIVATING | XFS_IQUOTAUNCHECKED)
/*
* Flags for inode locking.
@@ -575,4 +596,13 @@ void xfs_end_io(struct work_struct *work);
int xfs_ilock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2);
void xfs_iunlock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2);
+static inline bool
+xfs_inode_unlinked_incomplete(
+ struct xfs_inode *ip)
+{
+ return VFS_I(ip)->i_nlink == 0 && !xfs_inode_on_unlinked_list(ip);
+}
+int xfs_inode_reload_unlinked_bucket(struct xfs_trans *tp, struct xfs_inode *ip);
+int xfs_inode_reload_unlinked(struct xfs_inode *ip);
+
#endif /* __XFS_INODE_H__ */
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 91c847a84e10..127b2410eb20 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -528,7 +528,7 @@ xfs_inode_to_log_dinode(
memset(to->di_pad3, 0, sizeof(to->di_pad3));
to->di_atime = xfs_inode_to_log_dinode_ts(ip, inode->i_atime);
to->di_mtime = xfs_inode_to_log_dinode_ts(ip, inode->i_mtime);
- to->di_ctime = xfs_inode_to_log_dinode_ts(ip, inode->i_ctime);
+ to->di_ctime = xfs_inode_to_log_dinode_ts(ip, inode_get_ctime(inode));
to->di_nlink = inode->i_nlink;
to->di_gen = inode->i_generation;
to->di_mode = inode->i_mode;
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 24718adb3c16..2b3b05c28e9e 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -574,7 +574,7 @@ xfs_vn_getattr(
stat->ino = ip->i_ino;
stat->atime = inode->i_atime;
stat->mtime = inode->i_mtime;
- stat->ctime = inode->i_ctime;
+ stat->ctime = inode_get_ctime(inode);
stat->blocks = XFS_FSB_TO_BB(mp, ip->i_nblocks + ip->i_delayed_blks);
if (xfs_has_v3inodes(mp)) {
@@ -584,6 +584,11 @@ xfs_vn_getattr(
}
}
+ if ((request_mask & STATX_CHANGE_COOKIE) && IS_I_VERSION(inode)) {
+ stat->change_cookie = inode_query_iversion(inode);
+ stat->result_mask |= STATX_CHANGE_COOKIE;
+ }
+
/*
* Note: If you add another clause to set an attribute flag, please
* update attributes_mask below.
@@ -1029,7 +1034,6 @@ xfs_vn_setattr(
STATIC int
xfs_vn_update_time(
struct inode *inode,
- struct timespec64 *now,
int flags)
{
struct xfs_inode *ip = XFS_I(inode);
@@ -1037,13 +1041,16 @@ xfs_vn_update_time(
int log_flags = XFS_ILOG_TIMESTAMP;
struct xfs_trans *tp;
int error;
+ struct timespec64 now;
trace_xfs_update_time(ip);
if (inode->i_sb->s_flags & SB_LAZYTIME) {
if (!((flags & S_VERSION) &&
- inode_maybe_inc_iversion(inode, false)))
- return generic_update_time(inode, now, flags);
+ inode_maybe_inc_iversion(inode, false))) {
+ generic_update_time(inode, flags);
+ return 0;
+ }
/* Capture the iversion update that just occurred */
log_flags |= XFS_ILOG_CORE;
@@ -1054,12 +1061,15 @@ xfs_vn_update_time(
return error;
xfs_ilock(ip, XFS_ILOCK_EXCL);
- if (flags & S_CTIME)
- inode->i_ctime = *now;
+ if (flags & (S_CTIME|S_MTIME))
+ now = inode_set_ctime_current(inode);
+ else
+ now = current_time(inode);
+
if (flags & S_MTIME)
- inode->i_mtime = *now;
+ inode->i_mtime = now;
if (flags & S_ATIME)
- inode->i_atime = *now;
+ inode->i_atime = now;
xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
xfs_trans_log_inode(tp, ip, log_flags);
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index f225413a993c..f5377ba5967a 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -80,6 +80,17 @@ xfs_bulkstat_one_int(
if (error)
goto out;
+ /* Reload the incore unlinked list to avoid failure in inodegc. */
+ if (xfs_inode_unlinked_incomplete(ip)) {
+ error = xfs_inode_reload_unlinked_bucket(tp, ip);
+ if (error) {
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+ xfs_irele(ip);
+ return error;
+ }
+ }
+
ASSERT(ip != NULL);
ASSERT(ip->i_imap.im_blkno != 0);
inode = VFS_I(ip);
@@ -100,8 +111,8 @@ xfs_bulkstat_one_int(
buf->bs_atime_nsec = inode->i_atime.tv_nsec;
buf->bs_mtime = inode->i_mtime.tv_sec;
buf->bs_mtime_nsec = inode->i_mtime.tv_nsec;
- buf->bs_ctime = inode->i_ctime.tv_sec;
- buf->bs_ctime_nsec = inode->i_ctime.tv_nsec;
+ buf->bs_ctime = inode_get_ctime(inode).tv_sec;
+ buf->bs_ctime_nsec = inode_get_ctime(inode).tv_nsec;
buf->bs_gen = inode->i_generation;
buf->bs_mode = inode->i_mode;
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index 74dcb05069e8..e9d317a3dafe 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -63,6 +63,7 @@ typedef __u32 xfs_nlink_t;
#include <linux/rhashtable.h>
#include <linux/xattr.h>
#include <linux/mnt_idmapping.h>
+#include <linux/debugfs.h>
#include <asm/page.h>
#include <asm/div64.h>
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 79004d193e54..51c100c86177 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -715,15 +715,7 @@ xfs_log_mount(
* just worked.
*/
if (!xfs_has_norecovery(mp)) {
- /*
- * log recovery ignores readonly state and so we need to clear
- * mount-based read only state so it can write to disk.
- */
- bool readonly = test_and_clear_bit(XFS_OPSTATE_READONLY,
- &mp->m_opstate);
error = xlog_recover(log);
- if (readonly)
- set_bit(XFS_OPSTATE_READONLY, &mp->m_opstate);
if (error) {
xfs_warn(mp, "log mount/recovery failed: error %d",
error);
@@ -772,7 +764,6 @@ xfs_log_mount_finish(
struct xfs_mount *mp)
{
struct xlog *log = mp->m_log;
- bool readonly;
int error = 0;
if (xfs_has_norecovery(mp)) {
@@ -781,12 +772,6 @@ xfs_log_mount_finish(
}
/*
- * log recovery ignores readonly state and so we need to clear
- * mount-based read only state so it can write to disk.
- */
- readonly = test_and_clear_bit(XFS_OPSTATE_READONLY, &mp->m_opstate);
-
- /*
* During the second phase of log recovery, we need iget and
* iput to behave like they do for an active filesystem.
* xfs_fs_drop_inode needs to be able to prevent the deletion
@@ -835,8 +820,6 @@ xfs_log_mount_finish(
xfs_buftarg_drain(mp->m_ddev_targp);
clear_bit(XLOG_RECOVERY_NEEDED, &log->l_opstate);
- if (readonly)
- set_bit(XFS_OPSTATE_READONLY, &mp->m_opstate);
/* Make sure the log is dead if we're returning failure. */
ASSERT(!error || xlog_is_shutdown(log));
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index eccbfb99e894..67a99d94701e 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -16,8 +16,7 @@
#include "xfs_log.h"
#include "xfs_log_priv.h"
#include "xfs_trace.h"
-
-struct workqueue_struct *xfs_discard_wq;
+#include "xfs_discard.h"
/*
* Allocate a new ticket. Failing to get a new ticket makes it really hard to
@@ -103,7 +102,7 @@ xlog_cil_ctx_alloc(void)
ctx = kmem_zalloc(sizeof(*ctx), KM_NOFS);
INIT_LIST_HEAD(&ctx->committing);
- INIT_LIST_HEAD(&ctx->busy_extents);
+ INIT_LIST_HEAD(&ctx->busy_extents.extent_list);
INIT_LIST_HEAD(&ctx->log_items);
INIT_LIST_HEAD(&ctx->lv_chain);
INIT_WORK(&ctx->push_work, xlog_cil_push_work);
@@ -124,7 +123,7 @@ xlog_cil_push_pcp_aggregate(
struct xlog_cil_pcp *cilpcp;
int cpu;
- for_each_online_cpu(cpu) {
+ for_each_cpu(cpu, &ctx->cil_pcpmask) {
cilpcp = per_cpu_ptr(cil->xc_pcp, cpu);
ctx->ticket->t_curr_res += cilpcp->space_reserved;
@@ -132,7 +131,7 @@ xlog_cil_push_pcp_aggregate(
if (!list_empty(&cilpcp->busy_extents)) {
list_splice_init(&cilpcp->busy_extents,
- &ctx->busy_extents);
+ &ctx->busy_extents.extent_list);
}
if (!list_empty(&cilpcp->log_items))
list_splice_init(&cilpcp->log_items, &ctx->log_items);
@@ -165,7 +164,13 @@ xlog_cil_insert_pcp_aggregate(
if (!test_and_clear_bit(XLOG_CIL_PCP_SPACE, &cil->xc_flags))
return;
- for_each_online_cpu(cpu) {
+ /*
+ * We can race with other cpus setting cil_pcpmask. However, we've
+ * atomically cleared PCP_SPACE which forces other threads to add to
+ * the global space used count. cil_pcpmask is a superset of cilpcp
+ * structures that could have a nonzero space_used.
+ */
+ for_each_cpu(cpu, &ctx->cil_pcpmask) {
int old, prev;
cilpcp = per_cpu_ptr(cil->xc_pcp, cpu);
@@ -554,6 +559,7 @@ xlog_cil_insert_items(
int iovhdr_res = 0, split_res = 0, ctx_res = 0;
int space_used;
int order;
+ unsigned int cpu_nr;
struct xlog_cil_pcp *cilpcp;
ASSERT(tp);
@@ -577,7 +583,12 @@ xlog_cil_insert_items(
* can't be scheduled away between split sample/update operations that
* are done without outside locking to serialise them.
*/
- cilpcp = get_cpu_ptr(cil->xc_pcp);
+ cpu_nr = get_cpu();
+ cilpcp = this_cpu_ptr(cil->xc_pcp);
+
+ /* Tell the future push that there was work added by this CPU. */
+ if (!cpumask_test_cpu(cpu_nr, &ctx->cil_pcpmask))
+ cpumask_test_and_set_cpu(cpu_nr, &ctx->cil_pcpmask);
/*
* We need to take the CIL checkpoint unit reservation on the first
@@ -663,7 +674,7 @@ xlog_cil_insert_items(
continue;
list_add_tail(&lip->li_cil, &cilpcp->log_items);
}
- put_cpu_ptr(cilpcp);
+ put_cpu();
/*
* If we've overrun the reservation, dump the tx details before we move
@@ -696,76 +707,6 @@ xlog_cil_free_logvec(
}
}
-static void
-xlog_discard_endio_work(
- struct work_struct *work)
-{
- struct xfs_cil_ctx *ctx =
- container_of(work, struct xfs_cil_ctx, discard_endio_work);
- struct xfs_mount *mp = ctx->cil->xc_log->l_mp;
-
- xfs_extent_busy_clear(mp, &ctx->busy_extents, false);
- kmem_free(ctx);
-}
-
-/*
- * Queue up the actual completion to a thread to avoid IRQ-safe locking for
- * pagb_lock. Note that we need a unbounded workqueue, otherwise we might
- * get the execution delayed up to 30 seconds for weird reasons.
- */
-static void
-xlog_discard_endio(
- struct bio *bio)
-{
- struct xfs_cil_ctx *ctx = bio->bi_private;
-
- INIT_WORK(&ctx->discard_endio_work, xlog_discard_endio_work);
- queue_work(xfs_discard_wq, &ctx->discard_endio_work);
- bio_put(bio);
-}
-
-static void
-xlog_discard_busy_extents(
- struct xfs_mount *mp,
- struct xfs_cil_ctx *ctx)
-{
- struct list_head *list = &ctx->busy_extents;
- struct xfs_extent_busy *busyp;
- struct bio *bio = NULL;
- struct blk_plug plug;
- int error = 0;
-
- ASSERT(xfs_has_discard(mp));
-
- blk_start_plug(&plug);
- list_for_each_entry(busyp, list, list) {
- trace_xfs_discard_extent(mp, busyp->agno, busyp->bno,
- busyp->length);
-
- error = __blkdev_issue_discard(mp->m_ddev_targp->bt_bdev,
- XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno),
- XFS_FSB_TO_BB(mp, busyp->length),
- GFP_NOFS, &bio);
- if (error && error != -EOPNOTSUPP) {
- xfs_info(mp,
- "discard failed for extent [0x%llx,%u], error %d",
- (unsigned long long)busyp->bno,
- busyp->length,
- error);
- break;
- }
- }
-
- if (bio) {
- bio->bi_private = ctx;
- bio->bi_end_io = xlog_discard_endio;
- submit_bio(bio);
- } else {
- xlog_discard_endio_work(&ctx->discard_endio_work);
- }
- blk_finish_plug(&plug);
-}
-
/*
* Mark all items committed and clear busy extents. We free the log vector
* chains in a separate pass so that we unpin the log items as quickly as
@@ -795,8 +736,8 @@ xlog_cil_committed(
xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, &ctx->lv_chain,
ctx->start_lsn, abort);
- xfs_extent_busy_sort(&ctx->busy_extents);
- xfs_extent_busy_clear(mp, &ctx->busy_extents,
+ xfs_extent_busy_sort(&ctx->busy_extents.extent_list);
+ xfs_extent_busy_clear(mp, &ctx->busy_extents.extent_list,
xfs_has_discard(mp) && !abort);
spin_lock(&ctx->cil->xc_push_lock);
@@ -805,10 +746,14 @@ xlog_cil_committed(
xlog_cil_free_logvec(&ctx->lv_chain);
- if (!list_empty(&ctx->busy_extents))
- xlog_discard_busy_extents(mp, ctx);
- else
- kmem_free(ctx);
+ if (!list_empty(&ctx->busy_extents.extent_list)) {
+ ctx->busy_extents.mount = mp;
+ ctx->busy_extents.owner = ctx;
+ xfs_discard_extents(mp, &ctx->busy_extents);
+ return;
+ }
+
+ kmem_free(ctx);
}
void
@@ -1791,38 +1736,6 @@ out_shutdown:
}
/*
- * Move dead percpu state to the relevant CIL context structures.
- *
- * We have to lock the CIL context here to ensure that nothing is modifying
- * the percpu state, either addition or removal. Both of these are done under
- * the CIL context lock, so grabbing that exclusively here will ensure we can
- * safely drain the cilpcp for the CPU that is dying.
- */
-void
-xlog_cil_pcp_dead(
- struct xlog *log,
- unsigned int cpu)
-{
- struct xfs_cil *cil = log->l_cilp;
- struct xlog_cil_pcp *cilpcp = per_cpu_ptr(cil->xc_pcp, cpu);
- struct xfs_cil_ctx *ctx;
-
- down_write(&cil->xc_ctx_lock);
- ctx = cil->xc_ctx;
- if (ctx->ticket)
- ctx->ticket->t_curr_res += cilpcp->space_reserved;
- cilpcp->space_reserved = 0;
-
- if (!list_empty(&cilpcp->log_items))
- list_splice_init(&cilpcp->log_items, &ctx->log_items);
- if (!list_empty(&cilpcp->busy_extents))
- list_splice_init(&cilpcp->busy_extents, &ctx->busy_extents);
- atomic_add(cilpcp->space_used, &ctx->space_used);
- cilpcp->space_used = 0;
- up_write(&cil->xc_ctx_lock);
-}
-
-/*
* Perform initial CIL structure initialisation.
*/
int
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 1bd2963e8fbd..fa3ad1d7b31c 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -6,6 +6,8 @@
#ifndef __XFS_LOG_PRIV_H__
#define __XFS_LOG_PRIV_H__
+#include "xfs_extent_busy.h" /* for struct xfs_busy_extents */
+
struct xfs_buf;
struct xlog;
struct xlog_ticket;
@@ -223,14 +225,19 @@ struct xfs_cil_ctx {
struct xlog_in_core *commit_iclog;
struct xlog_ticket *ticket; /* chkpt ticket */
atomic_t space_used; /* aggregate size of regions */
- struct list_head busy_extents; /* busy extents in chkpt */
+ struct xfs_busy_extents busy_extents;
struct list_head log_items; /* log items in chkpt */
struct list_head lv_chain; /* logvecs being pushed */
struct list_head iclog_entry;
struct list_head committing; /* ctx committing list */
- struct work_struct discard_endio_work;
struct work_struct push_work;
atomic_t order_id;
+
+ /*
+ * CPUs that could have added items to the percpu CIL data. Access is
+ * coordinated with xc_ctx_lock.
+ */
+ struct cpumask cil_pcpmask;
};
/*
@@ -278,9 +285,6 @@ struct xfs_cil {
wait_queue_head_t xc_push_wait; /* background push throttle */
void __percpu *xc_pcp; /* percpu CIL structures */
-#ifdef CONFIG_HOTPLUG_CPU
- struct list_head xc_pcp_list;
-#endif
} ____cacheline_aligned_in_smp;
/* xc_flags bit values */
@@ -705,9 +709,4 @@ xlog_kvmalloc(
return p;
}
-/*
- * CIL CPU dead notifier
- */
-void xlog_cil_pcp_dead(struct xlog *log, unsigned int cpu);
-
#endif /* __XFS_LOG_PRIV_H__ */
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 82c81d20459d..13b94d2e605b 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -329,7 +329,7 @@ xlog_find_verify_cycle(
* try a smaller size. We need to be able to read at least
* a log sector, or we're out of luck.
*/
- bufblks = 1 << ffs(nbblks);
+ bufblks = roundup_pow_of_two(nbblks);
while (bufblks > log->l_logBBsize)
bufblks >>= 1;
while (!(buffer = xlog_alloc_buffer(log, bufblks))) {
@@ -1528,7 +1528,7 @@ xlog_write_log_records(
* a smaller size. We need to be able to write at least a
* log sector, or we're out of luck.
*/
- bufblks = 1 << ffs(blocks);
+ bufblks = roundup_pow_of_two(blocks);
while (bufblks > log->l_logBBsize)
bufblks >>= 1;
while (!(buffer = xlog_alloc_buffer(log, bufblks))) {
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index fb87ffb48f7f..0a0fd19573d8 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -34,6 +34,7 @@
#include "xfs_health.h"
#include "xfs_trace.h"
#include "xfs_ag.h"
+#include "scrub/stats.h"
static DEFINE_MUTEX(xfs_uuid_table_mutex);
static int xfs_uuid_table_size;
@@ -716,9 +717,11 @@ xfs_mountfs(
if (error)
goto out_remove_sysfs;
+ xchk_stats_register(mp->m_scrub_stats, mp->m_debugfs);
+
error = xfs_error_sysfs_init(mp);
if (error)
- goto out_del_stats;
+ goto out_remove_scrub_stats;
error = xfs_errortag_init(mp);
if (error)
@@ -1033,7 +1036,8 @@ xfs_mountfs(
xfs_errortag_del(mp);
out_remove_error_sysfs:
xfs_error_sysfs_del(mp);
- out_del_stats:
+ out_remove_scrub_stats:
+ xchk_stats_unregister(mp->m_scrub_stats);
xfs_sysfs_del(&mp->m_stats.xs_kobj);
out_remove_sysfs:
xfs_sysfs_del(&mp->m_kobj);
@@ -1105,6 +1109,7 @@ xfs_unmountfs(
xfs_errortag_del(mp);
xfs_error_sysfs_del(mp);
+ xchk_stats_unregister(mp->m_scrub_stats);
xfs_sysfs_del(&mp->m_stats.xs_kobj);
xfs_sysfs_del(&mp->m_kobj);
}
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index e2866e7fa60c..d19cca099bc3 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -60,6 +60,7 @@ struct xfs_error_cfg {
* Per-cpu deferred inode inactivation GC lists.
*/
struct xfs_inodegc {
+ struct xfs_mount *mp;
struct llist_head list;
struct delayed_work work;
int error;
@@ -67,9 +68,7 @@ struct xfs_inodegc {
/* approximate count of inodes in the list */
unsigned int items;
unsigned int shrinker_hits;
-#if defined(DEBUG) || defined(XFS_WARN)
unsigned int cpu;
-#endif
};
/*
@@ -98,7 +97,6 @@ typedef struct xfs_mount {
xfs_buftarg_t *m_ddev_targp; /* saves taking the address */
xfs_buftarg_t *m_logdev_targp;/* ptr to log device */
xfs_buftarg_t *m_rtdev_targp; /* ptr to rt device */
- struct list_head m_mount_list; /* global mount list */
void __percpu *m_inodegc; /* percpu inodegc structures */
/*
@@ -208,11 +206,15 @@ typedef struct xfs_mount {
uint64_t m_resblks_avail;/* available reserved blocks */
uint64_t m_resblks_save; /* reserved blks @ remount,ro */
struct delayed_work m_reclaim_work; /* background inode reclaim */
+ struct dentry *m_debugfs; /* debugfs parent */
struct xfs_kobj m_kobj;
struct xfs_kobj m_error_kobj;
struct xfs_kobj m_error_meta_kobj;
struct xfs_error_cfg m_error_cfg[XFS_ERR_CLASS_MAX][XFS_ERR_ERRNO_MAX];
struct xstats m_stats; /* per-fs stats */
+#ifdef CONFIG_XFS_ONLINE_SCRUB_STATS
+ struct xchk_stats *m_scrub_stats;
+#endif
xfs_agnumber_t m_agfrotor; /* last ag where space found */
atomic_t m_agirotor; /* last ag dir inode alloced */
@@ -245,6 +247,9 @@ typedef struct xfs_mount {
unsigned int *m_errortag;
struct xfs_kobj m_errortag_kobj;
#endif
+
+ /* cpus that have inodes queued for inactivation */
+ struct cpumask m_inodegc_cpumask;
} xfs_mount_t;
#define M_IGEO(mp) (&(mp)->m_ino_geo)
@@ -400,6 +405,8 @@ __XFS_HAS_FEAT(nouuid, NOUUID)
#define XFS_OPSTATE_WARNED_SHRINK 8
/* Kernel has logged a warning about logged xattr updates being used. */
#define XFS_OPSTATE_WARNED_LARP 9
+/* Mount time quotacheck is running */
+#define XFS_OPSTATE_QUOTACHECK_RUNNING 10
#define __XFS_IS_OPSTATE(name, NAME) \
static inline bool xfs_is_ ## name (struct xfs_mount *mp) \
@@ -422,6 +429,11 @@ __XFS_IS_OPSTATE(inode32, INODE32)
__XFS_IS_OPSTATE(readonly, READONLY)
__XFS_IS_OPSTATE(inodegc_enabled, INODEGC_ENABLED)
__XFS_IS_OPSTATE(blockgc_enabled, BLOCKGC_ENABLED)
+#ifdef CONFIG_XFS_QUOTA
+__XFS_IS_OPSTATE(quotacheck_running, QUOTACHECK_RUNNING)
+#else
+# define xfs_is_quotacheck_running(mp) (false)
+#endif
static inline bool
xfs_should_warn(struct xfs_mount *mp, long nr)
@@ -439,7 +451,8 @@ xfs_should_warn(struct xfs_mount *mp, long nr)
{ (1UL << XFS_OPSTATE_BLOCKGC_ENABLED), "blockgc" }, \
{ (1UL << XFS_OPSTATE_WARNED_SCRUB), "wscrub" }, \
{ (1UL << XFS_OPSTATE_WARNED_SHRINK), "wshrink" }, \
- { (1UL << XFS_OPSTATE_WARNED_LARP), "wlarp" }
+ { (1UL << XFS_OPSTATE_WARNED_LARP), "wlarp" }, \
+ { (1UL << XFS_OPSTATE_QUOTACHECK_RUNNING), "quotacheck" }
/*
* Max and min values for mount-option defined I/O
diff --git a/fs/xfs/xfs_notify_failure.c b/fs/xfs/xfs_notify_failure.c
index 4a9bbd3fe120..a7daa522e00f 100644
--- a/fs/xfs/xfs_notify_failure.c
+++ b/fs/xfs/xfs_notify_failure.c
@@ -126,8 +126,8 @@ xfs_dax_notify_ddev_failure(
struct xfs_rmap_irec ri_low = { };
struct xfs_rmap_irec ri_high;
struct xfs_agf *agf;
- xfs_agblock_t agend;
struct xfs_perag *pag;
+ xfs_agblock_t range_agend;
pag = xfs_perag_get(mp, agno);
error = xfs_alloc_read_agf(pag, tp, 0, &agf_bp);
@@ -148,10 +148,10 @@ xfs_dax_notify_ddev_failure(
ri_high.rm_startblock = XFS_FSB_TO_AGBNO(mp, end_fsbno);
agf = agf_bp->b_addr;
- agend = min(be32_to_cpu(agf->agf_length),
+ range_agend = min(be32_to_cpu(agf->agf_length) - 1,
ri_high.rm_startblock);
notify.startblock = ri_low.rm_startblock;
- notify.blockcount = agend - ri_low.rm_startblock;
+ notify.blockcount = range_agend + 1 - ri_low.rm_startblock;
error = xfs_rmap_query_range(cur, &ri_low, &ri_high,
xfs_dax_failure_fn, &notify);
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 6abcc34fafd8..086e78a6143a 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -1160,6 +1160,19 @@ xfs_qm_dqusage_adjust(
if (error)
return error;
+ /*
+ * Reload the incore unlinked list to avoid failure in inodegc.
+ * Use an unlocked check here because unrecovered unlinked inodes
+ * should be somewhat rare.
+ */
+ if (xfs_inode_unlinked_incomplete(ip)) {
+ error = xfs_inode_reload_unlinked(ip);
+ if (error) {
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+ goto error0;
+ }
+ }
+
ASSERT(ip->i_delayed_blks == 0);
if (XFS_IS_REALTIME_INODE(ip)) {
@@ -1173,6 +1186,7 @@ xfs_qm_dqusage_adjust(
}
nblks = (xfs_qcnt_t)ip->i_nblocks - rtblks;
+ xfs_iflags_clear(ip, XFS_IQUOTAUNCHECKED);
/*
* Add the (disk blocks and inode) resources occupied by this
@@ -1319,8 +1333,10 @@ xfs_qm_quotacheck(
flags |= XFS_PQUOTA_CHKD;
}
+ xfs_set_quotacheck_running(mp);
error = xfs_iwalk_threaded(mp, 0, 0, xfs_qm_dqusage_adjust, 0, true,
NULL);
+ xfs_clear_quotacheck_running(mp);
/*
* On error, the inode walk may have partially populated the dquot
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index edd8587658d5..2d4444d61e98 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -477,6 +477,7 @@ xfs_cui_item_recover(
struct xfs_log_item *lip,
struct list_head *capture_list)
{
+ struct xfs_trans_res resv;
struct xfs_cui_log_item *cuip = CUI_ITEM(lip);
struct xfs_cud_log_item *cudp;
struct xfs_trans *tp;
@@ -514,8 +515,9 @@ xfs_cui_item_recover(
* doesn't fit. We need to reserve enough blocks to handle a
* full btree split on either end of the refcount range.
*/
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate,
- mp->m_refc_maxlevels * 2, 0, XFS_TRANS_RESERVE, &tp);
+ resv = xlog_recover_resv(&M_RES(mp)->tr_itruncate);
+ error = xfs_trans_alloc(mp, &resv, mp->m_refc_maxlevels * 2, 0,
+ XFS_TRANS_RESERVE, &tp);
if (error)
return error;
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index 520c7ebdfed8..0e0e747028da 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -507,6 +507,7 @@ xfs_rui_item_recover(
struct xfs_log_item *lip,
struct list_head *capture_list)
{
+ struct xfs_trans_res resv;
struct xfs_rui_log_item *ruip = RUI_ITEM(lip);
struct xfs_rud_log_item *rudp;
struct xfs_trans *tp;
@@ -530,8 +531,9 @@ xfs_rui_item_recover(
}
}
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate,
- mp->m_rmap_maxlevels, 0, XFS_TRANS_RESERVE, &tp);
+ resv = xlog_recover_resv(&M_RES(mp)->tr_itruncate);
+ error = xfs_trans_alloc(mp, &resv, mp->m_rmap_maxlevels, 0,
+ XFS_TRANS_RESERVE, &tp);
if (error)
return error;
rudp = xfs_trans_get_rud(tp, ruip);
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 818510243130..819a3568b28f 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -42,6 +42,7 @@
#include "xfs_xattr.h"
#include "xfs_iunlink_item.h"
#include "xfs_dahash_test.h"
+#include "scrub/stats.h"
#include <linux/magic.h>
#include <linux/fs_context.h>
@@ -49,33 +50,12 @@
static const struct super_operations xfs_super_operations;
+static struct dentry *xfs_debugfs; /* top-level xfs debugfs dir */
static struct kset *xfs_kset; /* top-level xfs sysfs dir */
#ifdef DEBUG
static struct xfs_kobj xfs_dbg_kobj; /* global debug sysfs attrs */
#endif
-#ifdef CONFIG_HOTPLUG_CPU
-static LIST_HEAD(xfs_mount_list);
-static DEFINE_SPINLOCK(xfs_mount_list_lock);
-
-static inline void xfs_mount_list_add(struct xfs_mount *mp)
-{
- spin_lock(&xfs_mount_list_lock);
- list_add(&mp->m_mount_list, &xfs_mount_list);
- spin_unlock(&xfs_mount_list_lock);
-}
-
-static inline void xfs_mount_list_del(struct xfs_mount *mp)
-{
- spin_lock(&xfs_mount_list_lock);
- list_del(&mp->m_mount_list);
- spin_unlock(&xfs_mount_list_lock);
-}
-#else /* !CONFIG_HOTPLUG_CPU */
-static inline void xfs_mount_list_add(struct xfs_mount *mp) {}
-static inline void xfs_mount_list_del(struct xfs_mount *mp) {}
-#endif
-
enum xfs_dax_mode {
XFS_DAX_INODE = 0,
XFS_DAX_ALWAYS = 1,
@@ -377,17 +357,6 @@ disable_dax:
return 0;
}
-static void
-xfs_bdev_mark_dead(
- struct block_device *bdev)
-{
- xfs_force_shutdown(bdev->bd_holder, SHUTDOWN_DEVICE_REMOVED);
-}
-
-static const struct blk_holder_ops xfs_holder_ops = {
- .mark_dead = xfs_bdev_mark_dead,
-};
-
STATIC int
xfs_blkdev_get(
xfs_mount_t *mp,
@@ -396,8 +365,8 @@ xfs_blkdev_get(
{
int error = 0;
- *bdevp = blkdev_get_by_path(name, BLK_OPEN_READ | BLK_OPEN_WRITE, mp,
- &xfs_holder_ops);
+ *bdevp = blkdev_get_by_path(name, BLK_OPEN_READ | BLK_OPEN_WRITE,
+ mp->m_super, &fs_holder_ops);
if (IS_ERR(*bdevp)) {
error = PTR_ERR(*bdevp);
xfs_warn(mp, "Invalid device [%s], error=%d", name, error);
@@ -407,31 +376,45 @@ xfs_blkdev_get(
}
STATIC void
-xfs_blkdev_put(
- struct xfs_mount *mp,
- struct block_device *bdev)
-{
- if (bdev)
- blkdev_put(bdev, mp);
-}
-
-STATIC void
-xfs_close_devices(
+xfs_shutdown_devices(
struct xfs_mount *mp)
{
+ /*
+ * Udev is triggered whenever anyone closes a block device or unmounts
+ * a file systemm on a block device.
+ * The default udev rules invoke blkid to read the fs super and create
+ * symlinks to the bdev under /dev/disk. For this, it uses buffered
+ * reads through the page cache.
+ *
+ * xfs_db also uses buffered reads to examine metadata. There is no
+ * coordination between xfs_db and udev, which means that they can run
+ * concurrently. Note there is no coordination between the kernel and
+ * blkid either.
+ *
+ * On a system with 64k pages, the page cache can cache the superblock
+ * and the root inode (and hence the root directory) with the same 64k
+ * page. If udev spawns blkid after the mkfs and the system is busy
+ * enough that it is still running when xfs_db starts up, they'll both
+ * read from the same page in the pagecache.
+ *
+ * The unmount writes updated inode metadata to disk directly. The XFS
+ * buffer cache does not use the bdev pagecache, so it needs to
+ * invalidate that pagecache on unmount. If the above scenario occurs,
+ * the pagecache no longer reflects what's on disk, xfs_db reads the
+ * stale metadata, and fails to find /a. Most of the time this succeeds
+ * because closing a bdev invalidates the page cache, but when processes
+ * race, everyone loses.
+ */
if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) {
- struct block_device *logdev = mp->m_logdev_targp->bt_bdev;
-
- xfs_free_buftarg(mp->m_logdev_targp);
- xfs_blkdev_put(mp, logdev);
+ blkdev_issue_flush(mp->m_logdev_targp->bt_bdev);
+ invalidate_bdev(mp->m_logdev_targp->bt_bdev);
}
if (mp->m_rtdev_targp) {
- struct block_device *rtdev = mp->m_rtdev_targp->bt_bdev;
-
- xfs_free_buftarg(mp->m_rtdev_targp);
- xfs_blkdev_put(mp, rtdev);
+ blkdev_issue_flush(mp->m_rtdev_targp->bt_bdev);
+ invalidate_bdev(mp->m_rtdev_targp->bt_bdev);
}
- xfs_free_buftarg(mp->m_ddev_targp);
+ blkdev_issue_flush(mp->m_ddev_targp->bt_bdev);
+ invalidate_bdev(mp->m_ddev_targp->bt_bdev);
}
/*
@@ -448,17 +431,24 @@ STATIC int
xfs_open_devices(
struct xfs_mount *mp)
{
- struct block_device *ddev = mp->m_super->s_bdev;
+ struct super_block *sb = mp->m_super;
+ struct block_device *ddev = sb->s_bdev;
struct block_device *logdev = NULL, *rtdev = NULL;
int error;
/*
+ * blkdev_put() can't be called under s_umount, see the comment
+ * in get_tree_bdev() for more details
+ */
+ up_write(&sb->s_umount);
+
+ /*
* Open real time and log devices - order is important.
*/
if (mp->m_logname) {
error = xfs_blkdev_get(mp, mp->m_logname, &logdev);
if (error)
- return error;
+ goto out_relock;
}
if (mp->m_rtname) {
@@ -496,7 +486,10 @@ xfs_open_devices(
mp->m_logdev_targp = mp->m_ddev_targp;
}
- return 0;
+ error = 0;
+out_relock:
+ down_write(&sb->s_umount);
+ return error;
out_free_rtdev_targ:
if (mp->m_rtdev_targp)
@@ -504,11 +497,12 @@ xfs_open_devices(
out_free_ddev_targ:
xfs_free_buftarg(mp->m_ddev_targp);
out_close_rtdev:
- xfs_blkdev_put(mp, rtdev);
+ if (rtdev)
+ blkdev_put(rtdev, sb);
out_close_logdev:
if (logdev && logdev != ddev)
- xfs_blkdev_put(mp, logdev);
- return error;
+ blkdev_put(logdev, sb);
+ goto out_relock;
}
/*
@@ -758,6 +752,18 @@ static void
xfs_mount_free(
struct xfs_mount *mp)
{
+ /*
+ * Free the buftargs here because blkdev_put needs to be called outside
+ * of sb->s_umount, which is held around the call to ->put_super.
+ */
+ if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp)
+ xfs_free_buftarg(mp->m_logdev_targp);
+ if (mp->m_rtdev_targp)
+ xfs_free_buftarg(mp->m_rtdev_targp);
+ if (mp->m_ddev_targp)
+ xfs_free_buftarg(mp->m_ddev_targp);
+
+ debugfs_remove(mp->m_debugfs);
kfree(mp->m_rtname);
kfree(mp->m_logname);
kmem_free(mp);
@@ -1107,9 +1113,8 @@ xfs_inodegc_init_percpu(
for_each_possible_cpu(cpu) {
gc = per_cpu_ptr(mp->m_inodegc, cpu);
-#if defined(DEBUG) || defined(XFS_WARN)
gc->cpu = cpu;
-#endif
+ gc->mp = mp;
init_llist_head(&gc->list);
gc->items = 0;
gc->error = 0;
@@ -1133,24 +1138,17 @@ xfs_fs_put_super(
{
struct xfs_mount *mp = XFS_M(sb);
- /* if ->fill_super failed, we have no mount to tear down */
- if (!sb->s_fs_info)
- return;
-
xfs_notice(mp, "Unmounting Filesystem %pU", &mp->m_sb.sb_uuid);
xfs_filestream_unmount(mp);
xfs_unmountfs(mp);
xfs_freesb(mp);
+ xchk_mount_stats_free(mp);
free_percpu(mp->m_stats.xs_stats);
- xfs_mount_list_del(mp);
xfs_inodegc_free_percpu(mp);
xfs_destroy_percpu_counters(mp);
xfs_destroy_mount_workqueues(mp);
- xfs_close_devices(mp);
-
- sb->s_fs_info = NULL;
- xfs_mount_free(mp);
+ xfs_shutdown_devices(mp);
}
static long
@@ -1479,6 +1477,21 @@ xfs_fs_validate_params(
return 0;
}
+struct dentry *
+xfs_debugfs_mkdir(
+ const char *name,
+ struct dentry *parent)
+{
+ struct dentry *child;
+
+ /* Apparently we're expected to ignore error returns?? */
+ child = debugfs_create_dir(name, parent);
+ if (IS_ERR(child))
+ return NULL;
+
+ return child;
+}
+
static int
xfs_fs_fill_super(
struct super_block *sb,
@@ -1492,7 +1505,7 @@ xfs_fs_fill_super(
error = xfs_fs_validate_params(mp);
if (error)
- goto out_free_names;
+ return error;
sb_min_blocksize(sb, BBSIZE);
sb->s_xattr = xfs_xattr_handlers;
@@ -1519,11 +1532,18 @@ xfs_fs_fill_super(
error = xfs_open_devices(mp);
if (error)
- goto out_free_names;
+ return error;
+
+ if (xfs_debugfs) {
+ mp->m_debugfs = xfs_debugfs_mkdir(mp->m_super->s_id,
+ xfs_debugfs);
+ } else {
+ mp->m_debugfs = NULL;
+ }
error = xfs_init_mount_workqueues(mp);
if (error)
- goto out_close_devices;
+ goto out_shutdown_devices;
error = xfs_init_percpu_counters(mp);
if (error)
@@ -1533,13 +1553,6 @@ xfs_fs_fill_super(
if (error)
goto out_destroy_counters;
- /*
- * All percpu data structures requiring cleanup when a cpu goes offline
- * must be allocated before adding this @mp to the cpu-dead handler's
- * mount list.
- */
- xfs_mount_list_add(mp);
-
/* Allocate stats memory before we do operations that might use it */
mp->m_stats.xs_stats = alloc_percpu(struct xfsstats);
if (!mp->m_stats.xs_stats) {
@@ -1547,10 +1560,14 @@ xfs_fs_fill_super(
goto out_destroy_inodegc;
}
- error = xfs_readsb(mp, flags);
+ error = xchk_mount_stats_alloc(mp);
if (error)
goto out_free_stats;
+ error = xfs_readsb(mp, flags);
+ if (error)
+ goto out_free_scrub_stats;
+
error = xfs_finish_flags(mp);
if (error)
goto out_free_sb;
@@ -1728,20 +1745,18 @@ xfs_fs_fill_super(
xfs_filestream_unmount(mp);
out_free_sb:
xfs_freesb(mp);
+ out_free_scrub_stats:
+ xchk_mount_stats_free(mp);
out_free_stats:
free_percpu(mp->m_stats.xs_stats);
out_destroy_inodegc:
- xfs_mount_list_del(mp);
xfs_inodegc_free_percpu(mp);
out_destroy_counters:
xfs_destroy_percpu_counters(mp);
out_destroy_workqueues:
xfs_destroy_mount_workqueues(mp);
- out_close_devices:
- xfs_close_devices(mp);
- out_free_names:
- sb->s_fs_info = NULL;
- xfs_mount_free(mp);
+ out_shutdown_devices:
+ xfs_shutdown_devices(mp);
return error;
out_unmount:
@@ -1934,7 +1949,8 @@ xfs_fs_reconfigure(
return 0;
}
-static void xfs_fs_free(
+static void
+xfs_fs_free(
struct fs_context *fc)
{
struct xfs_mount *mp = fc->s_fs_info;
@@ -2003,12 +2019,20 @@ static int xfs_init_fs_context(
return 0;
}
+static void
+xfs_kill_sb(
+ struct super_block *sb)
+{
+ kill_block_super(sb);
+ xfs_mount_free(XFS_M(sb));
+}
+
static struct file_system_type xfs_fs_type = {
.owner = THIS_MODULE,
.name = "xfs",
.init_fs_context = xfs_init_fs_context,
.parameters = xfs_fs_parameters,
- .kill_sb = kill_block_super,
+ .kill_sb = xfs_kill_sb,
.fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
};
MODULE_ALIAS_FS("xfs");
@@ -2270,49 +2294,6 @@ xfs_destroy_workqueues(void)
destroy_workqueue(xfs_alloc_wq);
}
-#ifdef CONFIG_HOTPLUG_CPU
-static int
-xfs_cpu_dead(
- unsigned int cpu)
-{
- struct xfs_mount *mp, *n;
-
- spin_lock(&xfs_mount_list_lock);
- list_for_each_entry_safe(mp, n, &xfs_mount_list, m_mount_list) {
- spin_unlock(&xfs_mount_list_lock);
- xfs_inodegc_cpu_dead(mp, cpu);
- xlog_cil_pcp_dead(mp->m_log, cpu);
- spin_lock(&xfs_mount_list_lock);
- }
- spin_unlock(&xfs_mount_list_lock);
- return 0;
-}
-
-static int __init
-xfs_cpu_hotplug_init(void)
-{
- int error;
-
- error = cpuhp_setup_state_nocalls(CPUHP_XFS_DEAD, "xfs:dead", NULL,
- xfs_cpu_dead);
- if (error < 0)
- xfs_alert(NULL,
-"Failed to initialise CPU hotplug, error %d. XFS is non-functional.",
- error);
- return error;
-}
-
-static void
-xfs_cpu_hotplug_destroy(void)
-{
- cpuhp_remove_state_nocalls(CPUHP_XFS_DEAD);
-}
-
-#else /* !CONFIG_HOTPLUG_CPU */
-static inline int xfs_cpu_hotplug_init(void) { return 0; }
-static inline void xfs_cpu_hotplug_destroy(void) {}
-#endif
-
STATIC int __init
init_xfs_fs(void)
{
@@ -2329,13 +2310,9 @@ init_xfs_fs(void)
xfs_dir_startup();
- error = xfs_cpu_hotplug_init();
- if (error)
- goto out;
-
error = xfs_init_caches();
if (error)
- goto out_destroy_hp;
+ goto out;
error = xfs_init_workqueues();
if (error)
@@ -2353,10 +2330,12 @@ init_xfs_fs(void)
if (error)
goto out_cleanup_procfs;
+ xfs_debugfs = xfs_debugfs_mkdir("xfs", NULL);
+
xfs_kset = kset_create_and_add("xfs", NULL, fs_kobj);
if (!xfs_kset) {
error = -ENOMEM;
- goto out_sysctl_unregister;
+ goto out_debugfs_unregister;
}
xfsstats.xs_kobj.kobject.kset = xfs_kset;
@@ -2372,11 +2351,15 @@ init_xfs_fs(void)
if (error)
goto out_free_stats;
+ error = xchk_global_stats_setup(xfs_debugfs);
+ if (error)
+ goto out_remove_stats_kobj;
+
#ifdef DEBUG
xfs_dbg_kobj.kobject.kset = xfs_kset;
error = xfs_sysfs_init(&xfs_dbg_kobj, &xfs_dbg_ktype, NULL, "debug");
if (error)
- goto out_remove_stats_kobj;
+ goto out_remove_scrub_stats;
#endif
error = xfs_qm_init();
@@ -2393,14 +2376,17 @@ init_xfs_fs(void)
out_remove_dbg_kobj:
#ifdef DEBUG
xfs_sysfs_del(&xfs_dbg_kobj);
- out_remove_stats_kobj:
+ out_remove_scrub_stats:
#endif
+ xchk_global_stats_teardown();
+ out_remove_stats_kobj:
xfs_sysfs_del(&xfsstats.xs_kobj);
out_free_stats:
free_percpu(xfsstats.xs_stats);
out_kset_unregister:
kset_unregister(xfs_kset);
- out_sysctl_unregister:
+ out_debugfs_unregister:
+ debugfs_remove(xfs_debugfs);
xfs_sysctl_unregister();
out_cleanup_procfs:
xfs_cleanup_procfs();
@@ -2410,8 +2396,6 @@ init_xfs_fs(void)
xfs_destroy_workqueues();
out_destroy_caches:
xfs_destroy_caches();
- out_destroy_hp:
- xfs_cpu_hotplug_destroy();
out:
return error;
}
@@ -2424,16 +2408,17 @@ exit_xfs_fs(void)
#ifdef DEBUG
xfs_sysfs_del(&xfs_dbg_kobj);
#endif
+ xchk_global_stats_teardown();
xfs_sysfs_del(&xfsstats.xs_kobj);
free_percpu(xfsstats.xs_stats);
kset_unregister(xfs_kset);
+ debugfs_remove(xfs_debugfs);
xfs_sysctl_unregister();
xfs_cleanup_procfs();
xfs_mru_cache_uninit();
xfs_destroy_workqueues();
xfs_destroy_caches();
xfs_uuid_table_free();
- xfs_cpu_hotplug_destroy();
}
module_init(init_xfs_fs);
diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h
index 364e2c2648a8..302e6e5d6c7e 100644
--- a/fs/xfs/xfs_super.h
+++ b/fs/xfs/xfs_super.h
@@ -100,4 +100,6 @@ extern struct workqueue_struct *xfs_discard_wq;
#define XFS_M(sb) ((struct xfs_mount *)((sb)->s_fs_info))
+struct dentry *xfs_debugfs_mkdir(const char *name, struct dentry *parent);
+
#endif /* __XFS_SUPER_H__ */
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index f3cc204bb4bf..3926cf7f2a6e 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -22,6 +22,9 @@
* daddr: physical block number in 512b blocks
* bbcount: number of blocks in a physical extent, in 512b blocks
*
+ * rtx: physical rt extent number for extent mappings
+ * rtxcount: number of rt extents in an extent mapping
+ *
* owner: reverse-mapping owner, usually inodes
*
* fileoff: file offset, in fs blocks
@@ -802,36 +805,28 @@ DEFINE_INODE_EVENT(xfs_inode_inactivating);
* ring buffer. Somehow this was only worth mentioning in the ftrace sample
* code.
*/
-TRACE_DEFINE_ENUM(PE_SIZE_PTE);
-TRACE_DEFINE_ENUM(PE_SIZE_PMD);
-TRACE_DEFINE_ENUM(PE_SIZE_PUD);
-
TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_SHARED);
TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_COW);
TRACE_EVENT(xfs_filemap_fault,
- TP_PROTO(struct xfs_inode *ip, enum page_entry_size pe_size,
- bool write_fault),
- TP_ARGS(ip, pe_size, write_fault),
+ TP_PROTO(struct xfs_inode *ip, unsigned int order, bool write_fault),
+ TP_ARGS(ip, order, write_fault),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_ino_t, ino)
- __field(enum page_entry_size, pe_size)
+ __field(unsigned int, order)
__field(bool, write_fault)
),
TP_fast_assign(
__entry->dev = VFS_I(ip)->i_sb->s_dev;
__entry->ino = ip->i_ino;
- __entry->pe_size = pe_size;
+ __entry->order = order;
__entry->write_fault = write_fault;
),
- TP_printk("dev %d:%d ino 0x%llx %s write_fault %d",
+ TP_printk("dev %d:%d ino 0x%llx order %u write_fault %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
- __print_symbolic(__entry->pe_size,
- { PE_SIZE_PTE, "PTE" },
- { PE_SIZE_PMD, "PMD" },
- { PE_SIZE_PUD, "PUD" }),
+ __entry->order,
__entry->write_fault)
)
@@ -3829,6 +3824,51 @@ TRACE_EVENT(xfs_iunlink_update_dinode,
__entry->new_ptr)
);
+TRACE_EVENT(xfs_iunlink_reload_next,
+ TP_PROTO(struct xfs_inode *ip),
+ TP_ARGS(ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agino_t, agino)
+ __field(xfs_agino_t, prev_agino)
+ __field(xfs_agino_t, next_agino)
+ ),
+ TP_fast_assign(
+ __entry->dev = ip->i_mount->m_super->s_dev;
+ __entry->agno = XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino);
+ __entry->agino = XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino);
+ __entry->prev_agino = ip->i_prev_unlinked;
+ __entry->next_agino = ip->i_next_unlinked;
+ ),
+ TP_printk("dev %d:%d agno 0x%x agino 0x%x prev_unlinked 0x%x next_unlinked 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->agino,
+ __entry->prev_agino,
+ __entry->next_agino)
+);
+
+TRACE_EVENT(xfs_inode_reload_unlinked_bucket,
+ TP_PROTO(struct xfs_inode *ip),
+ TP_ARGS(ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agino_t, agino)
+ ),
+ TP_fast_assign(
+ __entry->dev = ip->i_mount->m_super->s_dev;
+ __entry->agno = XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino);
+ __entry->agino = XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino);
+ ),
+ TP_printk("dev %d:%d agno 0x%x agino 0x%x bucket %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->agino,
+ __entry->agino % XFS_AGI_UNLINKED_BUCKETS)
+);
+
DECLARE_EVENT_CLASS(xfs_ag_inode_class,
TP_PROTO(struct xfs_inode *ip),
TP_ARGS(ip),
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index 43e5c219aaed..a3975f325f4e 100644
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c
@@ -46,6 +46,17 @@ xfs_attr_grab_log_assist(
if (xfs_sb_version_haslogxattrs(&mp->m_sb))
return 0;
+ /*
+ * Check if the filesystem featureset is new enough to set this log
+ * incompat feature bit. Strictly speaking, the minimum requirement is
+ * a V5 filesystem for the superblock field, but we'll require rmap
+ * or reflink to avoid having to deal with really old kernels.
+ */
+ if (!xfs_has_reflink(mp) && !xfs_has_rmapbt(mp)) {
+ error = -EOPNOTSUPP;
+ goto drop_incompat;
+ }
+
/* Enable log-assisted xattrs. */
error = xfs_add_incompat_log_feature(mp,
XFS_SB_FEAT_INCOMPAT_LOG_XATTRS);
diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c
index 789cfb74c146..b2c9b35df8f7 100644
--- a/fs/zonefs/file.c
+++ b/fs/zonefs/file.c
@@ -175,7 +175,7 @@ const struct address_space_operations zonefs_file_aops = {
.read_folio = zonefs_read_folio,
.readahead = zonefs_readahead,
.writepages = zonefs_writepages,
- .dirty_folio = filemap_dirty_folio,
+ .dirty_folio = iomap_dirty_folio,
.release_folio = iomap_release_folio,
.invalidate_folio = iomap_invalidate_folio,
.migrate_folio = filemap_migrate_folio,
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index 9350221abfc5..9d1a9808fbbb 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -658,7 +658,8 @@ static struct inode *zonefs_get_file_inode(struct inode *dir,
inode->i_ino = ino;
inode->i_mode = z->z_mode;
- inode->i_ctime = inode->i_mtime = inode->i_atime = dir->i_ctime;
+ inode->i_mtime = inode->i_atime = inode_set_ctime_to_ts(inode,
+ inode_get_ctime(dir));
inode->i_uid = z->z_uid;
inode->i_gid = z->z_gid;
inode->i_size = z->z_wpoffset;
@@ -694,7 +695,8 @@ static struct inode *zonefs_get_zgroup_inode(struct super_block *sb,
inode->i_ino = ino;
inode_init_owner(&nop_mnt_idmap, inode, root, S_IFDIR | 0555);
inode->i_size = sbi->s_zgroup[ztype].g_nr_zones;
- inode->i_ctime = inode->i_mtime = inode->i_atime = root->i_ctime;
+ inode->i_mtime = inode->i_atime = inode_set_ctime_to_ts(inode,
+ inode_get_ctime(root));
inode->i_private = &sbi->s_zgroup[ztype];
set_nlink(inode, 2);
@@ -1317,7 +1319,7 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent)
inode->i_ino = bdev_nr_zones(sb->s_bdev);
inode->i_mode = S_IFDIR | 0555;
- inode->i_ctime = inode->i_mtime = inode->i_atime = current_time(inode);
+ inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode);
inode->i_op = &zonefs_dir_inode_operations;
inode->i_fop = &zonefs_dir_operations;
inode->i_size = 2;